/** Retrieve known ncRNA annotations and identify neighbouring or overlapping
  * ncRNAs to putative novel miRNAs.
  *
  * Pipeline, as wired below:
  *  1. `PDannotations.r` is run on `expression`, either together with `gtf`
  *     (when `knownFeatures` is empty) or with the `knownFeatures` path.
  *  2. `PointDistance` compares the script's `table` output against its
  *     `"novel"` array output on the columns `chr,start,end`.
  *  3. `CSVFilter` applies the bound `Distance=1000` to the distance table.
  *  4. `geneoverlap.r` combines the expression matrix, the filtered
  *     neighbours, the known-feature table and `gtf` into the final table.
  *
  * NOTE(review): the local `val` names are presumably used by Anduril as
  * component instance names (and hence execution directory names), so they
  * are deliberately left unchanged -- confirm before renaming any of them.
  *
  * @param expression      expression matrix; passed as `table1` to both R scripts
  * @param gtf             annotation file; cast to `CSV` only so that it can be
  *                        wired into the R scripts' table ports
  * @param knownFeatures   path to a file handed to `PDannotations.r` as `param1`;
  *                        when empty (the default) `gtf` is used instead, which
  *                        the original author notes takes more time
  * @param ensembl_host    passed to `geneoverlap.r` as `param1`
  * @param ensembl_dataset passed to `geneoverlap.r` as `param2`
  * @return the `table` output of the `geneoverlap.r` evaluation
  * @throws RuntimeException (via `sys.error`) if `knownFeatures` is non-empty
  *                          and no file exists at that path
  */
def NCRNAAnnot(
    expression: CSV,
    gtf: BinaryFile,
    knownFeatures: String = "",
    ensembl_host: String = "feb2014.archive.ensembl.org",
    ensembl_dataset: String = "hsapiens_gene_ensembl"
): (CSV) = {

  val for_PD3 =
    if (knownFeatures.isEmpty) {
      // Create reference GTF file. Takes more time!
      REvaluate(
        script = INPUT(path = "PDannotations.r"),
        table1 = expression,
        table2 = gtf.asInstanceOf[CSV]
      )
    } else if (new java.io.File(knownFeatures).exists()) {
      // Extract known ncRNA positions in genome
      REvaluate(
        script = INPUT(path = "PDannotations.r"),
        table1 = expression,
        param1 = knownFeatures
      )
    } else {
      // Fail while the workflow is being built rather than inside the R script.
      sys.error(s"Input knownFeatures does not exist: $knownFeatures")
    }

  // Calculate coordinate distance and find nearest neighbour of novel miRNAs
  // NOTE(review): the meaning of nth = 2 is defined by PointDistance, which is
  // not visible here -- confirm against the component documentation.
  val novelExpr_PD3 = PointDistance(
    in = for_PD3.table,
    reference = for_PD3.outArray("novel"),
    inId = "knownID",
    referenceId = "novelID",
    cols = "chr,start,end",
    nth = 2
  )

  // Extract just the best overlapping/neighbouring known ncRNA of novel miRNAs
  // (presumably keeps rows with Distance up to 1000 -- verify CSVFilter's
  // highBound semantics).
  val best_PD3 = CSVFilter(
    in = novelExpr_PD3.out,
    highBound = "Distance=1000"
  )

  // Add annotations with expression matrix for putative miRNAs
  // NOTE: Difference between "host" genes and "nearestNCRNA"
  //       is that "host" completely overlaps the putative miRNA location,
  //       but "nearestNCRNA" is nearby but NOT necessarily overlapping.
  val geneOverlap = REvaluate(
    script = INPUT(path = "geneoverlap.r"),
    param1 = ensembl_host,
    param2 = ensembl_dataset,
    //param2 = "/mnt/csc-gc5/icay/_ref/Ensembl37v72/EnsemblGene37v72.txt",
    table1 = expression,
    table2 = best_PD3.out,
    table3 = for_PD3.table,
    table4 = gtf.asInstanceOf[CSV]
  )

  geneOverlap.table
}