library(componentSkeleton)

execute <- function(cf) {

    # Create the array output directory.
    counts <- get.output(cf, "countArray")
    dir.create(counts)

    # Array object that collects the count matrices.
    counts.array.output <- Array.new()

    # Parameters and inputs.
    minCount     <- get.parameter(cf, "minCount", "int")
    minPercent   <- get.parameter(cf, "minPercent", "float")
    inclusionKey <- get.parameter(cf, "inclusionKey", "string")
    samples      <- CSV.read(get.input(cf, "samples"))
    allsWell     <- TRUE

    # Check whether the optional TPM input is given and
    # raise an error if TPMcolumn is not a column name of TPM.
    if (input.defined(cf, "TPM")) {
        TPM       <- CSV.read(get.input(cf, "TPM"))
        TPMcolumn <- get.parameter(cf, "TPMcolumn", "string")
        if (TPMcolumn == "") {
            TPM <- NULL
        } else {
            if (length(grep(TPMcolumn, colnames(TPM))) != 0) {
                colnames(TPM) <- gsub("\\.", " ", colnames(TPM))
                total <- TPM[, c(1:2, grep(TPMcolumn, colnames(TPM)))]
            } else {
                write.error(paste(TPMcolumn, "is not a column in input TPM:",
                                  paste(colnames(TPM), collapse=", ")))
                return(1)
            }
        }
    } else {
        TPM <- NULL
    }

    # Check that minPercent is in [0, 1].
    if (minPercent > 1 || minPercent < 0) {
        write.error("Invalid parameter minPercent: value must be between 0 and 1.")
        return(1)
    }

    # Load the count file of the first sample of the current type.
    Sample <- read.table(samples[1, 2], sep="\t", header=FALSE)
    colnames(Sample) <- c("ID", samples[1, 1])

    # Merge the count files of the remaining samples of the current type.
    if (nrow(samples) > 1) {
        for (n in 2:nrow(samples)) {
            Add <- read.table(samples[n, 2], sep="\t", header=FALSE)
            colnames(Add) <- c("ID", samples[n, 1])
            Sample <- merge(Sample, Add, by="ID", all=TRUE)
        }
    }

    # Separate the no_feature, ambiguous and other non-biological counter rows
    # from the gene counts.
    if (inclusionKey == "") {
        # Include everything except the five HTSeq special counter rows.
        found <- grep(paste(c("no_feature", "ambiguous", "too_low_aQual",
                              "not_aligned", "alignment_not_unique"),
                            collapse="|"),
                      Sample[, 1])
        No_feature <- Sample[found, ]
        Sample     <- Sample[-found, ]
    } else {
        # Keep only rows whose ID matches inclusionKey; everything else
        # goes into the no_feature output.
        matched    <- grepl(inclusionKey, Sample[, "ID"])
        No_feature <- Sample[!matched, ]
        Sample     <- Sample[matched, ]
    }

    # Filter matrices by minCount and minPercent and remove rows with rowSums = 0.
    # Filtering is slightly different when the expression matrix contains only one sample.
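    # A row is kept when at least a fraction minPercent of the samples reach
    # minCount reads (with a single sample, when its count exceeds minCount);
    # no_feature rows are kept when their row sum is greater than zero.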
    if (!is.null(TPM)) {
        Remove <- c()
        SampleFiltered <- Sample
        if (ncol(No_feature) > 2) {
            # Multiple samples: collect rows that fail the minCount/minPercent rule.
            for (r in 1:nrow(SampleFiltered)) {
                if (length(which(SampleFiltered[r, 2:ncol(SampleFiltered)] >= minCount)) /
                    (ncol(SampleFiltered) - 1) < minPercent) {
                    Remove <- c(Remove, r)
                }
            }
            if (!is.null(Remove)) {
                SampleFiltered <- SampleFiltered[-Remove, ]
            }
            No_feature <- No_feature[which(rowSums(No_feature[, 2:ncol(No_feature)],
                                                   na.rm=TRUE) > 0), ]
        } else {
            # Single sample: simple thresholding.
            No_feature     <- No_feature[which(No_feature[, 2] > 0), ]
            SampleFiltered <- SampleFiltered[which(SampleFiltered[, 2:ncol(SampleFiltered)] > minCount), ]
        }
    } else {
        SampleFiltered <- NULL
    }

    # Normalize each sample by total mapped reads per million (TPM).
    if (!is.null(TPM)) {
        SampleTPM <- Sample
        for (sample in 1:nrow(total)) {
            sampleName <- total[sample, 1]
            scale      <- total[sample, TPMcolumn] / 1000000
            SampleTPM[, sampleName] <- SampleTPM[, sampleName] / scale
        }
    }
    if (!is.null(SampleFiltered) && !is.null(TPM)) {
        SampleFTPM <- SampleFiltered
        for (sample in 1:nrow(total)) {
            sampleName <- total[sample, 1]
            scale      <- total[sample, TPMcolumn] / 1000000
            SampleFTPM[, sampleName] <- SampleFTPM[, sampleName] / scale
        }
    }

    # Save the matrices to the output array (using FCSReader as a template).
    CSV.write(paste(counts, "/", "all.csv", sep=""), Sample)
    counts.array.output <- Array.add(counts.array.output, "all", "all.csv")
    if (!is.null(SampleFiltered)) {
        CSV.write(paste(counts, "/", "filtered.csv", sep=""), SampleFiltered)
        counts.array.output <- Array.add(counts.array.output, "filtered", "filtered.csv")
    }
    if (!is.null(TPM)) {
        CSV.write(paste(counts, "/", "allTPM.csv", sep=""), SampleTPM)
        counts.array.output <- Array.add(counts.array.output, "allTPM", "allTPM.csv")
    }
    if (!is.null(SampleFiltered) && !is.null(TPM)) {
        CSV.write(paste(counts, "/", "filteredTPM.csv", sep=""), SampleFTPM)
        counts.array.output <- Array.add(counts.array.output, "filteredTPM", "filteredTPM.csv")
    }

    # Write outputs.
    CSV.write(get.output(cf, "no_feature"), No_feature)
    Array.write(cf, counts.array.output, "countArray")

    return(0)
}

main(execute)