2. Bulk RNA-Seq Analysis Example.
Here is an example pipeline to use the Star2Pass, which runs STAR aligner in two pass mode, for aligning RNA-seq data.
There are two main steps:
- Generating a STAR genome directory containing a transcriptome index.
- Performing STAR alignment in two pass mode.
Inputs for the pipeline include:
- A array of quality controlled fastq files (this pipeline can take the output of QCFasta directly). For paired end reads, one array for the reads and one array for the mates. An array can be created from a CSV file with a column named Key (which has the ids of your samples) and a column named File (with the paths to the fastq file) using makeArray.
- Reference genome in fasta format.
- A genome annotation file in GTF/GFF3 format.
- A parameter file. This file overrides default STAR parameters, but will itself be overridden by the command line. Use parametersDefault from STAR source as template.
Outputs of the pipeline are:
- Aligned BAM files ready for downstream analyses.
Dependencies:
STAR
#!/usr/bin/env anduril
//$OPT --threads 100
//$OPT --wrapper slurm-prefix
import anduril.builtin._
import anduril.tools._
import org.anduril.runtime._
import anduril.microarray._
import anduril.sequencing._
object twoPassAlignment {
val reads = INPUT(path="files/reads")
val mates = INPUT(path="files/mates")
val annotation = INPUT(path="files/annotation.gtf")
val reference = INPUT(path="files/genome.fa")
val STARparameters=INPUT(path="files/parametersDefault")
val genome = STARGenome(
genomeFasta = reference,
annotation = annotation,
genomeParameters= "--sjdbOverhang 64",
threads = 5
)
val custom = QuickBash(
in = reads,
script=
"""
tsvecho Key Custom > $out
for i in $( tail -n +2 $in/_index | cut -f1 )
do
tsvecho $i "--outSAMattrRGline ID:$key1 PU:unknown LB:Illumina SM:$key1 PL:Illumina" >> $out
done
"""
)
val twoPass = Star2Pass(
reference = reference,
reads = reads,
mates = mates,
genome = genome.genome,
annotation = annotation,
parameterFile = STARparameters,
custom = custom.out,
mainAlignmentType = "toTranscriptome",
genomeParameters = "--sjdbOverhang 64",
genomeLoad = "LoadAndRemove",
readFilesCommand = "zcat",
useEncodeParams = true,
alignParameters = "--outSAMtype BAM Unsorted --outReadsUnmapped Fastx --quantMode TranscriptomeSAM GeneCounts --quantTranscriptomeBan Singleend --limitBAMsortRAM 10000000000",
memory = 39000,
threads = 15,
execute = "once"
)
val getGenomeAligned = QuickBash(
in=twoPass.folder,
script=
"""
awk -F"\t" '{if (NR>1) {print $0"/Aligned.out.bam"} else {print $0}}' OFS="\t" $in/_index > $out
"""
).out
val genomeAlignments = CSV2Array(in=getGenomeAligned,keys="column").out
OUTPUT(twoPass.alignments)
OUTPUT(genomeAlignments)
}