pipeline/other/singlecancer_singlesample_u...

615 lines
18 KiB
Plaintext

workflow singlecancer_singlesample_umi{
    # Single-sample, UMI-based tumour workflow: read trimming, alignment,
    # duplex consensus calling, SNV/indel calling and annotation, optional
    # fusion and CNV analysis, report generation and hotspot calling.
    # Tasks exchange plain path strings, so execution order is defined only by
    # the output -> input references between calls.

    # Panel name forwarded to the fusion, cnvkit and dealwithsnvindel tasks.
    String project = "肺癌17基因"
    String bed = "/dataseq/jmdna/database/bed/lung17gene.hg19.liftover.bed"
    String tumor
    String inputDir
    String outputDir
    # "yes" enables the corresponding optional branch / directory.
    String need_fusion_test = "yes"
    String need_cnv_test = "yes"
    String need_chemotherapy = "no"
    String ref = "/dataseq/jmdna/database/genome/hg19/hg19.fa"
    String codes_dir = "/dataseq/jmdna/codes/singlecancer_singlesample"
    String accessBed = "/dataseq/jmdna/software/cnvkit-0.9.7/data/access-5k-mappable.hg19.bed"
    String annotateGene = "/dataseq/jmdna/software/cnvkit-0.9.7/data/refFlat.txt"
    # Not referenced by any call below; kept so existing inputs files stay valid.
    String gc_wiggle = "/dataseq/jmdna/codes/pancancer_controlsample/hg19.gc200Base.txt.gz"

    # Create the output directory tree.
    call create_dir{
        input:
            outputDir=outputDir,
            need_fusion_test=need_fusion_test,
            need_cnv_test=need_cnv_test,
            need_chemotherapy=need_chemotherapy
    }

    call qc{
        input:
            tumor=tumor,
            inputDir=inputDir,
            # create_dir.newdir is "<outputDir>/report/qc". Stripping that suffix
            # yields outputDir again while making qc (and therefore every later
            # call, all of which chain from qc) wait for create_dir to finish.
            outputDir=sub(create_dir.newdir, "/report/qc$", "")
    }

    call alignment_bwa{
        input:
            tumor=tumor,
            outputDir=outputDir,
            ref=ref,
            read1=qc.outputFile[0],
            read2=qc.outputFile[1]
    }

    call consensusreads{
        input:
            tumor=tumor,
            outputDir=outputDir,
            ref=ref,
            bam=alignment_bwa.bam
    }

    # consensusreads.outputFile[1] is the re-mapped consensus BAM; it is the
    # input of every downstream analysis call.
    call generater_mpileup{
        input:
            tumor=tumor,
            ref=ref,
            bed=bed,
            outputDir=outputDir,
            rmdupBam=consensusreads.outputFile[1]
    }

    # NOTE(review): qc_2 declares no outputs, so nothing downstream can wait
    # for it, yet the fusion task reads qc/consensus_bamdst/depth.tsv.gz from
    # the directory qc_2 fills in. Confirm the ordering is acceptable.
    call qc_2{
        input:
            ref=ref,
            bed=bed,
            tumor=tumor,
            outputDir=outputDir,
            codes_dir=codes_dir,
            rmdupBam=consensusreads.outputFile[1]
    }

    call mutation_calling{
        input:
            codes_dir=codes_dir,
            tumor=tumor,
            outputDir=outputDir,
            pileup=generater_mpileup.pileup
    }

    call annovar{
        input:
            tumor=tumor,
            outputDir=outputDir,
            ref=ref,
            vcf=mutation_calling.vcf,
            rmdupBam=consensusreads.outputFile[1]
    }

    if (need_fusion_test=="yes"){
        call fusion{
            input:
                ref=ref,
                codes_dir=codes_dir,
                tumor=tumor,
                outputDir=outputDir,
                rmdupBam=consensusreads.outputFile[1],
                project=project
        }
    }

    # call chemoTherapy{
    # input:
    # codes_dir=codes_dir,
    # normal=normal,
    # outputDir=outputDir,
    # ref=ref,
    # project=project,
    # rmdupBam=consensusreads.outputFile[1]
    # }

    if (need_cnv_test=="yes"){
        call cnvkit{
            input:
                tumor=tumor,
                ref=ref,
                bed=bed,
                outputDir=outputDir,
                rmdupBam=consensusreads.outputFile[1],
                accessBed=accessBed,
                codes_dir=codes_dir,
                annotateGene=annotateGene,
                project=project
        }
    }

    call dealwithsnvindel{
        input:
            codes_dir=codes_dir,
            project=project,
            outputDir=outputDir,
            tumor=tumor,
            anno=annovar.anno
    }

    call auto_report{
        input:
            codes_dir=codes_dir,
            outputDir=outputDir,
            tumor=tumor,
            snv_result=dealwithsnvindel.snv,
            # cnvkit and fusion run inside `if` blocks, so their outputs are
            # optional. auto_report takes non-optional Strings; fall back to
            # the path each task would have produced so the report step still
            # runs (and still waits for the task when it is enabled).
            cnv_result=select_first([cnvkit.cnv, "${outputDir}/cnvkit/${tumor}.cnv.pos.txt"]),
            fusion_result=select_first([fusion.fusion, "${outputDir}/fusion/${tumor}.fusion.pos.txt"])
    }

    call hotspot{
        input:
            tumor=tumor,
            outputDir=outputDir,
            ref=ref,
            rmdupBam=consensusreads.outputFile[1],
            codes_dir=codes_dir
    }
}
#create project directory
task create_dir{
    # Creates the per-sample output directory tree.
    #   outputDir          root directory for all results
    #   need_fusion_test   "yes" -> also create <outputDir>/fusion
    #   need_cnv_test      "yes" -> also create <outputDir>/cnvkit
    #   need_chemotherapy  "yes" -> also create <outputDir>/chemo
    String outputDir
    String need_fusion_test
    String need_cnv_test
    String need_chemotherapy
    command <<<
        # mkdir -p creates missing parents and is a no-op for directories that
        # already exist, so re-runs and nested output paths both work.
        mkdir -p "${outputDir}"
        # Directories every run needs: qc, alignment, mutation, report
        mkdir -p "${outputDir}/qc"
        mkdir -p "${outputDir}/alignment"
        mkdir -p "${outputDir}/mutation"
        mkdir -p "${outputDir}/report"
        # Optional cnv directory
        if [ "${need_cnv_test}" = "yes" ];then
            mkdir -p "${outputDir}/cnvkit"
        fi
        # Optional chemo directory
        if [ "${need_chemotherapy}" = "yes" ];then
            mkdir -p "${outputDir}/chemo"
        fi
        # Optional fusion directory
        if [ "${need_fusion_test}" = "yes" ];then
            mkdir -p "${outputDir}/fusion"
        fi
    >>>
    output{
        # NOTE(review): this path is only a string; the command above does not
        # create <outputDir>/report/qc.
        String newdir = "${outputDir}/report/qc"
    }
}
#generator raw fastq to clean fastq
task qc{
    # Runs fastp on the raw paired FASTQ files of one sample.
    #   tumor      sample identifier used in the input glob and output names
    #   inputDir   directory holding *_<tumor>_*1.fq.gz / *_<tumor>_*2.fq.gz
    #   outputDir  run output root; results go to <outputDir>/qc
    String tumor
    String inputDir
    String outputDir
    command <<<
        # Do not rely on another task having created the target directory.
        mkdir -p "${outputDir}/qc"
        echo processing raw reads with fastp
        # The globs stay unquoted so the shell expands them; the variable parts
        # are quoted so paths containing spaces survive.
        # NOTE(review): each glob is assumed to match exactly one file.
        fastp -i "${inputDir}"/*_"${tumor}"_*1.fq.gz -o "${outputDir}/qc/${tumor}_clean_R1.fq.gz" \
            -I "${inputDir}"/*_"${tumor}"_*2.fq.gz -O "${outputDir}/qc/${tumor}_clean_R2.fq.gz" \
            -w 10 \
            --disable_trim_poly_g \
            --disable_quality_filtering \
            --adapter_sequence AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
            --adapter_sequence_r2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \
            -j "${outputDir}/qc/${tumor}.json" \
            -h "${outputDir}/qc/${tumor}.html" --report_title "${tumor}"
    >>>
    output{
        # [0] cleaned R1, [1] cleaned R2, [2] fastp JSON report, [3] fastp HTML report
        Array[String] outputFile = [
            "${outputDir}/qc/${tumor}_clean_R1.fq.gz",
            "${outputDir}/qc/${tumor}_clean_R2.fq.gz",
            "${outputDir}/qc/${tumor}.json",
            "${outputDir}/qc/${tumor}.html"
        ]
    }
}
#alignment clean fastq to reference
task alignment_bwa{
    # Converts the cleaned FASTQ pair into an unmapped BAM, extracts the UMIs
    # into tags with fgbio, aligns with bwa mem and merges the alignments back
    # onto the UMI-tagged unmapped BAM.
    #   tumor      sample identifier (also written as the BAM sample name)
    #   ref        reference FASTA passed to bwa and Picard
    #   outputDir  run output root; files go to <outputDir>/alignment
    #   read1/2    cleaned FASTQ files
    String tumor
    String ref
    String outputDir
    String read1
    String read2
    command<<<
        # Stop at the first failing step, including any stage of a pipe, so a
        # broken alignment is not followed by deletion of the intermediates and
        # reported as success through the exit status of the final rm.
        set -eo pipefail
        mkdir -p ${outputDir}/alignment
        #fastqtosam
        # Picard argument names are upper case: SAMPLE_NAME, not SAMPLE_name.
        java -Xmx8G -jar $PICARD FastqToSam \
            FASTQ=${read1} \
            FASTQ2=${read2} \
            OUTPUT=${outputDir}/alignment/unmapped.bam \
            SAMPLE_NAME=${tumor}
        #ExtractUmisFromBam
        java -jar /dataseq/jmdna/software/fgbio/target/scala-2.13/fgbio-1.4.0-468a843-SNAPSHOT.jar ExtractUmisFromBam \
            --input=${outputDir}/alignment/unmapped.bam \
            --output=${outputDir}/alignment/unmapped.withUMI.bam \
            --read-structure=1S3M3S+T 1S3M3S+T \
            --single-tag=RX \
            --molecular-index-tags=ZA ZB
        #align reads
        java -Xmx4G -jar $PICARD SamToFastq \
            I=${outputDir}/alignment/unmapped.withUMI.bam \
            F=/dev/stdout \
            INTERLEAVE=true \
        | bwa mem -p -t 10 ${ref} /dev/stdin \
        | java -Xmx4G -jar $PICARD MergeBamAlignment \
            UNMAPPED=${outputDir}/alignment/unmapped.withUMI.bam \
            ALIGNED=/dev/stdin \
            O=${outputDir}/alignment/${tumor}.mapped.bam \
            R=${ref} \
            SO=coordinate \
            ALIGNER_PROPER_PAIR_FLAGS=True \
            MAX_GAPS=-1 \
            ORIENTATIONS=FR \
            VALIDATION_STRINGENCY=SILENT \
            CREATE_INDEX=True
        # Intermediates are removed only after every step above succeeded.
        rm ${outputDir}/alignment/unmapped.bam ${outputDir}/alignment/unmapped.withUMI.bam
    >>>
    output{
        # Coordinate-sorted, UMI-tagged alignment.
        String bam = "${outputDir}/alignment/${tumor}.mapped.bam"
    }
}
#group by umi and call consensusreads
task consensusreads{
    # Groups aligned reads by UMI, calls duplex consensus reads with fgbio and
    # re-aligns the consensus reads with bwa mem.
    #   tumor      sample identifier used in file names
    #   ref        reference FASTA
    #   outputDir  run output root; files go to <outputDir>/alignment
    #   bam        UMI-tagged aligned BAM to group
    String tumor
    String ref
    String outputDir
    String bam
    command<<<
        # Stop at the first failing step, including any stage of the remap
        # pipe; otherwise the final rm would run and its exit status would
        # hide the failure from the workflow engine.
        set -eo pipefail
        #GroupReadsByUmi
        java -Xmx4g -jar /dataseq/jmdna/software/fgbio/target/scala-2.13/fgbio-1.4.0-468a843-SNAPSHOT.jar GroupReadsByUmi \
            --input=${bam} \
            --output=${outputDir}/alignment/${tumor}.grouped.bam \
            --strategy=paired \
            --edits=1 \
            --min-map-q=20 \
            --allow-inter-contig=true
        #generate consensus reads
        java -Xmx4g -jar /dataseq/jmdna/software/fgbio/target/scala-2.13/fgbio-1.4.0-468a843-SNAPSHOT.jar CallDuplexConsensusReads \
            --input=${outputDir}/alignment/${tumor}.grouped.bam \
            --output=${outputDir}/alignment/${tumor}.consensus.unmapped.bam \
            --error-rate-pre-umi=45 \
            --error-rate-post-umi=30 \
            --min-input-base-quality=20 \
            --min-reads=1 0 0 \
            --threads 20
        #remap consensusreads
        java -Xmx4G -jar $PICARD SamToFastq \
            I=${outputDir}/alignment/${tumor}.consensus.unmapped.bam \
            F=/dev/stdout \
            INTERLEAVE=true \
        | bwa mem -p -t 10 ${ref} /dev/stdin \
        | java -Xmx4G -jar $PICARD MergeBamAlignment \
            UNMAPPED=${outputDir}/alignment/${tumor}.consensus.unmapped.bam \
            ALIGNED=/dev/stdin \
            O=${outputDir}/alignment/${tumor}.consensus.mapped.bam \
            R=${ref} \
            SO=coordinate \
            ALIGNER_PROPER_PAIR_FLAGS=True \
            MAX_GAPS=-1 \
            ORIENTATIONS=FR \
            VALIDATION_STRINGENCY=SILENT \
            CREATE_INDEX=True
        # The unmapped consensus BAM is removed only after a successful remap.
        rm ${outputDir}/alignment/${tumor}.consensus.unmapped.bam
    >>>
    output{
        # [0] UMI-grouped BAM, [1] re-mapped consensus BAM
        Array[String] outputFile = [
            "${outputDir}/alignment/${tumor}.grouped.bam",
            "${outputDir}/alignment/${tumor}.consensus.mapped.bam"
        ]
    }
}
# generater mpileup file
task generater_mpileup{
    # Writes a samtools mpileup of the consensus BAM restricted to the panel
    # BED regions.
    #   tumor      sample identifier used in the output name
    #   ref        reference FASTA
    #   bed        regions passed to mpileup via -l
    #   outputDir  run output root
    #   rmdupBam   BAM to pile up
    String tumor
    String ref
    String bed
    String outputDir
    String rmdupBam

    command<<<
        samtools mpileup -Bq 20 -Q 20 \
            -f ${ref} \
            -l ${bed} \
            ${rmdupBam} \
            -o ${outputDir}/alignment/${tumor}.pileup
    >>>

    output{
        String pileup = "${outputDir}/alignment/${tumor}.pileup"
    }
}
task mutation_calling{
    # Calls SNVs and indels from the pileup with VarScan mpileup2cns and
    # writes the result as VCF.
    #   codes_dir  not referenced by the command
    #   tumor      sample identifier used in the output name
    #   pileup     mpileup file to call from
    #   outputDir  run output root; the VCF goes to <outputDir>/mutation
    String codes_dir
    String tumor
    String pileup
    String outputDir

    command<<<
        java -jar $VARSCAN mpileup2cns ${pileup} \
            --min-var-freq 0.002 \
            --min-avg-qual 20 \
            --output-vcf 1 \
            --variants \
            --p-value 0.99 \
            --min-reads2 3 \
            --strand-filter 1 \
            >${outputDir}/mutation/${tumor}.snp.indel.vcf
    >>>

    output{
        String vcf = "${outputDir}/mutation/${tumor}.snp.indel.vcf"
    }
}
# hotspot
task hotspot{
    # Hotspot-restricted variant calling: pileup over <codes_dir>/hotspot.bed,
    # two VarScan passes with different thresholds, a project Perl script, and
    # ANNOVAR annotation when that script leaves a combined VCF behind.
    #   tumor      sample identifier used in file names
    #   outputDir  run output root; files go to <outputDir>/mutation/hotspot
    #   ref        reference FASTA
    #   rmdupBam   BAM to pile up
    #   codes_dir  directory holding hotspot.bed and hotspot.hvl.pl
    String tumor
    String outputDir
    String ref
    String rmdupBam
    String codes_dir

    command<<<
        mkdir -p ${outputDir}/mutation/hotspot

        samtools mpileup -Bq 20 -Q 20 -f ${ref} -l ${codes_dir}/hotspot.bed ${rmdupBam} \
            -o ${outputDir}/mutation/hotspot/${tumor}.hotspot.pileup

        # "L" pass: lower thresholds (freq 0.001, 2 supporting reads, no strand filter)
        java -jar $VARSCAN mpileup2cns ${outputDir}/mutation/hotspot/${tumor}.hotspot.pileup \
            --min-var-freq 0.001 --min-avg-qual 20 --output-vcf 1 --variants --p-value 0.99 \
            --min-reads2 2 --strand-filter 0 \
            >${outputDir}/mutation/hotspot/${tumor}.hotspot.L.snp.indel.vcf

        # "H" pass: higher thresholds (freq 0.002, 3 supporting reads, strand filter on)
        java -jar $VARSCAN mpileup2cns ${outputDir}/mutation/hotspot/${tumor}.hotspot.pileup \
            --min-var-freq 0.002 --min-avg-qual 20 --output-vcf 1 --variants --p-value 0.99 \
            --min-reads2 3 --strand-filter 1 \
            >${outputDir}/mutation/hotspot/${tumor}.hotspot.H.snp.indel.vcf

        # NOTE(review): presumably this script produces <tumor>.hotspot.snp.indel.vcf
        # from the two passes above - confirm against hotspot.hvl.pl.
        perl ${codes_dir}/hotspot.hvl.pl ${outputDir} ${tumor}

        # Annotate and publish only when the combined hotspot VCF exists.
        if [ -e "${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.vcf" ]; then
            table_annovar.pl \
                ${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.vcf \
                /dataseq/jmdna/software/annovar/humandb/ \
                -buildver hg19 -nastring . -vcfinput -remove -otherinfo \
                -protocol refGene,avsnp150,cosmic91,clinvar_20220320,1000g2015aug_all,1000g2015aug_eas,esp6500siv2_all,exac03nontcga,gnomad_genome,dbnsfp35c \
                -operation g,f,f,f,f,f,f,f,f,f \
                --outfile ${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.anno
            # Disabled tandem-repeat annotation step, kept for reference:
            # java -jar /dataseq/jmdna/software/GenomeAnalysisTK.3.7.jar -T VariantAnnotator \
            # -R ${ref} \
            # -I ${outputDir}/alignment/${tumor}.rmdup.bam \
            # -V ${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.vcf \
            # -o ${outputDir}/mutation/hotspot/${tumor}.hotspot.TandemRepeatAnnotator.vcf \
            # --annotation TandemRepeatAnnotator
            # grep -v "^##" ${outputDir}/mutation/hotspot/${tumor}.hotspot.TandemRepeatAnnotator.vcf | cut -f8| paste ${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.anno.hg19_multianno.txt - >${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.Somatic.annoall.hg19_multianno.txt
            cp ${outputDir}/mutation/hotspot/${tumor}.hotspot.snp.indel.anno.hg19_multianno.txt ${outputDir}/report/${tumor}.hotspot.snp.indel.anno.hg19_multianno.txt
        fi
    >>>

    output{
        # Path of the "H" pass VCF.
        String hotspot = "${outputDir}/mutation/hotspot/${tumor}.hotspot.H.snp.indel.vcf"
    }
}
task annovar{
    # Annotates the SNV/indel VCF with ANNOVAR (table_annovar.pl, hg19).
    #   tumor      sample identifier used in the output prefix
    #   outputDir  run output root; results go to <outputDir>/mutation
    #   ref        only referenced by the disabled GATK step below
    #   vcf        VCF to annotate
    #   rmdupBam   only referenced by the disabled GATK step below
    String tumor
    String outputDir
    String ref
    String vcf
    String rmdupBam
    command<<<
        # Annotate the VCF handed in by the caller instead of rebuilding its
        # path from outputDir and tumor, so the declared input is what is read.
        table_annovar.pl \
            ${vcf} \
            /dataseq/jmdna/software/annovar/humandb/ \
            -buildver hg19 -nastring . -vcfinput -remove -otherinfo \
            -protocol refGene,avsnp150,cosmic91,clinvar_20220320,1000g2015aug_all,1000g2015aug_eas,esp6500siv2_all,exac03nontcga,gnomad_genome,dbnsfp35c \
            -argument '-splicing_threshold 2 -hgvs',,,,,,,,, \
            --intronhgvs 50 \
            -operation g,f,f,f,f,f,f,f,f,f \
            --outfile ${outputDir}/mutation/${tumor}.snp.indel.annoall
        # Disabled tandem-repeat annotation step, kept for reference:
        # java -jar /dataseq/jmdna/software/GenomeAnalysisTK.3.7.jar -T VariantAnnotator \
        # -R ${ref} \
        # -I ${rmdupBam} \
        # -V ${vcf} \
        # -o ${outputDir}/mutation/${tumor}.TandemRepeatAnnotator.vcf \
        # --annotation TandemRepeatAnnotator
        # grep -v "^##" ${outputDir}/mutation/${tumor}.TandemRepeatAnnotator.vcf | cut -f8| paste ${outputDir}/mutation/${tumor}.snp.indel.hg19_multianno.txt - >${outputDir}/mutation/${tumor}.snp.indel.annoall.hg19_multianno.txt
    >>>
    output{
        # Tab-delimited multi-annotation table written by table_annovar.pl.
        String anno = "${outputDir}/mutation/${tumor}.snp.indel.annoall.hg19_multianno.txt"
    }
}
task qc_2{
    # Post-alignment QC: bamdst coverage for the raw and consensus BAMs, fgbio
    # duplex metrics, samtools flagstat/stats and two project R scripts.
    #   ref        reference FASTA for samtools stats
    #   bed        panel regions for bamdst and samtools stats
    #   tumor      sample identifier used in file names
    #   outputDir  run output root
    #   codes_dir  directory holding qc_umi.r and InsertAndDepthStat.R
    #   rmdupBam   not used by the command; present to order this task after
    #              the task that produced it
    String ref
    String bed
    String tumor
    String outputDir
    String codes_dir
    #for task chain
    String rmdupBam
    command <<<
        # -p keeps re-runs from erroring on existing directories and creates
        # <outputDir>/qc itself if it is missing.
        mkdir -p ${outputDir}/qc/group_bamdst ${outputDir}/qc/consensus_bamdst
        bamdst -p ${bed} -o ${outputDir}/qc/group_bamdst ${outputDir}/alignment/${tumor}.mapped.bam
        bamdst -p ${bed} -o ${outputDir}/qc/consensus_bamdst ${outputDir}/alignment/${tumor}.consensus.mapped.bam
        java -Xmx4g -jar /dataseq/jmdna/software/fgbio/target/scala-2.13/fgbio-1.4.0-468a843-SNAPSHOT.jar CollectDuplexSeqMetrics \
            -i ${outputDir}/alignment/${tumor}.grouped.bam \
            -o ${outputDir}/qc/${tumor}_umi_qc \
            -d ${tumor} \
            -u true
        samtools flagstat -@ 10 ${outputDir}/alignment/${tumor}.mapped.bam >${outputDir}/qc/${tumor}.flagstat
        # Note: the stats file is written under alignment/, not qc/.
        samtools stats --reference ${ref} -t ${bed} -@ 10 ${outputDir}/alignment/${tumor}.consensus.mapped.bam > ${outputDir}/alignment/${tumor}.stat
        Rscript ${codes_dir}/qc_umi.r ${outputDir} ${tumor}
        Rscript ${codes_dir}/InsertAndDepthStat.R ${outputDir}/qc/${tumor}_InsertAndDepthStat ${outputDir}/qc/group_bamdst/insertsize.plot ${outputDir}/qc/group_bamdst/depth_distribution.plot
        # mv ${outputDir}/qc/${tumor}_qcstat.txt ${outputDir}/report/qc/${tumor}_qcstat.txt
    >>>
}
task fusion{
    # Structural-variant / fusion detection: lumpyexpress on the consensus BAM,
    # project filter script, svtyper genotyping, ANNOVAR refGene annotation and
    # two project post-processing scripts.
    #   ref        reference FASTA for svtyper
    #   codes_dir  directory holding the project Perl scripts
    #   tumor      sample identifier used in file names
    #   outputDir  run output root; files go to <outputDir>/fusion
    #   rmdupBam   BAM to analyse
    #   project    panel name forwarded to fusion_targetTherapy.pl
    String ref
    String codes_dir
    String tumor
    String outputDir
    #for task chain
    String rmdupBam
    String project

    command<<<
        # Discordant paired-end alignments
        samtools view -b -F 1294 ${rmdupBam} \
            > ${outputDir}/fusion/${tumor}.discordants.bam

        # Split-read alignments
        samtools view -h ${rmdupBam} \
            | /dataseq/jmdna/software/lumpy-sv/scripts/extractSplitReads_BwaMem -i stdin \
            | samtools view -Sb - \
            > ${outputDir}/fusion/${tumor}.splitters.bam

        # Call, filter, genotype
        lumpyexpress \
            -B ${rmdupBam} \
            -S ${outputDir}/fusion/${tumor}.splitters.bam \
            -D ${outputDir}/fusion/${tumor}.discordants.bam \
            -o ${outputDir}/fusion/${tumor}.fusion.vcf
        perl ${codes_dir}/fusion.filter.pl ${outputDir}/fusion/${tumor}.fusion.vcf ${outputDir}/fusion/${tumor}.fusion.filter.vcf
        svtyper \
            -B ${rmdupBam} \
            -i ${outputDir}/fusion/${tumor}.fusion.filter.vcf \
            -T ${ref} \
            -o ${outputDir}/fusion/${tumor}.fusion.gt.vcf

        # Gene-level annotation
        table_annovar.pl \
            ${outputDir}/fusion/${tumor}.fusion.gt.vcf \
            /dataseq/jmdna/software/annovar/humandb/ \
            -buildver hg19 -nastring . -vcfinput -remove -otherinfo \
            -protocol refGene \
            -operation g \
            --outfile ${outputDir}/fusion/${tumor}.fusion

        # NOTE(review): depth.tsv.gz lives in qc/consensus_bamdst, which this
        # task does not create and which none of its inputs is tied to -
        # confirm the producing task always finishes first.
        perl ${codes_dir}/fusion.reanno.pl ${outputDir}/qc/consensus_bamdst/depth.tsv.gz ${outputDir} ${tumor}
        perl ${codes_dir}/fusion_targetTherapy.pl ${codes_dir} ${tumor} ${outputDir} ${project}
    >>>

    output{
        String fusion = "${outputDir}/fusion/${tumor}.fusion.pos.txt"
    }
}
task chemoTherapy{
    # Runs the project chemotherapy script for a normal sample.
    #   codes_dir  directory containing chemo/singlecancer_chemo_2.pl
    #   normal     sample identifier forwarded to the script
    #   outputDir  run output root
    #   ref        reference FASTA forwarded to the script
    #   project    panel name forwarded to the script
    #   rmdupBam   not used by the command; present only to order this task
    String codes_dir
    String normal
    String outputDir
    String ref
    String project
    #for task chain
    Array[String] rmdupBam

    command <<<
        perl ${codes_dir}/chemo/singlecancer_chemo_2.pl \
            ${codes_dir} ${outputDir} ${normal} ${ref} ${project}
    >>>
}
task cnvkit{
    # Copy-number calling with cnvkit.py batch against a pooled reference,
    # followed by the project CNV post-processing script.
    #   tumor      sample identifier forwarded to the Perl script
    #   outputDir  run output root; cnvkit writes to <outputDir>/cnvkit
    #   codes_dir  directory holding the pooled .cnn reference and the script
    #   project    panel name forwarded to the Perl script
    #   rmdupBam   BAM handed to cnvkit
    #   ref, bed, accessBed, annotateGene are declared but not referenced by
    #   the command.
    String ref
    String bed
    String tumor
    String outputDir
    String accessBed
    String annotateGene
    String codes_dir
    String project
    #for task chain
    String rmdupBam

    command <<<
        echo run cnvkit batch to processing cnv calling
        cnvkit.py batch \
            ${rmdupBam} \
            -r ${codes_dir}/cnvkit-lung17gene-cfDNA/lung17gene_pool_tumor_reference.cnn \
            --drop-low-coverage \
            --scatter \
            --diagram \
            --output-dir ${outputDir}/cnvkit
        perl ${codes_dir}/cnv_targetTherapy_umi.pl ${codes_dir} ${tumor} ${outputDir} ${project}
    >>>

    output{
        String cnv = "${outputDir}/cnvkit/${tumor}.cnv.pos.txt"
    }
}
task dealwithsnvindel{
    # Runs the two project Perl scripts that post-process the annotated
    # SNV/indel calls.
    #   codes_dir  directory holding the scripts
    #   project    panel name forwarded to both scripts
    #   outputDir  run output root
    #   tumor      sample identifier
    #   anno       not referenced by the command; ties this task to the
    #              annotation step that produced it
    String codes_dir
    String project
    String outputDir
    String tumor
    String anno

    command <<<
        perl ${codes_dir}/pick_mut_splice_promoter.pl \
            ${codes_dir} ${tumor} ${outputDir} ${project}
        perl ${codes_dir}/targetTherapy.pl \
            ${codes_dir} ${tumor} ${outputDir} ${project}
    >>>

    output{
        String snv = "${outputDir}/mutation/${tumor}.snvindel.pos.txt"
    }
}
task auto_report{
    # Runs the project reporting and QC-check scripts, then copies selected
    # result files into <outputDir>/report.
    #   codes_dir      directory holding the Python scripts
    #   outputDir      run output root
    #   tumor          sample identifier
    #   snv_result, cnv_result, fusion_result
    #                  not referenced by the command; they order this task
    #                  after the tasks that produce those files
    String codes_dir
    String outputDir
    String tumor
    String snv_result
    String cnv_result
    String fusion_result

    command <<<
        # Report generation and checks
        python ${codes_dir}/sample_post.py -s ${tumor} -o ${outputDir}
        python ${codes_dir}/drug_dedup.py ${outputDir} ${tumor}
        python ${codes_dir}/report_template/lung_17gene_umi_report.py ${outputDir} ${tumor}
        python ${codes_dir}/qc_check.py ${outputDir} ${tumor} c
        python3 ${codes_dir}/wdl_check.py -o ${outputDir}

        # Publish auxiliary files next to the report.
        # NOTE(review): "fuison" is spelled this way in the source file name -
        # confirm it matches what the fusion scripts actually write.
        cp ${outputDir}/fusion/${tumor}.fuison.vus.txt ${outputDir}/report/
        cp ${outputDir}/mutation/${tumor}.snvindel.vus.txt ${outputDir}/report/
        cp ${outputDir}/mutation/${tumor}.target.splicing.txt ${outputDir}/report/
        cp ${outputDir}/qc/${tumor}_qc.txt ${outputDir}/report/
        cp ${outputDir}/qc/qc_fail.txt ${outputDir}/report/
        # cp ${outputDir}/fusion/${tumor}.longindel.pos.txt ${outputDir}/report/
        # Same copy as three lines above; being the final command, its exit
        # status is the exit status of the script.
        cp ${outputDir}/mutation/${tumor}.target.splicing.txt ${outputDir}/report/
    >>>

    output{
        String report = "${outputDir}/report/${tumor}_report.docx"
    }
}