#!/usr/bin/bash ################################################################ ## step1: recording all version informations ## ################################################################ ################# pipeline_version ######################## # pipeline: ${pipeline_version} #################### soft_version ######################## # fastp: ${fastp_version} # bwa: ${bwa_version} # samtools: ${samtools_version} # sentieon: ${sentieon_version} # bcftools: ${bcftools_version} # annovar: ${annovar_version} # vep: ${vep_version} # msisensor2: ${msisensor2_version} # genefuse: ${genefuse_version} # cnvkit: ${cnvkit_version} # bamdst: ${bamdst_version} # seqtk: ${seqtk_version} # htslib: ${htslib_version} # python2: ${python2_version} # python3: ${python3_version} # delly: ${delly_version} #################### bed_version ######################## # bed: ${bed_version} ################# database_version ######################## # reference: ${reference_version} # refgene: ${refgene_version} # cytoBand: ${cytoBand_version} # 1000g: ${g1000_version} # esp: ${esp_version} # exac: ${exac_version} # cosmic: ${cosmic_version} # clinvar: ${clinvar_version} # intervar: ${intervar_version} # dbnsfp: ${dbnsfp_version} # avsnp: ${avsnp_version} # private_database: ${private_database} # pon_database: ${pon_database} ################################################################ ## step2: pipeline for analysis ## ################################################################ # Update with the location of the Sentieon software package and license file export SENTIEON_LICENSE=${SENTIEON_LICENSE} threads=${THREAD} cd $outDir echo "$sampleID -- beginning at: $(date)" $python3 ${checkUMI} -f $fq1 -s ${sampleType} ################# fastp ######################## # fastp is used for remove lower quality bases echo "fastp" echo "$(date)" $fastp -w ${threads} -i $fq1 -o ${sampleID}.R1_001.trim.fq.gz -I $fq2 -O ${sampleID}.R2_001.trim.fq.gz -l 35 -c --detect_adapter_for_pe -j ${sampleID}.jion -h ${sampleID}.html ################# UMI + alignment + dedup ######################## # extract umi info and alignment to reference if [[ ${bedType} =~ ^(${umi_panel})$ ]]; then # remove UMI from following panels echo $'\nsentieon umi extract' echo "$(date)" $sentieon umi extract 21S+T,12M11S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \ ( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \ $sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam echo $'\nsentieon umi consensus' echo "$(date)" $sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz echo $'\nbwa' echo "$(date)" ( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \ $sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam elif [[ ${bedType} =~ (${probe_panel}) && ${sampleType} =~ (ctdna) ]]; then # remove UMI from following panels echo $'\nsentieon umi' echo "$(date)" $sentieon umi extract -d 6M1S+T,6M1S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \ ( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \ $sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam echo $'\nsentieon umi consensus' echo "$(date)" $sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz echo $'\nbwa' echo "$(date)" ( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \ $sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam else # following panels does not contain UMI echo $'\nbwa' echo "$(date)" ( $sentieon bwa mem -t ${threads} -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz || echo -n 'error' ) | \ $sentieon util sort -o ${sampleID}.sorted.bam -t ${threads} --sam2bam -i - if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then echo $'\nRemove Duplicate' echo "$(date)" $sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo LocusCollector --fun score_info ${sampleID}.score.txt $sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo Dedup --score_info ${sampleID}.score.txt --metrics ${sampleID}.dedup_metrics.txt ${sampleID}.sorted.dedup.bam rm ${sampleID}.score.txt* ${sampleID}.dedup_metrics.txt fi fi ################## Variant Calling ###################### # TNscope is used for somatic SNVs & small INDELs calling if [[ ${bedType} =~ ^(${pcr_panel})$ ]]; then echo $'\nHaplotyper' echo "$(date)" $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \ --interval ${bedPath}/${bedType}.bed \ --algo Haplotyper --emit_conf=10 --call_conf=10 \ ${sampleID}.germline.raw.vcf echo $'\nTNscope' echo "$(date)" if [[ ${bedType} =~ (AT|CA) ]]; then $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \ --interval ${bedPath}/${bedType}.bed --interval_padding 10 \ --algo TNscope \ --tumor_sample ${sampleID} \ --dbsnp ${dbsnp138vcf} \ --disable_detector sv \ --max_fisher_pv_active 0.05 \ --min_tumor_allele_frac 0.009 \ --sv_mask_ext 10 \ --prune_factor 0 \ --assemble_mode 2 \ --resample_depth 100000 \ --pon ${pon} \ ${sampleID}.output_tnscope.pre_filter.vcf else $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \ --interval ${bedPath}/${bedType}.bed --interval_padding 10 \ --algo TNscope \ --tumor_sample ${sampleID} \ --dbsnp ${dbsnp138vcf} \ --disable_detector sv \ --max_fisher_pv_active 0.05 \ --min_tumor_allele_frac 0.009 \ --sv_mask_ext 10 \ --prune_factor 0 \ --assemble_mode 2 \ --resample_depth 100000 \ --trim_primer ${bedPath}/${bedType}.amplicon.bed \ --pon ${pon} \ ${sampleID}.output_tnscope.pre_filter.vcf fi echo $'\nbcftools filter' echo "$(date)" $bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \ $bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \ $bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \ $bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \ $bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \ $bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \ $sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf rm ${sampleID}.output_tnscope.pre_filter.vcf* elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then echo $'\nHaplotyper' echo "$(date)" $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \ --interval ${bedPath}/${bedType}.bed \ --algo Haplotyper --emit_conf=10 --call_conf=10 \ ${sampleID}.germline.raw.vcf echo $'\nTNscope' echo "$(date)" $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \ --interval ${bedPath}/${bedType}.bed --interval_padding 10 \ --algo TNscope \ --tumor_sample ${sampleID} \ --dbsnp ${dbsnp138vcf} \ --disable_detector sv \ --max_fisher_pv_active 0.05 \ --min_tumor_allele_frac 0.005 \ --filter_t_alt_frac 0.005 \ --resample_depth 100000 \ --assemble_mode 4 \ --pon ${pon} \ ${sampleID}.output_tnscope.pre_filter.vcf echo $'\nbcftools filter' echo "$(date)" echo $'\nbcftools filter' echo "$(date)" $bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \ $bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \ $bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \ $bcftools filter -m + -s "orientation_bias" -e "FMT/FOXOG[0] == 1" | \ $bcftools filter -m + -s "strand_bias" -e "SOR > 3" | \ $bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \ $bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \ $bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \ $bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -8" | \ $sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf rm ${sampleID}.output_tnscope.pre_filter.vcf* else echo $'\nHaplotyper' echo "$(date)" $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \ --interval ${bedPath}/${bedType}.bed \ --algo Haplotyper --emit_conf=10 --call_conf=10 \ ${sampleID}.germline.raw.vcf echo $'\nTNscope' echo "$(date)" $sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam --interval ${bedPath}/${bedType}.bed \ --algo TNscope \ --tumor_sample ${sampleID} \ --dbsnp ${dbsnp138vcf} \ --pcr_indel_model NONE \ --disable_detector sv \ --min_tumor_allele_frac 0.0003 \ --filter_t_alt_frac 0.0003 \ --max_error_per_read 3 \ --min_init_tumor_lod 3.0 \ --min_tumor_lod 3.0 \ --min_base_qual 40 \ --min_base_qual_asm 40 \ --resample_depth 100000 \ --assemble_mode 4 \ --pon ${pon} \ ${sampleID}.output_tnscope.pre_filter.vcf echo $'\nbcftools filter' echo "$(date)" $bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \ $bcftools filter -m + -s "low_qual" -e "QUAL < 10" | \ $bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \ $bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -5" | \ $bcftools filter -m + -s "base_qual_bias" -e "FMT/BaseQRankSumPS[0] < -5" | \ $sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf rm ${sampleID}.output_tnscope.pre_filter.vcf* fi echo $'\nfilter repeat regions' echo "$(date)" grep "#" ${sampleID}.somatic.raw.vcf > ${sampleID}.somatic.filter.vcf grep -v "#" ${sampleID}.somatic.raw.vcf | awk '{len=split($5,a,",");if(len<3 || (len>=3 && $7 !~ "panel_of_normals")) print $0}' >> ${sampleID}.somatic.filter.vcf ################# 1p19q ######################## if [[ ${bedType} =~ (AT) ]]; then $samtools mpileup -l ${bedPath}/AT.force.bed -f ${hg19fa} -d 30000 -Q 0 -O -s ${sampleID}.sorted.bam > ${sampleID}.mpileup.snp.txt $python3 ${dig_pileup} ${sampleID}.mpileup.snp.txt -o ${sampleID}.mpileup.snp.vcf $python3 ${p1q19_paint} ${sampleID}.mpileup.snp.vcf rm ${sampleID}.mpileup.snp.txt fi ################# bcftools ######################## # bcftools is used for retrieving mutations of interest, or mutations with higher confidence echo $'\nbcftools' echo "$(date)" #germline $bcftools norm -m -both ${sampleID}.germline.raw.vcf -o ${sampleID}.germline.raw.split.vcf $bcftools norm -f ${hg19fa} ${sampleID}.germline.raw.split.vcf -o ${sampleID}.germline.split.norm.vcf rm ${sampleID}.germline.raw.vcf* ${sampleID}.germline.raw.split.vcf # #somatic $bcftools norm -m -both ${sampleID}.somatic.filter.vcf -o ${sampleID}.somatic.raw.split.vcf $bcftools norm -f ${hg19fa} ${sampleID}.somatic.raw.split.vcf -o ${sampleID}.somatic.split.norm.vcf rm ${sampleID}.somatic.filter.vcf ${sampleID}.somatic.raw.split.vcf ###################### ANNOVAR ######################## # used for annotation ###################### germline ######################## echo $'\ngermline' echo "$(date)" mkdir anno ${annovar} ${sampleID}.germline.split.norm.vcf ${humandb} \ --outfile anno/${sampleID}.germline.anno --buildver hg19 \ --protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \ --operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove rm anno/${sampleID}.germline.anno.avinput if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (${germline_panel2}) ]]); then $python3 ${germline_filter} anno/${sampleID}.germline.anno.hg19_multianno.vcf -o anno/${sampleID}.germline.inital.filter.vcf > anno/${sampleID}.germline.filter_reason.txt # vep注释(主要为HGVS) ${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \ --use_given_ref --refseq --assembly GRCh37 --format vcf \ --fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \ --hgvs --hgvsg --transcript_version \ -i anno/${sampleID}.germline.inital.filter.vcf -o anno/${sampleID}.germline.vep_anno.vcf rm anno/${sampleID}.germline.inital.filter.vcf # HGVS命名注释 $python3 ${anno_hgvs} anno/${sampleID}.germline.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.germline.hgvs_anno.vcf rm anno/${sampleID}.germline.vep_anno.vcf* # VEP修复后假阳性删除 $python3 ${snv_second_filter} anno/${sampleID}.germline.hgvs_anno.vcf -k germline -o anno/${sampleID}.germline.final.filter.vcf >>anno/${sampleID}.germline.filter_reason.txt fi ###################### somatic ######################## echo $'\nsomatic' echo "$(date)" # 过滤 nopass 突变 grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.somatic.split.filter.vcf grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{len=split($7,a,";");split($NF,b,":");if(len<2 || (len>=2 && b[3] >=0.01)) print $0}' >> anno/${sampleID}.somatic.split.filter.vcf # annovar注释 ${annovar} anno/${sampleID}.somatic.split.filter.vcf ${humandb} \ --outfile anno/${sampleID}.somatic.annovar --buildver hg19 \ --protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \ --operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove # vep注释(主要为HGVS) ${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \ --use_given_ref --refseq --assembly GRCh37 --format vcf \ --fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \ --hgvs --hgvsg --transcript_version \ -i anno/${sampleID}.somatic.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.vep_anno.vcf # HGVS命名注释 $python3 ${anno_hgvs} anno/${sampleID}.somatic.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.hgvs_anno.vcf # 初步过滤(假阳性等) $python3 ${snv_init_filter} anno/${sampleID}.somatic.hgvs_anno.vcf \ -o anno/${sampleID}.somatic.inital.filter.vcf -t ${sampleType} -m ${mode} > anno/${sampleID}.somatic.filter_reason.txt # 合并同一单倍型上的复杂突变 $python3 ${merge_mnv} anno/${sampleID}.somatic.inital.filter.vcf $hg19fa -o anno/${sampleID}.somatic.merge_mnv.vcf # 将合并后的mnp位置左移校准 $bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.somatic.merge_mnv.vcf -o anno/${sampleID}.somatic.merged_norm.vcf # 拆分MNV突变 $python3 ${find_mnp} anno/${sampleID}.somatic.merged_norm.vcf if [ -f "anno/${sampleID}.somatic.mnp.vcf" ]; then # annovar注释MNV ${annovar} anno/${sampleID}.somatic.mnp.vcf ${humandb} \ --outfile anno/${sampleID}.somatic.mnp.annovar --buildver hg19 \ --protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \ --operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove # vep注释MNV(主要为HGVS) ${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \ --use_given_ref --refseq --assembly GRCh37 --format vcf \ --fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \ --hgvs --hgvsg --transcript_version \ -i anno/${sampleID}.somatic.mnp.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.mnp.vep_anno.vcf # HGVS命名注释MNV $python3 ${anno_hgvs} anno/${sampleID}.somatic.mnp.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.mnp.hgvs_anno.vcf # 合并突变 $bgzip -f anno/${sampleID}.somatic.common.vcf $tabix -p vcf anno/${sampleID}.somatic.common.vcf.gz $bgzip -f anno/${sampleID}.somatic.mnp.hgvs_anno.vcf $tabix -p vcf anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz $bcftools concat anno/${sampleID}.somatic.common.vcf.gz anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz -a -d all -o anno/${sampleID}.somatic.merge.vcf else mv anno/${sampleID}.somatic.merged_norm.vcf anno/${sampleID}.somatic.merge.vcf fi # VEP修复后假阳性及良性变异删除 $python3 ${snv_second_filter} anno/${sampleID}.somatic.merge.vcf -k somatic -o anno/${sampleID}.somatic.final.filter.vcf >>anno/${sampleID}.somatic.filter_reason.txt # 统计变异删除的原因 $python3 ${get_filter_reason} . -k somatic ###################### tmb ######################## if [[ ${bedType} =~ ^(${tmb_panel})$ ]]; then echo $'\nTMB' echo "$(date)" # 过滤低频假阳性 grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.tmb.filter_1.vcf if [[ ${sampleType} =~ (ctdna) ]]; then grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05 && a[3] <= 0.5) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf else grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf fi # 过滤filter信息大于2的突变 grep "#" anno/${sampleID}.tmb.filter_1.vcf > anno/${sampleID}.tmb.filter_2.vcf grep -v "#" anno/${sampleID}.tmb.filter_1.vcf | awk '{len=split($7,a,";");if(len<2) print $0}' >> anno/${sampleID}.tmb.filter_2.vcf # 合并同一单倍型上的复杂突变 $python3 ${merge_mnv} anno/${sampleID}.tmb.filter_2.vcf $hg19fa -o anno/${sampleID}.tmb.merged.vcf # 将合并后的mnp位置左移校准 $bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.tmb.merged.vcf -o anno/${sampleID}.tmb.merged_norm.vcf rm anno/${sampleID}.tmb.merged.vcf # annovar 注释相关数据库 ${annovar} anno/${sampleID}.tmb.merged_norm.vcf ${humandb} \ --outfile anno/${sampleID}.tmb.annovar --buildver hg19 \ --protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \ --operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove rm anno/${sampleID}.tmb.merged_norm.vcf # vep注释(主要为HGVS) ${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \ --use_given_ref --refseq --assembly GRCh37 --format vcf \ --fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \ --hgvs --hgvsg --transcript_version \ -i anno/${sampleID}.tmb.annovar.hg19_multianno.vcf -o anno/${sampleID}.tmb.vep_anno.vcf rm anno/${sampleID}.tmb.annovar* # HGVS命名注释 $python3 ${anno_hgvs} anno/${sampleID}.tmb.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.tmb.hgvs_anno.vcf rm anno/${sampleID}.tmb.vep_anno.vcf* # 过滤其他假阳性,germline等 $python3 ${tmb_filter} anno/${sampleID}.tmb.hgvs_anno.vcf -t ${sampleType} -o anno/${sampleID}.tmb.final.filter.vcf -m ${mode} > anno/${sampleID}.tmb.filter_reason.txt rm anno/${sampleID}.tmb.hgvs_anno.vcf # 统计变异删除的原因 $python3 ${get_filter_reason} . -k tmb rm anno/${sampleID}.tmb.filter_1.vcf anno/${sampleID}.tmb.filter_2.vcf fi rm ${sampleID}.somatic.split.norm.vcf ###################### concat vcf ######################## if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then echo $'\nmerge_vcf' echo "$(date)" cp anno/${sampleID}.germline.final.filter.vcf anno/${sampleID}.germline.final.filter.concat.vcf $bgzip -f anno/${sampleID}.germline.final.filter.concat.vcf $tabix -p vcf anno/${sampleID}.germline.final.filter.concat.vcf.gz cp anno/${sampleID}.somatic.final.filter.vcf anno/${sampleID}.somatic.final.filter.concat.vcf $bgzip -f anno/${sampleID}.somatic.final.filter.concat.vcf $tabix -p vcf anno/${sampleID}.somatic.final.filter.concat.vcf.gz if [[ "$bedType" =~ (${tmb_panel}) ]];then cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb.final.filter.concat.vcf $bgzip -f anno/${sampleID}.tmb.final.filter.concat.vcf $tabix -p vcf anno/${sampleID}.tmb.final.filter.concat.vcf.gz $bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz anno/${sampleID}.tmb.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf rm anno/${sampleID}.tmb.final.filter.concat.vcf.gz* else $bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf fi rm anno/${sampleID}.somatic.final.filter.concat.vcf.gz* anno/${sampleID}.germline.final.filter.concat.vcf.gz* fi ###################### anno drugs ######################## echo $'\nanno_drugs' echo "$(date)" # 注释药物相关数据库 if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then $python3 ${snv_out} anno/${sampleID}.somatic.concat.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt elif [[ "$bedType" =~ (${germline_panel2}) && "$sampleType" =~ (blood|dna) ]];then $python3 ${snv_out} anno/${sampleID}.germline.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt else $python3 ${snv_out} anno/${sampleID}.somatic.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt fi ###################### add_pipe_tag ###################### # 添加流程pipe tag $python3 ${add_pipe_tag} anno/${sampleID}.somatic.xlsx #################### sanger_determine #################### # 确定germline突变是否需要验证 $python3 ${sanger_determine} anno/${sampleID}.somatic.xlsx #################### cis_trans #################### # 确定T790M和G797S共突变的顺式或反式 $python3 ${cis_trans} anno/${sampleID}.somatic.xlsx ###################### core_filter ######################## if [[ "$bedType" =~ (${core_panel}) ]];then $python3 ${core_filter} anno/${sampleID}.somatic.xlsx fi ###################### qc_bamdst ######################## echo $'\nqc_bamdst' echo "$(date)" if [[ ${bedType} =~ ^(${pcr_panel})$ || ${bedType} =~ ^(${umi_panel})$ ]]; then mkdir -p qc/bamdst $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst ${sampleID}.sorted.bam ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{} elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then mkdir -p qc/bamdst $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst ${sampleID}.sorted.dedup.bam ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{} else mkdir -p qc/bamdst/raw if [[ "$sampleType" =~ (ctdna) ]]; then $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/raw ${sampleID}.sorted.bam else $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/raw ${sampleID}.sorted.bam fi ls qc/bamdst/raw/ | xargs -I{} mv qc/bamdst/raw/{} qc/bamdst/raw/${sampleID}.raw.{} mkdir -p qc/bamdst/dedup if [[ "$sampleType" =~ (ctdna) ]]; then $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam else $bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam fi ls qc/bamdst/dedup | xargs -I{} mv qc/bamdst/dedup/{} qc/bamdst/dedup/${sampleID}.dedup.{} fi ###################### chemical ######################## echo $'\nchemical' echo "$(date)" if [[ ${bedType} =~ ^(DA)$ ]]; then $samtools bedcov ${bedPath}/chemical_DA.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt else $samtools bedcov ${bedPath}/chemical_DF.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt fi if [[ "$bedType" =~ (${chemical_panel}) ]]; then $python3 ${germline_chemical} anno/${sampleID}.germline.anno.hg19_multianno.txt >anno/${sampleID}.chemical.txt $python3 ${check_sample_data} anno/${sampleID}.chemical.txt rm ${sampleID}.chemical.depth.txt anno/${sampleID}.chemical.txt fi ###################### Copy Number Aberration ###################### ### hybrid capture: DG|DS|DH|DT|DL|DW|DR|DJ|DF|DQ|DE ### targeted amplicon seq: DP|HH|HT|HW|HR|HC|HF|ZJ|ZG|YW|QJ|YR|YZ|YW|YF ### 需要做cnv的panel:DF, DG, DJ, DQ, DR, DS, DW, DE ##################### CNVkit ######################## if [[ ${bedType} =~ ^(${cnv_panel1})$ ]]; then echo "CNA starts at -- `date` " if [[ ${bedType} =~ ^(${cnv_panel2})$ ]]; then $python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn $python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr $python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns elif [[ ${bedType} =~ ^(${cnv_panel3})$ ]]; then $python3 $cnvkit coverage ${sampleID}.sorted.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn $python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn --no-edge -o ${sampleID}.cnr $python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns else $python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn $python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_anti}/${bedType}.antitarget.bed -p ${threads} -o ${sampleID}.antitargetcoverage.cnn $python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${sampleID}.antitargetcoverage.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr $python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns fi $python3 $cnvkit call ${sampleID}.cns --center --drop-low-coverage -o ${sampleID}.call.cns $python3 $cnvkit export vcf ${sampleID}.call.cns -i "${sampleID}" -o ${sampleID}.cnv.vcf $python3 $annotate_vcf -i ${sampleID}.cnv.vcf -r $refFlat -o ${sampleID}.annotated.cnv.csv $python3 ${cna_out} ${sampleID}.annotated.cnv.csv ##################### THeTA ######################## # THeTA -- calculate sample purity (tumour/normal ratio) $python3 $cnvkit export theta ${sampleID}.cns -r ${cnv_ref}/${bedType}.flat.cnn ${theta} ${sampleID}.interval_count --NUM_PROCESSES ${threads} echo "CNA analysis completed at: `date` " rm ${sampleID}.*coverage.cnn ${sampleID}.cnr ${sampleID}.cnv.vcf ${sampleID}.cns ${sampleID}.call.cns ${sampleID}.interval_count ${sampleID}.n2.graph.pdf ${sampleID}.n2.withBounds fi ################### Microsatelite instability ###################### # ###################### MSIsensor2 ###################### # # Detect msi on below panel using msisensor if [[ ${bedType} =~ ^(${msi_panel})$ ]]; then echo $'\nmsisensor2' echo "$(date)" $msisensor2 msi -M ${msi_models} -t ${sampleID}.sorted.dedup.bam -o ${sampleID}.prefix fi ########################## Fusion of genes ######################## ##################### genefuse ######################## # Detect potential gene fusion on below panels. if [[ ${bedType} =~ ^(${fusion_panel})$ ]]; then echo $'\ngenefuse' echo "$(date)" $python3 ${make_fusion_gene} ${sampleID} if [[ "$sampleType" =~ (ctdna) ]]; then $genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.umi_consensus.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt rm ${sampleID}.umi_consensus.fq.gz else $genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.R1_001.trim.fq.gz -2 ${sampleID}.R2_001.trim.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt fi echo $'\ndelly' echo "$(date)" $delly call -x ${delly_low_regions} -g ${hg19fa} -o ${sampleID}.bcf ${sampleID}.sorted.dedup.bam $bcftools view ${sampleID}.bcf > ${sampleID}.sv.raw.vcf rm ${sampleID}.bcf* $python3 $delly_sv_filter ${sampleID}.sv.raw.vcf rm ${sampleID}.delly* $python3 $fusion_filter ${sampleID}.fusion.json -o ${sampleID}.fusion_filtered.json $python3 ${fusion_out} ${sampleID}.fusion_filtered.json $python3 ${check_fusion_result} ${sampleID}.fusion.xlsx rm ${sampleID}.fusion.txt fi ########################## Quality control ######################### ##################### Fastp ######################## # This step is used for quality control by fastqc. mkdir -p qc/fastp echo $'\nQC_fastp' echo "$(date)" mv ${sampleID}.jion ${sampleID}.html qc/fastp/ ##################### samtools ######################## # This step is used for quality control by samtools. mkdir -p qc/samtools echo $'\nQC_samtools' echo "$(date)" $samtools stats -@ ${threads} ${sampleID}.sorted.bam >qc/samtools/${sampleID}.sorted.bam.stats.txt ##################### hla_type_neo_predict ######################## # This step is used for hla_type_neo_predict by hlahd and MuPeXI. if [[ "$bedType" =~ ^(${hla_panel})$ ]]; then echo $'\nHLA' echo "$(date)" cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb_copy.vcf grep '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.header grep -v '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.body gunzip -k ${sampleID}.R1_001.trim.fq.gz gunzip -k ${sampleID}.R2_001.trim.fq.gz # get 300w reads, about 1GB to to hla typing $seqtk sample -s100 ${sampleID}.R1_001.trim.fq 3000000 > ${sampleID}.R1_001.trim.sub.fq $seqtk sample -s100 ${sampleID}.R2_001.trim.fq 3000000 > ${sampleID}.R2_001.trim.sub.fq $python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -results_TMB_txt anno/${sampleID}.body -fq1 ${sampleID}.R1_001.trim.sub.fq -fq2 ${sampleID}.R2_001.trim.sub.fq # 拼接vcf cat anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt > anno/${sampleID}.mupexi.vcf $python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -vcf anno/${sampleID}.mupexi.vcf -determine_tnb anno/${sampleID}.body rm ${sampleID}.R*_001.trim.fq ${sampleID}.R*_001.trim.sub.fq \ anno/${sampleID}.body anno/${sampleID}.tmb_copy.vcf anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt \ neo/${sampleID}_coverage_plot.pdf neo/${sampleID}.fasta neo/${sampleID}.g \ neo/${sampleID}_netMHCpan* neo/${sampleID}_pepmatch_* neo/${sampleID}_peptide_netMHCinput.txt neo/${sampleID}_reference_peptide_* \ neo/${sampleID}_vep_compatible_vcf.vcf neo/${sampleID}_vep.vep fi ########################## merge_data ######################### $python3 ${single_sample_qc} . ${sampleID} ${sampleType} $python3 ${parser_single_sample_data} . ${sampleID} $python3 ${collect_single_sample_data} . ${sampleID} rm ${sampleID}.R*_001.trim.fq.gz if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then rm ${sampleID}.sorted.bam* fi echo "$sampleID -- ending at: $(date)" echo "pipeline finished!" >finish.txt