pipeline/other/pipeline_single.sh

715 lines
30 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/bash
################################################################
## step1: recording all version informations ##
################################################################
################# pipeline_version ########################
# pipeline: ${pipeline_version}
#################### soft_version ########################
# fastp: ${fastp_version}
# bwa: ${bwa_version}
# samtools: ${samtools_version}
# sentieon: ${sentieon_version}
# bcftools: ${bcftools_version}
# annovar: ${annovar_version}
# vep: ${vep_version}
# msisensor2: ${msisensor2_version}
# genefuse: ${genefuse_version}
# cnvkit: ${cnvkit_version}
# bamdst: ${bamdst_version}
# seqtk: ${seqtk_version}
# htslib: ${htslib_version}
# python2: ${python2_version}
# python3: ${python3_version}
# delly: ${delly_version}
#################### bed_version ########################
# bed: ${bed_version}
################# database_version ########################
# reference: ${reference_version}
# refgene: ${refgene_version}
# cytoBand: ${cytoBand_version}
# 1000g: ${g1000_version}
# esp: ${esp_version}
# exac: ${exac_version}
# cosmic: ${cosmic_version}
# clinvar: ${clinvar_version}
# intervar: ${intervar_version}
# dbnsfp: ${dbnsfp_version}
# avsnp: ${avsnp_version}
# private_database: ${private_database}
# pon_database: ${pon_database}
################################################################
## step2: pipeline for analysis ##
################################################################
# Update with the location of the Sentieon software package and license file
export SENTIEON_LICENSE=${SENTIEON_LICENSE}
threads=${THREAD}
cd $outDir
echo "$sampleID -- beginning at: $(date)"
$python3 ${checkUMI} -f $fq1 -s ${sampleType}
################# fastp ########################
# fastp is used for remove lower quality bases
echo "fastp"
echo "$(date)"
$fastp -w ${threads} -i $fq1 -o ${sampleID}.R1_001.trim.fq.gz -I $fq2 -O ${sampleID}.R2_001.trim.fq.gz -l 35 -c --detect_adapter_for_pe -j ${sampleID}.jion -h ${sampleID}.html
################# UMI + alignment + dedup ########################
# extract umi info and alignment to reference
if [[ ${bedType} =~ ^(${umi_panel})$ ]];
then
# remove UMI from following panels
echo $'\nsentieon umi extract'
echo "$(date)"
$sentieon umi extract 21S+T,12M11S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
echo $'\nsentieon umi consensus'
echo "$(date)"
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
echo $'\nbwa'
echo "$(date)"
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
elif [[ ${bedType} =~ (${probe_panel}) && ${sampleType} =~ (ctdna) ]];
then
# remove UMI from following panels
echo $'\nsentieon umi'
echo "$(date)"
$sentieon umi extract -d 6M1S+T,6M1S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
echo $'\nsentieon umi consensus'
echo "$(date)"
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
echo $'\nbwa'
echo "$(date)"
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
else
# following panels does not contain UMI
echo $'\nbwa'
echo "$(date)"
( $sentieon bwa mem -t ${threads} -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz || echo -n 'error' ) | \
$sentieon util sort -o ${sampleID}.sorted.bam -t ${threads} --sam2bam -i -
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
echo $'\nRemove Duplicate'
echo "$(date)"
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo LocusCollector --fun score_info ${sampleID}.score.txt
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo Dedup --score_info ${sampleID}.score.txt --metrics ${sampleID}.dedup_metrics.txt ${sampleID}.sorted.dedup.bam
rm ${sampleID}.score.txt* ${sampleID}.dedup_metrics.txt
fi
fi
################## Variant Calling ######################
# TNscope is used for somatic SNVs & small INDELs calling
if [[ ${bedType} =~ ^(${pcr_panel})$ ]]; then
echo $'\nHaplotyper'
echo "$(date)"
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
--interval ${bedPath}/${bedType}.bed \
--algo Haplotyper --emit_conf=10 --call_conf=10 \
${sampleID}.germline.raw.vcf
echo $'\nTNscope'
echo "$(date)"
if [[ ${bedType} =~ (AT|CA) ]];
then
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
--algo TNscope \
--tumor_sample ${sampleID} \
--dbsnp ${dbsnp138vcf} \
--disable_detector sv \
--max_fisher_pv_active 0.05 \
--min_tumor_allele_frac 0.009 \
--sv_mask_ext 10 \
--prune_factor 0 \
--assemble_mode 2 \
--resample_depth 100000 \
--pon ${pon} \
${sampleID}.output_tnscope.pre_filter.vcf
else
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
--algo TNscope \
--tumor_sample ${sampleID} \
--dbsnp ${dbsnp138vcf} \
--disable_detector sv \
--max_fisher_pv_active 0.05 \
--min_tumor_allele_frac 0.009 \
--sv_mask_ext 10 \
--prune_factor 0 \
--assemble_mode 2 \
--resample_depth 100000 \
--trim_primer ${bedPath}/${bedType}.amplicon.bed \
--pon ${pon} \
${sampleID}.output_tnscope.pre_filter.vcf
fi
echo $'\nbcftools filter'
echo "$(date)"
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
rm ${sampleID}.output_tnscope.pre_filter.vcf*
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
echo $'\nHaplotyper'
echo "$(date)"
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
--interval ${bedPath}/${bedType}.bed \
--algo Haplotyper --emit_conf=10 --call_conf=10 \
${sampleID}.germline.raw.vcf
echo $'\nTNscope'
echo "$(date)"
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
--algo TNscope \
--tumor_sample ${sampleID} \
--dbsnp ${dbsnp138vcf} \
--disable_detector sv \
--max_fisher_pv_active 0.05 \
--min_tumor_allele_frac 0.005 \
--filter_t_alt_frac 0.005 \
--resample_depth 100000 \
--assemble_mode 4 \
--pon ${pon} \
${sampleID}.output_tnscope.pre_filter.vcf
echo $'\nbcftools filter'
echo "$(date)"
echo $'\nbcftools filter'
echo "$(date)"
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
$bcftools filter -m + -s "orientation_bias" -e "FMT/FOXOG[0] == 1" | \
$bcftools filter -m + -s "strand_bias" -e "SOR > 3" | \
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -8" | \
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
rm ${sampleID}.output_tnscope.pre_filter.vcf*
else
echo $'\nHaplotyper'
echo "$(date)"
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
--interval ${bedPath}/${bedType}.bed \
--algo Haplotyper --emit_conf=10 --call_conf=10 \
${sampleID}.germline.raw.vcf
echo $'\nTNscope'
echo "$(date)"
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam --interval ${bedPath}/${bedType}.bed \
--algo TNscope \
--tumor_sample ${sampleID} \
--dbsnp ${dbsnp138vcf} \
--pcr_indel_model NONE \
--disable_detector sv \
--min_tumor_allele_frac 0.0003 \
--filter_t_alt_frac 0.0003 \
--max_error_per_read 3 \
--min_init_tumor_lod 3.0 \
--min_tumor_lod 3.0 \
--min_base_qual 40 \
--min_base_qual_asm 40 \
--resample_depth 100000 \
--assemble_mode 4 \
--pon ${pon} \
${sampleID}.output_tnscope.pre_filter.vcf
echo $'\nbcftools filter'
echo "$(date)"
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
$bcftools filter -m + -s "low_qual" -e "QUAL < 10" | \
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -5" | \
$bcftools filter -m + -s "base_qual_bias" -e "FMT/BaseQRankSumPS[0] < -5" | \
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
rm ${sampleID}.output_tnscope.pre_filter.vcf*
fi
echo $'\nfilter repeat regions'
echo "$(date)"
grep "#" ${sampleID}.somatic.raw.vcf > ${sampleID}.somatic.filter.vcf
grep -v "#" ${sampleID}.somatic.raw.vcf | awk '{len=split($5,a,",");if(len<3 || (len>=3 && $7 !~ "panel_of_normals")) print $0}' >> ${sampleID}.somatic.filter.vcf
################# 1p19q ########################
if [[ ${bedType} =~ (AT) ]];
then
$samtools mpileup -l ${bedPath}/AT.force.bed -f ${hg19fa} -d 30000 -Q 0 -O -s ${sampleID}.sorted.bam > ${sampleID}.mpileup.snp.txt
$python3 ${dig_pileup} ${sampleID}.mpileup.snp.txt -o ${sampleID}.mpileup.snp.vcf
$python3 ${p1q19_paint} ${sampleID}.mpileup.snp.vcf
rm ${sampleID}.mpileup.snp.txt
fi
################# bcftools ########################
# bcftools is used for retrieving mutations of interest, or mutations with higher confidence
echo $'\nbcftools'
echo "$(date)"
#germline
$bcftools norm -m -both ${sampleID}.germline.raw.vcf -o ${sampleID}.germline.raw.split.vcf
$bcftools norm -f ${hg19fa} ${sampleID}.germline.raw.split.vcf -o ${sampleID}.germline.split.norm.vcf
rm ${sampleID}.germline.raw.vcf* ${sampleID}.germline.raw.split.vcf
# #somatic
$bcftools norm -m -both ${sampleID}.somatic.filter.vcf -o ${sampleID}.somatic.raw.split.vcf
$bcftools norm -f ${hg19fa} ${sampleID}.somatic.raw.split.vcf -o ${sampleID}.somatic.split.norm.vcf
rm ${sampleID}.somatic.filter.vcf ${sampleID}.somatic.raw.split.vcf
###################### ANNOVAR ########################
# used for annotation
###################### germline ########################
echo $'\ngermline'
echo "$(date)"
mkdir anno
${annovar} ${sampleID}.germline.split.norm.vcf ${humandb} \
--outfile anno/${sampleID}.germline.anno --buildver hg19 \
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
rm anno/${sampleID}.germline.anno.avinput
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (${germline_panel2}) ]]); then
$python3 ${germline_filter} anno/${sampleID}.germline.anno.hg19_multianno.vcf -o anno/${sampleID}.germline.inital.filter.vcf > anno/${sampleID}.germline.filter_reason.txt
# vep注释主要为HGVS
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
--use_given_ref --refseq --assembly GRCh37 --format vcf \
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
--hgvs --hgvsg --transcript_version \
-i anno/${sampleID}.germline.inital.filter.vcf -o anno/${sampleID}.germline.vep_anno.vcf
rm anno/${sampleID}.germline.inital.filter.vcf
# HGVS命名注释
$python3 ${anno_hgvs} anno/${sampleID}.germline.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.germline.hgvs_anno.vcf
rm anno/${sampleID}.germline.vep_anno.vcf*
# VEP修复后假阳性删除
$python3 ${snv_second_filter} anno/${sampleID}.germline.hgvs_anno.vcf -k germline -o anno/${sampleID}.germline.final.filter.vcf >>anno/${sampleID}.germline.filter_reason.txt
fi
###################### somatic ########################
echo $'\nsomatic'
echo "$(date)"
# 过滤 nopass 突变
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.somatic.split.filter.vcf
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{len=split($7,a,";");split($NF,b,":");if(len<2 || (len>=2 && b[3] >=0.01)) print $0}' >> anno/${sampleID}.somatic.split.filter.vcf
# annovar注释
${annovar} anno/${sampleID}.somatic.split.filter.vcf ${humandb} \
--outfile anno/${sampleID}.somatic.annovar --buildver hg19 \
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
# vep注释(主要为HGVS)
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
--use_given_ref --refseq --assembly GRCh37 --format vcf \
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
--hgvs --hgvsg --transcript_version \
-i anno/${sampleID}.somatic.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.vep_anno.vcf
# HGVS命名注释
$python3 ${anno_hgvs} anno/${sampleID}.somatic.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.hgvs_anno.vcf
# 初步过滤(假阳性等)
$python3 ${snv_init_filter} anno/${sampleID}.somatic.hgvs_anno.vcf \
-o anno/${sampleID}.somatic.inital.filter.vcf -t ${sampleType} -m ${mode} > anno/${sampleID}.somatic.filter_reason.txt
# 合并同一单倍型上的复杂突变
$python3 ${merge_mnv} anno/${sampleID}.somatic.inital.filter.vcf $hg19fa -o anno/${sampleID}.somatic.merge_mnv.vcf
# 将合并后的mnp位置左移校准
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.somatic.merge_mnv.vcf -o anno/${sampleID}.somatic.merged_norm.vcf
# 拆分MNV突变
$python3 ${find_mnp} anno/${sampleID}.somatic.merged_norm.vcf
if [ -f "anno/${sampleID}.somatic.mnp.vcf" ];
then
# annovar注释MNV
${annovar} anno/${sampleID}.somatic.mnp.vcf ${humandb} \
--outfile anno/${sampleID}.somatic.mnp.annovar --buildver hg19 \
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
# vep注释MNV(主要为HGVS)
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
--use_given_ref --refseq --assembly GRCh37 --format vcf \
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
--hgvs --hgvsg --transcript_version \
-i anno/${sampleID}.somatic.mnp.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.mnp.vep_anno.vcf
# HGVS命名注释MNV
$python3 ${anno_hgvs} anno/${sampleID}.somatic.mnp.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
# 合并突变
$bgzip -f anno/${sampleID}.somatic.common.vcf
$tabix -p vcf anno/${sampleID}.somatic.common.vcf.gz
$bgzip -f anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
$tabix -p vcf anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz
$bcftools concat anno/${sampleID}.somatic.common.vcf.gz anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz -a -d all -o anno/${sampleID}.somatic.merge.vcf
else
mv anno/${sampleID}.somatic.merged_norm.vcf anno/${sampleID}.somatic.merge.vcf
fi
# VEP修复后假阳性及良性变异删除
$python3 ${snv_second_filter} anno/${sampleID}.somatic.merge.vcf -k somatic -o anno/${sampleID}.somatic.final.filter.vcf >>anno/${sampleID}.somatic.filter_reason.txt
# 统计变异删除的原因
$python3 ${get_filter_reason} . -k somatic
###################### tmb ########################
if [[ ${bedType} =~ ^(${tmb_panel})$ ]]; then
echo $'\nTMB'
echo "$(date)"
# 过滤低频假阳性
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.tmb.filter_1.vcf
if [[ ${sampleType} =~ (ctdna) ]]; then
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05 && a[3] <= 0.5) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
else
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
fi
# 过滤filter信息大于2的突变
grep "#" anno/${sampleID}.tmb.filter_1.vcf > anno/${sampleID}.tmb.filter_2.vcf
grep -v "#" anno/${sampleID}.tmb.filter_1.vcf | awk '{len=split($7,a,";");if(len<2) print $0}' >> anno/${sampleID}.tmb.filter_2.vcf
# 合并同一单倍型上的复杂突变
$python3 ${merge_mnv} anno/${sampleID}.tmb.filter_2.vcf $hg19fa -o anno/${sampleID}.tmb.merged.vcf
# 将合并后的mnp位置左移校准
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.tmb.merged.vcf -o anno/${sampleID}.tmb.merged_norm.vcf
rm anno/${sampleID}.tmb.merged.vcf
# annovar 注释相关数据库
${annovar} anno/${sampleID}.tmb.merged_norm.vcf ${humandb} \
--outfile anno/${sampleID}.tmb.annovar --buildver hg19 \
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
rm anno/${sampleID}.tmb.merged_norm.vcf
# vep注释主要为HGVS
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
--use_given_ref --refseq --assembly GRCh37 --format vcf \
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
--hgvs --hgvsg --transcript_version \
-i anno/${sampleID}.tmb.annovar.hg19_multianno.vcf -o anno/${sampleID}.tmb.vep_anno.vcf
rm anno/${sampleID}.tmb.annovar*
# HGVS命名注释
$python3 ${anno_hgvs} anno/${sampleID}.tmb.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.tmb.hgvs_anno.vcf
rm anno/${sampleID}.tmb.vep_anno.vcf*
# 过滤其他假阳性germline等
$python3 ${tmb_filter} anno/${sampleID}.tmb.hgvs_anno.vcf -t ${sampleType} -o anno/${sampleID}.tmb.final.filter.vcf -m ${mode} > anno/${sampleID}.tmb.filter_reason.txt
rm anno/${sampleID}.tmb.hgvs_anno.vcf
# 统计变异删除的原因
$python3 ${get_filter_reason} . -k tmb
rm anno/${sampleID}.tmb.filter_1.vcf anno/${sampleID}.tmb.filter_2.vcf
fi
rm ${sampleID}.somatic.split.norm.vcf
###################### concat vcf ########################
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
echo $'\nmerge_vcf'
echo "$(date)"
cp anno/${sampleID}.germline.final.filter.vcf anno/${sampleID}.germline.final.filter.concat.vcf
$bgzip -f anno/${sampleID}.germline.final.filter.concat.vcf
$tabix -p vcf anno/${sampleID}.germline.final.filter.concat.vcf.gz
cp anno/${sampleID}.somatic.final.filter.vcf anno/${sampleID}.somatic.final.filter.concat.vcf
$bgzip -f anno/${sampleID}.somatic.final.filter.concat.vcf
$tabix -p vcf anno/${sampleID}.somatic.final.filter.concat.vcf.gz
if [[ "$bedType" =~ (${tmb_panel}) ]];then
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb.final.filter.concat.vcf
$bgzip -f anno/${sampleID}.tmb.final.filter.concat.vcf
$tabix -p vcf anno/${sampleID}.tmb.final.filter.concat.vcf.gz
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz anno/${sampleID}.tmb.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
rm anno/${sampleID}.tmb.final.filter.concat.vcf.gz*
else
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
fi
rm anno/${sampleID}.somatic.final.filter.concat.vcf.gz* anno/${sampleID}.germline.final.filter.concat.vcf.gz*
fi
###################### anno drugs ########################
echo $'\nanno_drugs'
echo "$(date)"
# 注释药物相关数据库
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
$python3 ${snv_out} anno/${sampleID}.somatic.concat.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
elif [[ "$bedType" =~ (${germline_panel2}) && "$sampleType" =~ (blood|dna) ]];then
$python3 ${snv_out} anno/${sampleID}.germline.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
else
$python3 ${snv_out} anno/${sampleID}.somatic.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
fi
###################### add_pipe_tag ######################
# 添加流程pipe tag
$python3 ${add_pipe_tag} anno/${sampleID}.somatic.xlsx
#################### sanger_determine ####################
# 确定germline突变是否需要验证
$python3 ${sanger_determine} anno/${sampleID}.somatic.xlsx
#################### cis_trans ####################
# 确定T790M和G797S共突变的顺式或反式
$python3 ${cis_trans} anno/${sampleID}.somatic.xlsx
###################### core_filter ########################
if [[ "$bedType" =~ (${core_panel}) ]];then
$python3 ${core_filter} anno/${sampleID}.somatic.xlsx
fi
###################### qc_bamdst ########################
echo $'\nqc_bamdst'
echo "$(date)"
if [[ ${bedType} =~ ^(${pcr_panel})$ || ${bedType} =~ ^(${umi_panel})$ ]]; then
mkdir -p qc/bamdst
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst ${sampleID}.sorted.bam
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
mkdir -p qc/bamdst
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst ${sampleID}.sorted.dedup.bam
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
else
mkdir -p qc/bamdst/raw
if [[ "$sampleType" =~ (ctdna) ]]; then
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/raw ${sampleID}.sorted.bam
else
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/raw ${sampleID}.sorted.bam
fi
ls qc/bamdst/raw/ | xargs -I{} mv qc/bamdst/raw/{} qc/bamdst/raw/${sampleID}.raw.{}
mkdir -p qc/bamdst/dedup
if [[ "$sampleType" =~ (ctdna) ]]; then
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
else
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
fi
ls qc/bamdst/dedup | xargs -I{} mv qc/bamdst/dedup/{} qc/bamdst/dedup/${sampleID}.dedup.{}
fi
###################### chemical ########################
echo $'\nchemical'
echo "$(date)"
if [[ ${bedType} =~ ^(DA)$ ]]; then
$samtools bedcov ${bedPath}/chemical_DA.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
else
$samtools bedcov ${bedPath}/chemical_DF.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
fi
if [[ "$bedType" =~ (${chemical_panel}) ]]; then
$python3 ${germline_chemical} anno/${sampleID}.germline.anno.hg19_multianno.txt >anno/${sampleID}.chemical.txt
$python3 ${check_sample_data} anno/${sampleID}.chemical.txt
rm ${sampleID}.chemical.depth.txt anno/${sampleID}.chemical.txt
fi
###################### Copy Number Aberration ######################
### hybrid capture: DG|DS|DH|DT|DL|DW|DR|DJ|DF|DQ|DE
### targeted amplicon seq: DP|HH|HT|HW|HR|HC|HF|ZJ|ZG|YW|QJ|YR|YZ|YW|YF
### 需要做cnv的panelDF, DG, DJ, DQ, DR, DS, DW, DE
##################### CNVkit ########################
if [[ ${bedType} =~ ^(${cnv_panel1})$ ]];
then
echo "CNA starts at -- `date` "
if [[ ${bedType} =~ ^(${cnv_panel2})$ ]];
then
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
elif [[ ${bedType} =~ ^(${cnv_panel3})$ ]];
then
$python3 $cnvkit coverage ${sampleID}.sorted.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn --no-edge -o ${sampleID}.cnr
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
else
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_anti}/${bedType}.antitarget.bed -p ${threads} -o ${sampleID}.antitargetcoverage.cnn
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${sampleID}.antitargetcoverage.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
fi
$python3 $cnvkit call ${sampleID}.cns --center --drop-low-coverage -o ${sampleID}.call.cns
$python3 $cnvkit export vcf ${sampleID}.call.cns -i "${sampleID}" -o ${sampleID}.cnv.vcf
$python3 $annotate_vcf -i ${sampleID}.cnv.vcf -r $refFlat -o ${sampleID}.annotated.cnv.csv
$python3 ${cna_out} ${sampleID}.annotated.cnv.csv
##################### THeTA ########################
# THeTA -- calculate sample purity (tumour/normal ratio)
$python3 $cnvkit export theta ${sampleID}.cns -r ${cnv_ref}/${bedType}.flat.cnn
${theta} ${sampleID}.interval_count --NUM_PROCESSES ${threads}
echo "CNA analysis completed at: `date` "
rm ${sampleID}.*coverage.cnn ${sampleID}.cnr ${sampleID}.cnv.vcf ${sampleID}.cns ${sampleID}.call.cns ${sampleID}.interval_count ${sampleID}.n2.graph.pdf ${sampleID}.n2.withBounds
fi
################### Microsatelite instability ######################
# ###################### MSIsensor2 ######################
# # Detect msi on below panel using msisensor
if [[ ${bedType} =~ ^(${msi_panel})$ ]];
then
echo $'\nmsisensor2'
echo "$(date)"
$msisensor2 msi -M ${msi_models} -t ${sampleID}.sorted.dedup.bam -o ${sampleID}.prefix
fi
########################## Fusion of genes ########################
##################### genefuse ########################
# Detect potential gene fusion on below panels.
if [[ ${bedType} =~ ^(${fusion_panel})$ ]]; then
echo $'\ngenefuse'
echo "$(date)"
$python3 ${make_fusion_gene} ${sampleID}
if [[ "$sampleType" =~ (ctdna) ]]; then
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.umi_consensus.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
rm ${sampleID}.umi_consensus.fq.gz
else
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.R1_001.trim.fq.gz -2 ${sampleID}.R2_001.trim.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
fi
echo $'\ndelly'
echo "$(date)"
$delly call -x ${delly_low_regions} -g ${hg19fa} -o ${sampleID}.bcf ${sampleID}.sorted.dedup.bam
$bcftools view ${sampleID}.bcf > ${sampleID}.sv.raw.vcf
rm ${sampleID}.bcf*
$python3 $delly_sv_filter ${sampleID}.sv.raw.vcf
rm ${sampleID}.delly*
$python3 $fusion_filter ${sampleID}.fusion.json -o ${sampleID}.fusion_filtered.json
$python3 ${fusion_out} ${sampleID}.fusion_filtered.json
$python3 ${check_fusion_result} ${sampleID}.fusion.xlsx
rm ${sampleID}.fusion.txt
fi
########################## Quality control #########################
##################### Fastp ########################
# This step is used for quality control by fastqc.
mkdir -p qc/fastp
echo $'\nQC_fastp'
echo "$(date)"
mv ${sampleID}.jion ${sampleID}.html qc/fastp/
##################### samtools ########################
# This step is used for quality control by samtools.
mkdir -p qc/samtools
echo $'\nQC_samtools'
echo "$(date)"
$samtools stats -@ ${threads} ${sampleID}.sorted.bam >qc/samtools/${sampleID}.sorted.bam.stats.txt
##################### hla_type_neo_predict ########################
# This step is used for hla_type_neo_predict by hlahd and MuPeXI.
if [[ "$bedType" =~ ^(${hla_panel})$ ]]; then
echo $'\nHLA'
echo "$(date)"
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb_copy.vcf
grep '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.header
grep -v '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.body
gunzip -k ${sampleID}.R1_001.trim.fq.gz
gunzip -k ${sampleID}.R2_001.trim.fq.gz
# get 300w reads, about 1GB to to hla typing
$seqtk sample -s100 ${sampleID}.R1_001.trim.fq 3000000 > ${sampleID}.R1_001.trim.sub.fq
$seqtk sample -s100 ${sampleID}.R2_001.trim.fq 3000000 > ${sampleID}.R2_001.trim.sub.fq
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -results_TMB_txt anno/${sampleID}.body -fq1 ${sampleID}.R1_001.trim.sub.fq -fq2 ${sampleID}.R2_001.trim.sub.fq
# 拼接vcf
cat anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt > anno/${sampleID}.mupexi.vcf
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -vcf anno/${sampleID}.mupexi.vcf -determine_tnb anno/${sampleID}.body
rm ${sampleID}.R*_001.trim.fq ${sampleID}.R*_001.trim.sub.fq \
anno/${sampleID}.body anno/${sampleID}.tmb_copy.vcf anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt \
neo/${sampleID}_coverage_plot.pdf neo/${sampleID}.fasta neo/${sampleID}.g \
neo/${sampleID}_netMHCpan* neo/${sampleID}_pepmatch_* neo/${sampleID}_peptide_netMHCinput.txt neo/${sampleID}_reference_peptide_* \
neo/${sampleID}_vep_compatible_vcf.vcf neo/${sampleID}_vep.vep
fi
########################## merge_data #########################
$python3 ${single_sample_qc} . ${sampleID} ${sampleType}
$python3 ${parser_single_sample_data} . ${sampleID}
$python3 ${collect_single_sample_data} . ${sampleID}
rm ${sampleID}.R*_001.trim.fq.gz
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
rm ${sampleID}.sorted.bam*
fi
echo "$sampleID -- ending at: $(date)"
echo "pipeline finished!" >finish.txt