715 lines
30 KiB
Bash
715 lines
30 KiB
Bash
#!/usr/bin/bash
|
||
|
||
################################################################
|
||
## step1: recording all version informations ##
|
||
################################################################
|
||
|
||
################# pipeline_version ########################
|
||
# pipeline: ${pipeline_version}
|
||
|
||
#################### soft_version ########################
|
||
# fastp: ${fastp_version}
|
||
# bwa: ${bwa_version}
|
||
# samtools: ${samtools_version}
|
||
# sentieon: ${sentieon_version}
|
||
# bcftools: ${bcftools_version}
|
||
# annovar: ${annovar_version}
|
||
# vep: ${vep_version}
|
||
# msisensor2: ${msisensor2_version}
|
||
# genefuse: ${genefuse_version}
|
||
# cnvkit: ${cnvkit_version}
|
||
# bamdst: ${bamdst_version}
|
||
# seqtk: ${seqtk_version}
|
||
# htslib: ${htslib_version}
|
||
# python2: ${python2_version}
|
||
# python3: ${python3_version}
|
||
# delly: ${delly_version}
|
||
|
||
#################### bed_version ########################
|
||
# bed: ${bed_version}
|
||
|
||
################# database_version ########################
|
||
# reference: ${reference_version}
|
||
# refgene: ${refgene_version}
|
||
# cytoBand: ${cytoBand_version}
|
||
# 1000g: ${g1000_version}
|
||
# esp: ${esp_version}
|
||
# exac: ${exac_version}
|
||
# cosmic: ${cosmic_version}
|
||
# clinvar: ${clinvar_version}
|
||
# intervar: ${intervar_version}
|
||
# dbnsfp: ${dbnsfp_version}
|
||
# avsnp: ${avsnp_version}
|
||
# private_database: ${private_database}
|
||
# pon_database: ${pon_database}
|
||
|
||
|
||
|
||
################################################################
|
||
## step2: pipeline for analysis ##
|
||
################################################################
|
||
|
||
# Update with the location of the Sentieon software package and license file
|
||
export SENTIEON_LICENSE=${SENTIEON_LICENSE}
|
||
|
||
|
||
threads=${THREAD}
|
||
|
||
cd $outDir
|
||
echo "$sampleID -- beginning at: $(date)"
|
||
|
||
$python3 ${checkUMI} -f $fq1 -s ${sampleType}
|
||
################# fastp ########################
|
||
# fastp is used for remove lower quality bases
|
||
echo "fastp"
|
||
echo "$(date)"
|
||
$fastp -w ${threads} -i $fq1 -o ${sampleID}.R1_001.trim.fq.gz -I $fq2 -O ${sampleID}.R2_001.trim.fq.gz -l 35 -c --detect_adapter_for_pe -j ${sampleID}.jion -h ${sampleID}.html
|
||
|
||
|
||
################# UMI + alignment + dedup ########################
|
||
# extract umi info and alignment to reference
|
||
if [[ ${bedType} =~ ^(${umi_panel})$ ]];
|
||
then
|
||
# remove UMI from following panels
|
||
echo $'\nsentieon umi extract'
|
||
echo "$(date)"
|
||
$sentieon umi extract 21S+T,12M11S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
|
||
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
|
||
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
|
||
|
||
echo $'\nsentieon umi consensus'
|
||
echo "$(date)"
|
||
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
|
||
|
||
echo $'\nbwa'
|
||
echo "$(date)"
|
||
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
|
||
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
|
||
|
||
elif [[ ${bedType} =~ (${probe_panel}) && ${sampleType} =~ (ctdna) ]];
|
||
then
|
||
# remove UMI from following panels
|
||
echo $'\nsentieon umi'
|
||
echo "$(date)"
|
||
$sentieon umi extract -d 6M1S+T,6M1S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
|
||
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
|
||
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
|
||
|
||
echo $'\nsentieon umi consensus'
|
||
echo "$(date)"
|
||
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
|
||
|
||
echo $'\nbwa'
|
||
echo "$(date)"
|
||
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
|
||
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
|
||
|
||
else
|
||
# following panels does not contain UMI
|
||
echo $'\nbwa'
|
||
echo "$(date)"
|
||
( $sentieon bwa mem -t ${threads} -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz || echo -n 'error' ) | \
|
||
$sentieon util sort -o ${sampleID}.sorted.bam -t ${threads} --sam2bam -i -
|
||
|
||
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
|
||
echo $'\nRemove Duplicate'
|
||
echo "$(date)"
|
||
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo LocusCollector --fun score_info ${sampleID}.score.txt
|
||
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo Dedup --score_info ${sampleID}.score.txt --metrics ${sampleID}.dedup_metrics.txt ${sampleID}.sorted.dedup.bam
|
||
rm ${sampleID}.score.txt* ${sampleID}.dedup_metrics.txt
|
||
fi
|
||
fi
|
||
|
||
|
||
################## Variant Calling ######################
|
||
# TNscope is used for somatic SNVs & small INDELs calling
|
||
if [[ ${bedType} =~ ^(${pcr_panel})$ ]]; then
|
||
|
||
echo $'\nHaplotyper'
|
||
echo "$(date)"
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
||
--interval ${bedPath}/${bedType}.bed \
|
||
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
||
${sampleID}.germline.raw.vcf
|
||
|
||
echo $'\nTNscope'
|
||
echo "$(date)"
|
||
|
||
if [[ ${bedType} =~ (AT|CA) ]];
|
||
then
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
||
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
||
--algo TNscope \
|
||
--tumor_sample ${sampleID} \
|
||
--dbsnp ${dbsnp138vcf} \
|
||
--disable_detector sv \
|
||
--max_fisher_pv_active 0.05 \
|
||
--min_tumor_allele_frac 0.009 \
|
||
--sv_mask_ext 10 \
|
||
--prune_factor 0 \
|
||
--assemble_mode 2 \
|
||
--resample_depth 100000 \
|
||
--pon ${pon} \
|
||
${sampleID}.output_tnscope.pre_filter.vcf
|
||
else
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
||
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
||
--algo TNscope \
|
||
--tumor_sample ${sampleID} \
|
||
--dbsnp ${dbsnp138vcf} \
|
||
--disable_detector sv \
|
||
--max_fisher_pv_active 0.05 \
|
||
--min_tumor_allele_frac 0.009 \
|
||
--sv_mask_ext 10 \
|
||
--prune_factor 0 \
|
||
--assemble_mode 2 \
|
||
--resample_depth 100000 \
|
||
--trim_primer ${bedPath}/${bedType}.amplicon.bed \
|
||
--pon ${pon} \
|
||
${sampleID}.output_tnscope.pre_filter.vcf
|
||
fi
|
||
|
||
echo $'\nbcftools filter'
|
||
echo "$(date)"
|
||
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
||
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
|
||
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
|
||
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
|
||
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
||
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
|
||
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
||
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
||
|
||
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
|
||
|
||
echo $'\nHaplotyper'
|
||
echo "$(date)"
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
||
--interval ${bedPath}/${bedType}.bed \
|
||
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
||
${sampleID}.germline.raw.vcf
|
||
|
||
echo $'\nTNscope'
|
||
echo "$(date)"
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
||
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
||
--algo TNscope \
|
||
--tumor_sample ${sampleID} \
|
||
--dbsnp ${dbsnp138vcf} \
|
||
--disable_detector sv \
|
||
--max_fisher_pv_active 0.05 \
|
||
--min_tumor_allele_frac 0.005 \
|
||
--filter_t_alt_frac 0.005 \
|
||
--resample_depth 100000 \
|
||
--assemble_mode 4 \
|
||
--pon ${pon} \
|
||
${sampleID}.output_tnscope.pre_filter.vcf
|
||
|
||
echo $'\nbcftools filter'
|
||
echo "$(date)"
|
||
echo $'\nbcftools filter'
|
||
echo "$(date)"
|
||
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
||
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
|
||
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
|
||
$bcftools filter -m + -s "orientation_bias" -e "FMT/FOXOG[0] == 1" | \
|
||
$bcftools filter -m + -s "strand_bias" -e "SOR > 3" | \
|
||
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
|
||
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
||
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
|
||
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -8" | \
|
||
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
||
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
||
|
||
else
|
||
echo $'\nHaplotyper'
|
||
echo "$(date)"
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
||
--interval ${bedPath}/${bedType}.bed \
|
||
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
||
${sampleID}.germline.raw.vcf
|
||
|
||
echo $'\nTNscope'
|
||
echo "$(date)"
|
||
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam --interval ${bedPath}/${bedType}.bed \
|
||
--algo TNscope \
|
||
--tumor_sample ${sampleID} \
|
||
--dbsnp ${dbsnp138vcf} \
|
||
--pcr_indel_model NONE \
|
||
--disable_detector sv \
|
||
--min_tumor_allele_frac 0.0003 \
|
||
--filter_t_alt_frac 0.0003 \
|
||
--max_error_per_read 3 \
|
||
--min_init_tumor_lod 3.0 \
|
||
--min_tumor_lod 3.0 \
|
||
--min_base_qual 40 \
|
||
--min_base_qual_asm 40 \
|
||
--resample_depth 100000 \
|
||
--assemble_mode 4 \
|
||
--pon ${pon} \
|
||
${sampleID}.output_tnscope.pre_filter.vcf
|
||
|
||
echo $'\nbcftools filter'
|
||
echo "$(date)"
|
||
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
||
$bcftools filter -m + -s "low_qual" -e "QUAL < 10" | \
|
||
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
||
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -5" | \
|
||
$bcftools filter -m + -s "base_qual_bias" -e "FMT/BaseQRankSumPS[0] < -5" | \
|
||
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
||
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
||
fi
|
||
|
||
|
||
echo $'\nfilter repeat regions'
|
||
echo "$(date)"
|
||
grep "#" ${sampleID}.somatic.raw.vcf > ${sampleID}.somatic.filter.vcf
|
||
grep -v "#" ${sampleID}.somatic.raw.vcf | awk '{len=split($5,a,",");if(len<3 || (len>=3 && $7 !~ "panel_of_normals")) print $0}' >> ${sampleID}.somatic.filter.vcf
|
||
|
||
|
||
################# 1p19q ########################
|
||
|
||
if [[ ${bedType} =~ (AT) ]];
|
||
then
|
||
$samtools mpileup -l ${bedPath}/AT.force.bed -f ${hg19fa} -d 30000 -Q 0 -O -s ${sampleID}.sorted.bam > ${sampleID}.mpileup.snp.txt
|
||
$python3 ${dig_pileup} ${sampleID}.mpileup.snp.txt -o ${sampleID}.mpileup.snp.vcf
|
||
$python3 ${p1q19_paint} ${sampleID}.mpileup.snp.vcf
|
||
rm ${sampleID}.mpileup.snp.txt
|
||
fi
|
||
|
||
################# bcftools ########################
|
||
# bcftools is used for retrieving mutations of interest, or mutations with higher confidence
|
||
|
||
echo $'\nbcftools'
|
||
echo "$(date)"
|
||
#germline
|
||
$bcftools norm -m -both ${sampleID}.germline.raw.vcf -o ${sampleID}.germline.raw.split.vcf
|
||
$bcftools norm -f ${hg19fa} ${sampleID}.germline.raw.split.vcf -o ${sampleID}.germline.split.norm.vcf
|
||
rm ${sampleID}.germline.raw.vcf* ${sampleID}.germline.raw.split.vcf
|
||
|
||
# #somatic
|
||
$bcftools norm -m -both ${sampleID}.somatic.filter.vcf -o ${sampleID}.somatic.raw.split.vcf
|
||
$bcftools norm -f ${hg19fa} ${sampleID}.somatic.raw.split.vcf -o ${sampleID}.somatic.split.norm.vcf
|
||
rm ${sampleID}.somatic.filter.vcf ${sampleID}.somatic.raw.split.vcf
|
||
|
||
|
||
###################### ANNOVAR ########################
|
||
# used for annotation
|
||
|
||
###################### germline ########################
|
||
echo $'\ngermline'
|
||
echo "$(date)"
|
||
mkdir anno
|
||
${annovar} ${sampleID}.germline.split.norm.vcf ${humandb} \
|
||
--outfile anno/${sampleID}.germline.anno --buildver hg19 \
|
||
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
|
||
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
|
||
rm anno/${sampleID}.germline.anno.avinput
|
||
|
||
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (${germline_panel2}) ]]); then
|
||
$python3 ${germline_filter} anno/${sampleID}.germline.anno.hg19_multianno.vcf -o anno/${sampleID}.germline.inital.filter.vcf > anno/${sampleID}.germline.filter_reason.txt
|
||
|
||
# vep注释(主要为HGVS)
|
||
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
||
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
||
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
||
--hgvs --hgvsg --transcript_version \
|
||
-i anno/${sampleID}.germline.inital.filter.vcf -o anno/${sampleID}.germline.vep_anno.vcf
|
||
rm anno/${sampleID}.germline.inital.filter.vcf
|
||
|
||
# HGVS命名注释
|
||
$python3 ${anno_hgvs} anno/${sampleID}.germline.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.germline.hgvs_anno.vcf
|
||
rm anno/${sampleID}.germline.vep_anno.vcf*
|
||
|
||
# VEP修复后假阳性删除
|
||
$python3 ${snv_second_filter} anno/${sampleID}.germline.hgvs_anno.vcf -k germline -o anno/${sampleID}.germline.final.filter.vcf >>anno/${sampleID}.germline.filter_reason.txt
|
||
fi
|
||
|
||
|
||
###################### somatic ########################
|
||
echo $'\nsomatic'
|
||
echo "$(date)"
|
||
|
||
# 过滤 nopass 突变
|
||
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.somatic.split.filter.vcf
|
||
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{len=split($7,a,";");split($NF,b,":");if(len<2 || (len>=2 && b[3] >=0.01)) print $0}' >> anno/${sampleID}.somatic.split.filter.vcf
|
||
|
||
# annovar注释
|
||
${annovar} anno/${sampleID}.somatic.split.filter.vcf ${humandb} \
|
||
--outfile anno/${sampleID}.somatic.annovar --buildver hg19 \
|
||
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
|
||
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
|
||
|
||
# vep注释(主要为HGVS)
|
||
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
||
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
||
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
||
--hgvs --hgvsg --transcript_version \
|
||
-i anno/${sampleID}.somatic.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.vep_anno.vcf
|
||
|
||
# HGVS命名注释
|
||
$python3 ${anno_hgvs} anno/${sampleID}.somatic.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.hgvs_anno.vcf
|
||
|
||
# 初步过滤(假阳性等)
|
||
$python3 ${snv_init_filter} anno/${sampleID}.somatic.hgvs_anno.vcf \
|
||
-o anno/${sampleID}.somatic.inital.filter.vcf -t ${sampleType} -m ${mode} > anno/${sampleID}.somatic.filter_reason.txt
|
||
|
||
# 合并同一单倍型上的复杂突变
|
||
$python3 ${merge_mnv} anno/${sampleID}.somatic.inital.filter.vcf $hg19fa -o anno/${sampleID}.somatic.merge_mnv.vcf
|
||
|
||
# 将合并后的mnp位置左移校准
|
||
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.somatic.merge_mnv.vcf -o anno/${sampleID}.somatic.merged_norm.vcf
|
||
|
||
# 拆分MNV突变
|
||
$python3 ${find_mnp} anno/${sampleID}.somatic.merged_norm.vcf
|
||
|
||
if [ -f "anno/${sampleID}.somatic.mnp.vcf" ];
|
||
then
|
||
# annovar注释MNV
|
||
${annovar} anno/${sampleID}.somatic.mnp.vcf ${humandb} \
|
||
--outfile anno/${sampleID}.somatic.mnp.annovar --buildver hg19 \
|
||
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
|
||
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
|
||
|
||
# vep注释MNV(主要为HGVS)
|
||
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
||
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
||
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
||
--hgvs --hgvsg --transcript_version \
|
||
-i anno/${sampleID}.somatic.mnp.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.mnp.vep_anno.vcf
|
||
|
||
# HGVS命名注释MNV
|
||
$python3 ${anno_hgvs} anno/${sampleID}.somatic.mnp.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
|
||
|
||
# 合并突变
|
||
$bgzip -f anno/${sampleID}.somatic.common.vcf
|
||
$tabix -p vcf anno/${sampleID}.somatic.common.vcf.gz
|
||
|
||
$bgzip -f anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
|
||
$tabix -p vcf anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz
|
||
|
||
$bcftools concat anno/${sampleID}.somatic.common.vcf.gz anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz -a -d all -o anno/${sampleID}.somatic.merge.vcf
|
||
|
||
else
|
||
mv anno/${sampleID}.somatic.merged_norm.vcf anno/${sampleID}.somatic.merge.vcf
|
||
fi
|
||
|
||
# VEP修复后假阳性及良性变异删除
|
||
$python3 ${snv_second_filter} anno/${sampleID}.somatic.merge.vcf -k somatic -o anno/${sampleID}.somatic.final.filter.vcf >>anno/${sampleID}.somatic.filter_reason.txt
|
||
|
||
# 统计变异删除的原因
|
||
$python3 ${get_filter_reason} . -k somatic
|
||
|
||
|
||
###################### tmb ########################
|
||
if [[ ${bedType} =~ ^(${tmb_panel})$ ]]; then
|
||
echo $'\nTMB'
|
||
echo "$(date)"
|
||
|
||
# 过滤低频假阳性
|
||
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.tmb.filter_1.vcf
|
||
if [[ ${sampleType} =~ (ctdna) ]]; then
|
||
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05 && a[3] <= 0.5) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
|
||
else
|
||
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
|
||
fi
|
||
|
||
# 过滤filter信息大于2的突变
|
||
grep "#" anno/${sampleID}.tmb.filter_1.vcf > anno/${sampleID}.tmb.filter_2.vcf
|
||
grep -v "#" anno/${sampleID}.tmb.filter_1.vcf | awk '{len=split($7,a,";");if(len<2) print $0}' >> anno/${sampleID}.tmb.filter_2.vcf
|
||
|
||
# 合并同一单倍型上的复杂突变
|
||
$python3 ${merge_mnv} anno/${sampleID}.tmb.filter_2.vcf $hg19fa -o anno/${sampleID}.tmb.merged.vcf
|
||
|
||
# 将合并后的mnp位置左移校准
|
||
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.tmb.merged.vcf -o anno/${sampleID}.tmb.merged_norm.vcf
|
||
rm anno/${sampleID}.tmb.merged.vcf
|
||
|
||
# annovar 注释相关数据库
|
||
${annovar} anno/${sampleID}.tmb.merged_norm.vcf ${humandb} \
|
||
--outfile anno/${sampleID}.tmb.annovar --buildver hg19 \
|
||
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
|
||
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
|
||
rm anno/${sampleID}.tmb.merged_norm.vcf
|
||
|
||
# vep注释(主要为HGVS)
|
||
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
||
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
||
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
||
--hgvs --hgvsg --transcript_version \
|
||
-i anno/${sampleID}.tmb.annovar.hg19_multianno.vcf -o anno/${sampleID}.tmb.vep_anno.vcf
|
||
rm anno/${sampleID}.tmb.annovar*
|
||
|
||
# HGVS命名注释
|
||
$python3 ${anno_hgvs} anno/${sampleID}.tmb.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.tmb.hgvs_anno.vcf
|
||
rm anno/${sampleID}.tmb.vep_anno.vcf*
|
||
|
||
# 过滤其他假阳性,germline等
|
||
$python3 ${tmb_filter} anno/${sampleID}.tmb.hgvs_anno.vcf -t ${sampleType} -o anno/${sampleID}.tmb.final.filter.vcf -m ${mode} > anno/${sampleID}.tmb.filter_reason.txt
|
||
rm anno/${sampleID}.tmb.hgvs_anno.vcf
|
||
|
||
# 统计变异删除的原因
|
||
$python3 ${get_filter_reason} . -k tmb
|
||
rm anno/${sampleID}.tmb.filter_1.vcf anno/${sampleID}.tmb.filter_2.vcf
|
||
fi
|
||
rm ${sampleID}.somatic.split.norm.vcf
|
||
|
||
|
||
###################### concat vcf ########################
|
||
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
|
||
echo $'\nmerge_vcf'
|
||
echo "$(date)"
|
||
|
||
cp anno/${sampleID}.germline.final.filter.vcf anno/${sampleID}.germline.final.filter.concat.vcf
|
||
$bgzip -f anno/${sampleID}.germline.final.filter.concat.vcf
|
||
$tabix -p vcf anno/${sampleID}.germline.final.filter.concat.vcf.gz
|
||
|
||
cp anno/${sampleID}.somatic.final.filter.vcf anno/${sampleID}.somatic.final.filter.concat.vcf
|
||
$bgzip -f anno/${sampleID}.somatic.final.filter.concat.vcf
|
||
$tabix -p vcf anno/${sampleID}.somatic.final.filter.concat.vcf.gz
|
||
|
||
if [[ "$bedType" =~ (${tmb_panel}) ]];then
|
||
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb.final.filter.concat.vcf
|
||
$bgzip -f anno/${sampleID}.tmb.final.filter.concat.vcf
|
||
$tabix -p vcf anno/${sampleID}.tmb.final.filter.concat.vcf.gz
|
||
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz anno/${sampleID}.tmb.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
|
||
rm anno/${sampleID}.tmb.final.filter.concat.vcf.gz*
|
||
else
|
||
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
|
||
fi
|
||
rm anno/${sampleID}.somatic.final.filter.concat.vcf.gz* anno/${sampleID}.germline.final.filter.concat.vcf.gz*
|
||
fi
|
||
|
||
###################### anno drugs ########################
|
||
echo $'\nanno_drugs'
|
||
echo "$(date)"
|
||
|
||
# 注释药物相关数据库
|
||
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
|
||
$python3 ${snv_out} anno/${sampleID}.somatic.concat.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
||
|
||
elif [[ "$bedType" =~ (${germline_panel2}) && "$sampleType" =~ (blood|dna) ]];then
|
||
$python3 ${snv_out} anno/${sampleID}.germline.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
||
|
||
else
|
||
$python3 ${snv_out} anno/${sampleID}.somatic.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
||
fi
|
||
|
||
###################### add_pipe_tag ######################
|
||
# 添加流程pipe tag
|
||
$python3 ${add_pipe_tag} anno/${sampleID}.somatic.xlsx
|
||
|
||
#################### sanger_determine ####################
|
||
# 确定germline突变是否需要验证
|
||
$python3 ${sanger_determine} anno/${sampleID}.somatic.xlsx
|
||
|
||
#################### cis_trans ####################
|
||
# 确定T790M和G797S共突变的顺式或反式
|
||
$python3 ${cis_trans} anno/${sampleID}.somatic.xlsx
|
||
|
||
###################### core_filter ########################
|
||
if [[ "$bedType" =~ (${core_panel}) ]];then
|
||
$python3 ${core_filter} anno/${sampleID}.somatic.xlsx
|
||
fi
|
||
|
||
###################### qc_bamdst ########################
|
||
echo $'\nqc_bamdst'
|
||
echo "$(date)"
|
||
|
||
if [[ ${bedType} =~ ^(${pcr_panel})$ || ${bedType} =~ ^(${umi_panel})$ ]]; then
|
||
mkdir -p qc/bamdst
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst ${sampleID}.sorted.bam
|
||
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
|
||
|
||
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
|
||
mkdir -p qc/bamdst
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst ${sampleID}.sorted.dedup.bam
|
||
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
|
||
|
||
else
|
||
mkdir -p qc/bamdst/raw
|
||
if [[ "$sampleType" =~ (ctdna) ]]; then
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/raw ${sampleID}.sorted.bam
|
||
else
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/raw ${sampleID}.sorted.bam
|
||
fi
|
||
ls qc/bamdst/raw/ | xargs -I{} mv qc/bamdst/raw/{} qc/bamdst/raw/${sampleID}.raw.{}
|
||
|
||
mkdir -p qc/bamdst/dedup
|
||
if [[ "$sampleType" =~ (ctdna) ]]; then
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
|
||
else
|
||
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
|
||
fi
|
||
ls qc/bamdst/dedup | xargs -I{} mv qc/bamdst/dedup/{} qc/bamdst/dedup/${sampleID}.dedup.{}
|
||
fi
|
||
|
||
|
||
###################### chemical ########################
|
||
|
||
echo $'\nchemical'
|
||
echo "$(date)"
|
||
|
||
if [[ ${bedType} =~ ^(DA)$ ]]; then
|
||
$samtools bedcov ${bedPath}/chemical_DA.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
|
||
else
|
||
$samtools bedcov ${bedPath}/chemical_DF.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
|
||
fi
|
||
|
||
if [[ "$bedType" =~ (${chemical_panel}) ]]; then
|
||
$python3 ${germline_chemical} anno/${sampleID}.germline.anno.hg19_multianno.txt >anno/${sampleID}.chemical.txt
|
||
$python3 ${check_sample_data} anno/${sampleID}.chemical.txt
|
||
rm ${sampleID}.chemical.depth.txt anno/${sampleID}.chemical.txt
|
||
fi
|
||
|
||
|
||
###################### Copy Number Aberration ######################
|
||
### hybrid capture: DG|DS|DH|DT|DL|DW|DR|DJ|DF|DQ|DE
|
||
### targeted amplicon seq: DP|HH|HT|HW|HR|HC|HF|ZJ|ZG|YW|QJ|YR|YZ|YW|YF
|
||
### 需要做cnv的panel:DF, DG, DJ, DQ, DR, DS, DW, DE
|
||
|
||
##################### CNVkit ########################
|
||
if [[ ${bedType} =~ ^(${cnv_panel1})$ ]];
|
||
then
|
||
echo "CNA starts at -- `date` "
|
||
|
||
if [[ ${bedType} =~ ^(${cnv_panel2})$ ]];
|
||
then
|
||
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
||
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
|
||
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
||
|
||
elif [[ ${bedType} =~ ^(${cnv_panel3})$ ]];
|
||
then
|
||
$python3 $cnvkit coverage ${sampleID}.sorted.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
||
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn --no-edge -o ${sampleID}.cnr
|
||
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
||
|
||
else
|
||
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
||
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_anti}/${bedType}.antitarget.bed -p ${threads} -o ${sampleID}.antitargetcoverage.cnn
|
||
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${sampleID}.antitargetcoverage.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
|
||
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
||
fi
|
||
|
||
$python3 $cnvkit call ${sampleID}.cns --center --drop-low-coverage -o ${sampleID}.call.cns
|
||
$python3 $cnvkit export vcf ${sampleID}.call.cns -i "${sampleID}" -o ${sampleID}.cnv.vcf
|
||
$python3 $annotate_vcf -i ${sampleID}.cnv.vcf -r $refFlat -o ${sampleID}.annotated.cnv.csv
|
||
|
||
$python3 ${cna_out} ${sampleID}.annotated.cnv.csv
|
||
|
||
##################### THeTA ########################
|
||
# THeTA -- calculate sample purity (tumour/normal ratio)
|
||
$python3 $cnvkit export theta ${sampleID}.cns -r ${cnv_ref}/${bedType}.flat.cnn
|
||
${theta} ${sampleID}.interval_count --NUM_PROCESSES ${threads}
|
||
|
||
|
||
echo "CNA analysis completed at: `date` "
|
||
rm ${sampleID}.*coverage.cnn ${sampleID}.cnr ${sampleID}.cnv.vcf ${sampleID}.cns ${sampleID}.call.cns ${sampleID}.interval_count ${sampleID}.n2.graph.pdf ${sampleID}.n2.withBounds
|
||
fi
|
||
|
||
|
||
################### Microsatelite instability ######################
|
||
# ###################### MSIsensor2 ######################
|
||
# # Detect msi on below panel using msisensor
|
||
|
||
if [[ ${bedType} =~ ^(${msi_panel})$ ]];
|
||
then
|
||
echo $'\nmsisensor2'
|
||
echo "$(date)"
|
||
$msisensor2 msi -M ${msi_models} -t ${sampleID}.sorted.dedup.bam -o ${sampleID}.prefix
|
||
fi
|
||
|
||
|
||
########################## Fusion of genes ########################
|
||
##################### genefuse ########################
|
||
# Detect potential gene fusion on below panels.
|
||
if [[ ${bedType} =~ ^(${fusion_panel})$ ]]; then
|
||
|
||
echo $'\ngenefuse'
|
||
echo "$(date)"
|
||
$python3 ${make_fusion_gene} ${sampleID}
|
||
if [[ "$sampleType" =~ (ctdna) ]]; then
|
||
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.umi_consensus.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
|
||
rm ${sampleID}.umi_consensus.fq.gz
|
||
else
|
||
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.R1_001.trim.fq.gz -2 ${sampleID}.R2_001.trim.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
|
||
fi
|
||
|
||
echo $'\ndelly'
|
||
echo "$(date)"
|
||
$delly call -x ${delly_low_regions} -g ${hg19fa} -o ${sampleID}.bcf ${sampleID}.sorted.dedup.bam
|
||
$bcftools view ${sampleID}.bcf > ${sampleID}.sv.raw.vcf
|
||
rm ${sampleID}.bcf*
|
||
$python3 $delly_sv_filter ${sampleID}.sv.raw.vcf
|
||
rm ${sampleID}.delly*
|
||
|
||
$python3 $fusion_filter ${sampleID}.fusion.json -o ${sampleID}.fusion_filtered.json
|
||
$python3 ${fusion_out} ${sampleID}.fusion_filtered.json
|
||
$python3 ${check_fusion_result} ${sampleID}.fusion.xlsx
|
||
rm ${sampleID}.fusion.txt
|
||
fi
|
||
|
||
|
||
########################## Quality control #########################
|
||
##################### Fastp ########################
|
||
# This step is used for quality control by fastqc.
|
||
|
||
mkdir -p qc/fastp
|
||
echo $'\nQC_fastp'
|
||
echo "$(date)"
|
||
mv ${sampleID}.jion ${sampleID}.html qc/fastp/
|
||
|
||
##################### samtools ########################
|
||
# This step is used for quality control by samtools.
|
||
|
||
mkdir -p qc/samtools
|
||
echo $'\nQC_samtools'
|
||
echo "$(date)"
|
||
$samtools stats -@ ${threads} ${sampleID}.sorted.bam >qc/samtools/${sampleID}.sorted.bam.stats.txt
|
||
|
||
|
||
##################### hla_type_neo_predict ########################
|
||
|
||
# This step is used for hla_type_neo_predict by hlahd and MuPeXI.
|
||
if [[ "$bedType" =~ ^(${hla_panel})$ ]]; then
|
||
echo $'\nHLA'
|
||
echo "$(date)"
|
||
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb_copy.vcf
|
||
grep '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.header
|
||
grep -v '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.body
|
||
|
||
gunzip -k ${sampleID}.R1_001.trim.fq.gz
|
||
gunzip -k ${sampleID}.R2_001.trim.fq.gz
|
||
# get 300w reads, about 1GB to to hla typing
|
||
$seqtk sample -s100 ${sampleID}.R1_001.trim.fq 3000000 > ${sampleID}.R1_001.trim.sub.fq
|
||
$seqtk sample -s100 ${sampleID}.R2_001.trim.fq 3000000 > ${sampleID}.R2_001.trim.sub.fq
|
||
|
||
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -results_TMB_txt anno/${sampleID}.body -fq1 ${sampleID}.R1_001.trim.sub.fq -fq2 ${sampleID}.R2_001.trim.sub.fq
|
||
|
||
# 拼接vcf
|
||
cat anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt > anno/${sampleID}.mupexi.vcf
|
||
|
||
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -vcf anno/${sampleID}.mupexi.vcf -determine_tnb anno/${sampleID}.body
|
||
rm ${sampleID}.R*_001.trim.fq ${sampleID}.R*_001.trim.sub.fq \
|
||
anno/${sampleID}.body anno/${sampleID}.tmb_copy.vcf anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt \
|
||
neo/${sampleID}_coverage_plot.pdf neo/${sampleID}.fasta neo/${sampleID}.g \
|
||
neo/${sampleID}_netMHCpan* neo/${sampleID}_pepmatch_* neo/${sampleID}_peptide_netMHCinput.txt neo/${sampleID}_reference_peptide_* \
|
||
neo/${sampleID}_vep_compatible_vcf.vcf neo/${sampleID}_vep.vep
|
||
fi
|
||
|
||
|
||
########################## merge_data #########################
|
||
$python3 ${single_sample_qc} . ${sampleID} ${sampleType}
|
||
$python3 ${parser_single_sample_data} . ${sampleID}
|
||
$python3 ${collect_single_sample_data} . ${sampleID}
|
||
|
||
|
||
rm ${sampleID}.R*_001.trim.fq.gz
|
||
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
|
||
rm ${sampleID}.sorted.bam*
|
||
fi
|
||
|
||
echo "$sampleID -- ending at: $(date)"
|
||
echo "pipeline finished!" >finish.txt
|