715 lines
30 KiB
Bash
715 lines
30 KiB
Bash
|
|
#!/usr/bin/bash
|
|||
|
|
|
|||
|
|
################################################################
|
|||
|
|
## step1: recording all version informations ##
|
|||
|
|
################################################################
|
|||
|
|
|
|||
|
|
################# pipeline_version ########################
|
|||
|
|
# pipeline: ${pipeline_version}
|
|||
|
|
|
|||
|
|
#################### soft_version ########################
|
|||
|
|
# fastp: ${fastp_version}
|
|||
|
|
# bwa: ${bwa_version}
|
|||
|
|
# samtools: ${samtools_version}
|
|||
|
|
# sentieon: ${sentieon_version}
|
|||
|
|
# bcftools: ${bcftools_version}
|
|||
|
|
# annovar: ${annovar_version}
|
|||
|
|
# vep: ${vep_version}
|
|||
|
|
# msisensor2: ${msisensor2_version}
|
|||
|
|
# genefuse: ${genefuse_version}
|
|||
|
|
# cnvkit: ${cnvkit_version}
|
|||
|
|
# bamdst: ${bamdst_version}
|
|||
|
|
# seqtk: ${seqtk_version}
|
|||
|
|
# htslib: ${htslib_version}
|
|||
|
|
# python2: ${python2_version}
|
|||
|
|
# python3: ${python3_version}
|
|||
|
|
# delly: ${delly_version}
|
|||
|
|
|
|||
|
|
#################### bed_version ########################
|
|||
|
|
# bed: ${bed_version}
|
|||
|
|
|
|||
|
|
################# database_version ########################
|
|||
|
|
# reference: ${reference_version}
|
|||
|
|
# refgene: ${refgene_version}
|
|||
|
|
# cytoBand: ${cytoBand_version}
|
|||
|
|
# 1000g: ${g1000_version}
|
|||
|
|
# esp: ${esp_version}
|
|||
|
|
# exac: ${exac_version}
|
|||
|
|
# cosmic: ${cosmic_version}
|
|||
|
|
# clinvar: ${clinvar_version}
|
|||
|
|
# intervar: ${intervar_version}
|
|||
|
|
# dbnsfp: ${dbnsfp_version}
|
|||
|
|
# avsnp: ${avsnp_version}
|
|||
|
|
# private_database: ${private_database}
|
|||
|
|
# pon_database: ${pon_database}
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
################################################################
|
|||
|
|
## step2: pipeline for analysis ##
|
|||
|
|
################################################################
|
|||
|
|
|
|||
|
|
# Update with the location of the Sentieon software package and license file
|
|||
|
|
export SENTIEON_LICENSE=${SENTIEON_LICENSE}
|
|||
|
|
|
|||
|
|
|
|||
|
|
threads=${THREAD}
|
|||
|
|
|
|||
|
|
cd $outDir
|
|||
|
|
echo "$sampleID -- beginning at: $(date)"
|
|||
|
|
|
|||
|
|
$python3 ${checkUMI} -f $fq1 -s ${sampleType}
|
|||
|
|
################# fastp ########################
|
|||
|
|
# fastp is used for remove lower quality bases
|
|||
|
|
echo "fastp"
|
|||
|
|
echo "$(date)"
|
|||
|
|
$fastp -w ${threads} -i $fq1 -o ${sampleID}.R1_001.trim.fq.gz -I $fq2 -O ${sampleID}.R2_001.trim.fq.gz -l 35 -c --detect_adapter_for_pe -j ${sampleID}.jion -h ${sampleID}.html
|
|||
|
|
|
|||
|
|
|
|||
|
|
################# UMI + alignment + dedup ########################
|
|||
|
|
# extract umi info and alignment to reference
|
|||
|
|
if [[ ${bedType} =~ ^(${umi_panel})$ ]];
|
|||
|
|
then
|
|||
|
|
# remove UMI from following panels
|
|||
|
|
echo $'\nsentieon umi extract'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon umi extract 21S+T,12M11S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
|
|||
|
|
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
|
|||
|
|
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
|
|||
|
|
|
|||
|
|
echo $'\nsentieon umi consensus'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
|
|||
|
|
|
|||
|
|
echo $'\nbwa'
|
|||
|
|
echo "$(date)"
|
|||
|
|
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
|
|||
|
|
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
|
|||
|
|
|
|||
|
|
elif [[ ${bedType} =~ (${probe_panel}) && ${sampleType} =~ (ctdna) ]];
|
|||
|
|
then
|
|||
|
|
# remove UMI from following panels
|
|||
|
|
echo $'\nsentieon umi'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon umi extract -d 6M1S+T,6M1S+T ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz | \
|
|||
|
|
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} - || echo -n 'error') | \
|
|||
|
|
$sentieon util sort -t ${threads} --sam2bam -i - -o ${sampleID}.sorted.bam
|
|||
|
|
|
|||
|
|
echo $'\nsentieon umi consensus'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon umi consensus -i ${sampleID}.sorted.bam --input_format BAM -o ${sampleID}.umi_consensus.fq.gz
|
|||
|
|
|
|||
|
|
echo $'\nbwa'
|
|||
|
|
echo "$(date)"
|
|||
|
|
( $sentieon bwa mem -t ${threads} -p -C -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.umi_consensus.fq.gz || echo -n 'error' ) | \
|
|||
|
|
$sentieon util sort --umi_post_process --sam2bam -i - -o ${sampleID}.sorted.dedup.bam
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
# following panels does not contain UMI
|
|||
|
|
echo $'\nbwa'
|
|||
|
|
echo "$(date)"
|
|||
|
|
( $sentieon bwa mem -t ${threads} -M -R "@RG\\tID:${sampleID}\\tSM:${sampleID}\\tPL:ILLUMINA\\tLB:lib" -K 10000000 ${hg19fa} ${sampleID}.R1_001.trim.fq.gz ${sampleID}.R2_001.trim.fq.gz || echo -n 'error' ) | \
|
|||
|
|
$sentieon util sort -o ${sampleID}.sorted.bam -t ${threads} --sam2bam -i -
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
|
|||
|
|
echo $'\nRemove Duplicate'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo LocusCollector --fun score_info ${sampleID}.score.txt
|
|||
|
|
$sentieon driver -t ${threads} -i ${sampleID}.sorted.bam --algo Dedup --score_info ${sampleID}.score.txt --metrics ${sampleID}.dedup_metrics.txt ${sampleID}.sorted.dedup.bam
|
|||
|
|
rm ${sampleID}.score.txt* ${sampleID}.dedup_metrics.txt
|
|||
|
|
fi
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
################## Variant Calling ######################
|
|||
|
|
# TNscope is used for somatic SNVs & small INDELs calling
|
|||
|
|
if [[ ${bedType} =~ ^(${pcr_panel})$ ]]; then
|
|||
|
|
|
|||
|
|
echo $'\nHaplotyper'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed \
|
|||
|
|
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
|||
|
|
${sampleID}.germline.raw.vcf
|
|||
|
|
|
|||
|
|
echo $'\nTNscope'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ (AT|CA) ]];
|
|||
|
|
then
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
|||
|
|
--algo TNscope \
|
|||
|
|
--tumor_sample ${sampleID} \
|
|||
|
|
--dbsnp ${dbsnp138vcf} \
|
|||
|
|
--disable_detector sv \
|
|||
|
|
--max_fisher_pv_active 0.05 \
|
|||
|
|
--min_tumor_allele_frac 0.009 \
|
|||
|
|
--sv_mask_ext 10 \
|
|||
|
|
--prune_factor 0 \
|
|||
|
|
--assemble_mode 2 \
|
|||
|
|
--resample_depth 100000 \
|
|||
|
|
--pon ${pon} \
|
|||
|
|
${sampleID}.output_tnscope.pre_filter.vcf
|
|||
|
|
else
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
|||
|
|
--algo TNscope \
|
|||
|
|
--tumor_sample ${sampleID} \
|
|||
|
|
--dbsnp ${dbsnp138vcf} \
|
|||
|
|
--disable_detector sv \
|
|||
|
|
--max_fisher_pv_active 0.05 \
|
|||
|
|
--min_tumor_allele_frac 0.009 \
|
|||
|
|
--sv_mask_ext 10 \
|
|||
|
|
--prune_factor 0 \
|
|||
|
|
--assemble_mode 2 \
|
|||
|
|
--resample_depth 100000 \
|
|||
|
|
--trim_primer ${bedPath}/${bedType}.amplicon.bed \
|
|||
|
|
--pon ${pon} \
|
|||
|
|
${sampleID}.output_tnscope.pre_filter.vcf
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
echo $'\nbcftools filter'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
|||
|
|
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
|
|||
|
|
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
|
|||
|
|
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
|
|||
|
|
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
|||
|
|
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
|
|||
|
|
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
|||
|
|
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
|||
|
|
|
|||
|
|
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
|
|||
|
|
|
|||
|
|
echo $'\nHaplotyper'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed \
|
|||
|
|
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
|||
|
|
${sampleID}.germline.raw.vcf
|
|||
|
|
|
|||
|
|
echo $'\nTNscope'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed --interval_padding 10 \
|
|||
|
|
--algo TNscope \
|
|||
|
|
--tumor_sample ${sampleID} \
|
|||
|
|
--dbsnp ${dbsnp138vcf} \
|
|||
|
|
--disable_detector sv \
|
|||
|
|
--max_fisher_pv_active 0.05 \
|
|||
|
|
--min_tumor_allele_frac 0.005 \
|
|||
|
|
--filter_t_alt_frac 0.005 \
|
|||
|
|
--resample_depth 100000 \
|
|||
|
|
--assemble_mode 4 \
|
|||
|
|
--pon ${pon} \
|
|||
|
|
${sampleID}.output_tnscope.pre_filter.vcf
|
|||
|
|
|
|||
|
|
echo $'\nbcftools filter'
|
|||
|
|
echo "$(date)"
|
|||
|
|
echo $'\nbcftools filter'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
|||
|
|
$bcftools filter -m + -s "insignificant" -e "(PV>0.25 && PV2>0.25)" | \
|
|||
|
|
$bcftools filter -m + -s "insignificant" -e "(INFO/STR == 1 && PV>0.05)" | \
|
|||
|
|
$bcftools filter -m + -s "orientation_bias" -e "FMT/FOXOG[0] == 1" | \
|
|||
|
|
$bcftools filter -m + -s "strand_bias" -e "SOR > 3" | \
|
|||
|
|
$bcftools filter -m + -s "low_qual" -e "QUAL < 20" | \
|
|||
|
|
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
|||
|
|
$bcftools filter -m + -s "noisy_region" -e "ECNT>5" | \
|
|||
|
|
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -8" | \
|
|||
|
|
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
|||
|
|
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
echo $'\nHaplotyper'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam \
|
|||
|
|
--interval ${bedPath}/${bedType}.bed \
|
|||
|
|
--algo Haplotyper --emit_conf=10 --call_conf=10 \
|
|||
|
|
${sampleID}.germline.raw.vcf
|
|||
|
|
|
|||
|
|
echo $'\nTNscope'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$sentieon driver -r ${hg19fa} -t ${threads} -i ${sampleID}.sorted.dedup.bam --interval ${bedPath}/${bedType}.bed \
|
|||
|
|
--algo TNscope \
|
|||
|
|
--tumor_sample ${sampleID} \
|
|||
|
|
--dbsnp ${dbsnp138vcf} \
|
|||
|
|
--pcr_indel_model NONE \
|
|||
|
|
--disable_detector sv \
|
|||
|
|
--min_tumor_allele_frac 0.0003 \
|
|||
|
|
--filter_t_alt_frac 0.0003 \
|
|||
|
|
--max_error_per_read 3 \
|
|||
|
|
--min_init_tumor_lod 3.0 \
|
|||
|
|
--min_tumor_lod 3.0 \
|
|||
|
|
--min_base_qual 40 \
|
|||
|
|
--min_base_qual_asm 40 \
|
|||
|
|
--resample_depth 100000 \
|
|||
|
|
--assemble_mode 4 \
|
|||
|
|
--pon ${pon} \
|
|||
|
|
${sampleID}.output_tnscope.pre_filter.vcf
|
|||
|
|
|
|||
|
|
echo $'\nbcftools filter'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$bcftools annotate -x "FILTER/triallelic_site" ${sampleID}.output_tnscope.pre_filter.vcf | \
|
|||
|
|
$bcftools filter -m + -s "low_qual" -e "QUAL < 10" | \
|
|||
|
|
$bcftools filter -m + -s "short_tandem_repeat" -e "RPA[0]>=10" | \
|
|||
|
|
$bcftools filter -m + -s "read_pos_bias" -e "FMT/ReadPosRankSumPS[0] < -5" | \
|
|||
|
|
$bcftools filter -m + -s "base_qual_bias" -e "FMT/BaseQRankSumPS[0] < -5" | \
|
|||
|
|
$sentieon util vcfconvert - ${sampleID}.somatic.raw.vcf
|
|||
|
|
rm ${sampleID}.output_tnscope.pre_filter.vcf*
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
echo $'\nfilter repeat regions'
|
|||
|
|
echo "$(date)"
|
|||
|
|
grep "#" ${sampleID}.somatic.raw.vcf > ${sampleID}.somatic.filter.vcf
|
|||
|
|
grep -v "#" ${sampleID}.somatic.raw.vcf | awk '{len=split($5,a,",");if(len<3 || (len>=3 && $7 !~ "panel_of_normals")) print $0}' >> ${sampleID}.somatic.filter.vcf
|
|||
|
|
|
|||
|
|
|
|||
|
|
################# 1p19q ########################
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ (AT) ]];
|
|||
|
|
then
|
|||
|
|
$samtools mpileup -l ${bedPath}/AT.force.bed -f ${hg19fa} -d 30000 -Q 0 -O -s ${sampleID}.sorted.bam > ${sampleID}.mpileup.snp.txt
|
|||
|
|
$python3 ${dig_pileup} ${sampleID}.mpileup.snp.txt -o ${sampleID}.mpileup.snp.vcf
|
|||
|
|
$python3 ${p1q19_paint} ${sampleID}.mpileup.snp.vcf
|
|||
|
|
rm ${sampleID}.mpileup.snp.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
################# bcftools ########################
|
|||
|
|
# bcftools is used for retrieving mutations of interest, or mutations with higher confidence
|
|||
|
|
|
|||
|
|
echo $'\nbcftools'
|
|||
|
|
echo "$(date)"
|
|||
|
|
#germline
|
|||
|
|
$bcftools norm -m -both ${sampleID}.germline.raw.vcf -o ${sampleID}.germline.raw.split.vcf
|
|||
|
|
$bcftools norm -f ${hg19fa} ${sampleID}.germline.raw.split.vcf -o ${sampleID}.germline.split.norm.vcf
|
|||
|
|
rm ${sampleID}.germline.raw.vcf* ${sampleID}.germline.raw.split.vcf
|
|||
|
|
|
|||
|
|
# #somatic
|
|||
|
|
$bcftools norm -m -both ${sampleID}.somatic.filter.vcf -o ${sampleID}.somatic.raw.split.vcf
|
|||
|
|
$bcftools norm -f ${hg19fa} ${sampleID}.somatic.raw.split.vcf -o ${sampleID}.somatic.split.norm.vcf
|
|||
|
|
rm ${sampleID}.somatic.filter.vcf ${sampleID}.somatic.raw.split.vcf
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### ANNOVAR ########################
|
|||
|
|
# used for annotation
|
|||
|
|
|
|||
|
|
###################### germline ########################
|
|||
|
|
echo $'\ngermline'
|
|||
|
|
echo "$(date)"
|
|||
|
|
mkdir anno
|
|||
|
|
${annovar} ${sampleID}.germline.split.norm.vcf ${humandb} \
|
|||
|
|
--outfile anno/${sampleID}.germline.anno --buildver hg19 \
|
|||
|
|
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
|
|||
|
|
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
|
|||
|
|
rm anno/${sampleID}.germline.anno.avinput
|
|||
|
|
|
|||
|
|
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (${germline_panel2}) ]]); then
|
|||
|
|
$python3 ${germline_filter} anno/${sampleID}.germline.anno.hg19_multianno.vcf -o anno/${sampleID}.germline.inital.filter.vcf > anno/${sampleID}.germline.filter_reason.txt
|
|||
|
|
|
|||
|
|
# vep注释(主要为HGVS)
|
|||
|
|
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
|||
|
|
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
|||
|
|
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
|||
|
|
--hgvs --hgvsg --transcript_version \
|
|||
|
|
-i anno/${sampleID}.germline.inital.filter.vcf -o anno/${sampleID}.germline.vep_anno.vcf
|
|||
|
|
rm anno/${sampleID}.germline.inital.filter.vcf
|
|||
|
|
|
|||
|
|
# HGVS命名注释
|
|||
|
|
$python3 ${anno_hgvs} anno/${sampleID}.germline.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.germline.hgvs_anno.vcf
|
|||
|
|
rm anno/${sampleID}.germline.vep_anno.vcf*
|
|||
|
|
|
|||
|
|
# VEP修复后假阳性删除
|
|||
|
|
$python3 ${snv_second_filter} anno/${sampleID}.germline.hgvs_anno.vcf -k germline -o anno/${sampleID}.germline.final.filter.vcf >>anno/${sampleID}.germline.filter_reason.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### somatic ########################
|
|||
|
|
echo $'\nsomatic'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
# 过滤 nopass 突变
|
|||
|
|
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.somatic.split.filter.vcf
|
|||
|
|
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{len=split($7,a,";");split($NF,b,":");if(len<2 || (len>=2 && b[3] >=0.01)) print $0}' >> anno/${sampleID}.somatic.split.filter.vcf
|
|||
|
|
|
|||
|
|
# annovar注释
|
|||
|
|
${annovar} anno/${sampleID}.somatic.split.filter.vcf ${humandb} \
|
|||
|
|
--outfile anno/${sampleID}.somatic.annovar --buildver hg19 \
|
|||
|
|
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
|
|||
|
|
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
|
|||
|
|
|
|||
|
|
# vep注释(主要为HGVS)
|
|||
|
|
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
|||
|
|
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
|||
|
|
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
|||
|
|
--hgvs --hgvsg --transcript_version \
|
|||
|
|
-i anno/${sampleID}.somatic.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.vep_anno.vcf
|
|||
|
|
|
|||
|
|
# HGVS命名注释
|
|||
|
|
$python3 ${anno_hgvs} anno/${sampleID}.somatic.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.hgvs_anno.vcf
|
|||
|
|
|
|||
|
|
# 初步过滤(假阳性等)
|
|||
|
|
$python3 ${snv_init_filter} anno/${sampleID}.somatic.hgvs_anno.vcf \
|
|||
|
|
-o anno/${sampleID}.somatic.inital.filter.vcf -t ${sampleType} -m ${mode} > anno/${sampleID}.somatic.filter_reason.txt
|
|||
|
|
|
|||
|
|
# 合并同一单倍型上的复杂突变
|
|||
|
|
$python3 ${merge_mnv} anno/${sampleID}.somatic.inital.filter.vcf $hg19fa -o anno/${sampleID}.somatic.merge_mnv.vcf
|
|||
|
|
|
|||
|
|
# 将合并后的mnp位置左移校准
|
|||
|
|
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.somatic.merge_mnv.vcf -o anno/${sampleID}.somatic.merged_norm.vcf
|
|||
|
|
|
|||
|
|
# 拆分MNV突变
|
|||
|
|
$python3 ${find_mnp} anno/${sampleID}.somatic.merged_norm.vcf
|
|||
|
|
|
|||
|
|
if [ -f "anno/${sampleID}.somatic.mnp.vcf" ];
|
|||
|
|
then
|
|||
|
|
# annovar注释MNV
|
|||
|
|
${annovar} anno/${sampleID}.somatic.mnp.vcf ${humandb} \
|
|||
|
|
--outfile anno/${sampleID}.somatic.mnp.annovar --buildver hg19 \
|
|||
|
|
--protocol refGene,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150,dbscsnv11 \
|
|||
|
|
--operation g,f,f,f,f,f,f,f,f,f --vcfinput --remove
|
|||
|
|
|
|||
|
|
# vep注释MNV(主要为HGVS)
|
|||
|
|
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
|||
|
|
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
|||
|
|
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
|||
|
|
--hgvs --hgvsg --transcript_version \
|
|||
|
|
-i anno/${sampleID}.somatic.mnp.annovar.hg19_multianno.vcf -o anno/${sampleID}.somatic.mnp.vep_anno.vcf
|
|||
|
|
|
|||
|
|
# HGVS命名注释MNV
|
|||
|
|
$python3 ${anno_hgvs} anno/${sampleID}.somatic.mnp.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
|
|||
|
|
|
|||
|
|
# 合并突变
|
|||
|
|
$bgzip -f anno/${sampleID}.somatic.common.vcf
|
|||
|
|
$tabix -p vcf anno/${sampleID}.somatic.common.vcf.gz
|
|||
|
|
|
|||
|
|
$bgzip -f anno/${sampleID}.somatic.mnp.hgvs_anno.vcf
|
|||
|
|
$tabix -p vcf anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz
|
|||
|
|
|
|||
|
|
$bcftools concat anno/${sampleID}.somatic.common.vcf.gz anno/${sampleID}.somatic.mnp.hgvs_anno.vcf.gz -a -d all -o anno/${sampleID}.somatic.merge.vcf
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
mv anno/${sampleID}.somatic.merged_norm.vcf anno/${sampleID}.somatic.merge.vcf
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# VEP修复后假阳性及良性变异删除
|
|||
|
|
$python3 ${snv_second_filter} anno/${sampleID}.somatic.merge.vcf -k somatic -o anno/${sampleID}.somatic.final.filter.vcf >>anno/${sampleID}.somatic.filter_reason.txt
|
|||
|
|
|
|||
|
|
# 统计变异删除的原因
|
|||
|
|
$python3 ${get_filter_reason} . -k somatic
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### tmb ########################
|
|||
|
|
if [[ ${bedType} =~ ^(${tmb_panel})$ ]]; then
|
|||
|
|
echo $'\nTMB'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
# 过滤低频假阳性
|
|||
|
|
grep "#" ${sampleID}.somatic.split.norm.vcf > anno/${sampleID}.tmb.filter_1.vcf
|
|||
|
|
if [[ ${sampleType} =~ (ctdna) ]]; then
|
|||
|
|
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05 && a[3] <= 0.5) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
|
|||
|
|
else
|
|||
|
|
grep -v "#" ${sampleID}.somatic.split.norm.vcf | awk '{split($NF,a,":");if(a[3]>0.05) print $0}' >> anno/${sampleID}.tmb.filter_1.vcf
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# 过滤filter信息大于2的突变
|
|||
|
|
grep "#" anno/${sampleID}.tmb.filter_1.vcf > anno/${sampleID}.tmb.filter_2.vcf
|
|||
|
|
grep -v "#" anno/${sampleID}.tmb.filter_1.vcf | awk '{len=split($7,a,";");if(len<2) print $0}' >> anno/${sampleID}.tmb.filter_2.vcf
|
|||
|
|
|
|||
|
|
# 合并同一单倍型上的复杂突变
|
|||
|
|
$python3 ${merge_mnv} anno/${sampleID}.tmb.filter_2.vcf $hg19fa -o anno/${sampleID}.tmb.merged.vcf
|
|||
|
|
|
|||
|
|
# 将合并后的mnp位置左移校准
|
|||
|
|
$bcftools norm -m -both -f ${hg19fa} anno/${sampleID}.tmb.merged.vcf -o anno/${sampleID}.tmb.merged_norm.vcf
|
|||
|
|
rm anno/${sampleID}.tmb.merged.vcf
|
|||
|
|
|
|||
|
|
# annovar 注释相关数据库
|
|||
|
|
${annovar} anno/${sampleID}.tmb.merged_norm.vcf ${humandb} \
|
|||
|
|
--outfile anno/${sampleID}.tmb.annovar --buildver hg19 \
|
|||
|
|
--protocol refGene,cytoBand,1000g2015aug_all,esp6500siv2_all,exac03nonpsych,cosmic70,clinvar_20220624,intervar_20180118,dbnsfp35a,avsnp150 \
|
|||
|
|
--operation g,r,f,f,f,f,f,f,f,f --vcfinput --remove
|
|||
|
|
rm anno/${sampleID}.tmb.merged_norm.vcf
|
|||
|
|
|
|||
|
|
# vep注释(主要为HGVS)
|
|||
|
|
${vep} --dir ${vep_dir} --cache --offline --cache_version 98 \
|
|||
|
|
--use_given_ref --refseq --assembly GRCh37 --format vcf \
|
|||
|
|
--fa ${hg19fa} --force_overwrite --vcf --variant_class --gene_phenotype --vcf_info_field ANN \
|
|||
|
|
--hgvs --hgvsg --transcript_version \
|
|||
|
|
-i anno/${sampleID}.tmb.annovar.hg19_multianno.vcf -o anno/${sampleID}.tmb.vep_anno.vcf
|
|||
|
|
rm anno/${sampleID}.tmb.annovar*
|
|||
|
|
|
|||
|
|
# HGVS命名注释
|
|||
|
|
$python3 ${anno_hgvs} anno/${sampleID}.tmb.vep_anno.vcf ${clinic_transcripts} ${refFlat} -o anno/${sampleID}.tmb.hgvs_anno.vcf
|
|||
|
|
rm anno/${sampleID}.tmb.vep_anno.vcf*
|
|||
|
|
|
|||
|
|
# 过滤其他假阳性,germline等
|
|||
|
|
$python3 ${tmb_filter} anno/${sampleID}.tmb.hgvs_anno.vcf -t ${sampleType} -o anno/${sampleID}.tmb.final.filter.vcf -m ${mode} > anno/${sampleID}.tmb.filter_reason.txt
|
|||
|
|
rm anno/${sampleID}.tmb.hgvs_anno.vcf
|
|||
|
|
|
|||
|
|
# 统计变异删除的原因
|
|||
|
|
$python3 ${get_filter_reason} . -k tmb
|
|||
|
|
rm anno/${sampleID}.tmb.filter_1.vcf anno/${sampleID}.tmb.filter_2.vcf
|
|||
|
|
fi
|
|||
|
|
rm ${sampleID}.somatic.split.norm.vcf
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### concat vcf ########################
|
|||
|
|
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
|
|||
|
|
echo $'\nmerge_vcf'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
cp anno/${sampleID}.germline.final.filter.vcf anno/${sampleID}.germline.final.filter.concat.vcf
|
|||
|
|
$bgzip -f anno/${sampleID}.germline.final.filter.concat.vcf
|
|||
|
|
$tabix -p vcf anno/${sampleID}.germline.final.filter.concat.vcf.gz
|
|||
|
|
|
|||
|
|
cp anno/${sampleID}.somatic.final.filter.vcf anno/${sampleID}.somatic.final.filter.concat.vcf
|
|||
|
|
$bgzip -f anno/${sampleID}.somatic.final.filter.concat.vcf
|
|||
|
|
$tabix -p vcf anno/${sampleID}.somatic.final.filter.concat.vcf.gz
|
|||
|
|
|
|||
|
|
if [[ "$bedType" =~ (${tmb_panel}) ]];then
|
|||
|
|
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb.final.filter.concat.vcf
|
|||
|
|
$bgzip -f anno/${sampleID}.tmb.final.filter.concat.vcf
|
|||
|
|
$tabix -p vcf anno/${sampleID}.tmb.final.filter.concat.vcf.gz
|
|||
|
|
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz anno/${sampleID}.tmb.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
|
|||
|
|
rm anno/${sampleID}.tmb.final.filter.concat.vcf.gz*
|
|||
|
|
else
|
|||
|
|
$bcftools concat anno/${sampleID}.somatic.final.filter.concat.vcf.gz anno/${sampleID}.germline.final.filter.concat.vcf.gz -a -d all -o anno/${sampleID}.somatic.concat.vcf
|
|||
|
|
fi
|
|||
|
|
rm anno/${sampleID}.somatic.final.filter.concat.vcf.gz* anno/${sampleID}.germline.final.filter.concat.vcf.gz*
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
###################### anno drugs ########################
|
|||
|
|
echo $'\nanno_drugs'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
# 注释药物相关数据库
|
|||
|
|
if ([[ "$bedType" =~ (${germline_panel1}) ]]) || ([[ "$bedType" =~ (QR) && ${sampleType} != "blood" && ${sampleType} != "dna" ]]);then
|
|||
|
|
$python3 ${snv_out} anno/${sampleID}.somatic.concat.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
|||
|
|
|
|||
|
|
elif [[ "$bedType" =~ (${germline_panel2}) && "$sampleType" =~ (blood|dna) ]];then
|
|||
|
|
$python3 ${snv_out} anno/${sampleID}.germline.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
$python3 ${snv_out} anno/${sampleID}.somatic.final.filter.vcf -o anno/${sampleID}.somatic.xlsx -t ${sampleType} >> anno/${sampleID}.somatic.filter_reason.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
###################### add_pipe_tag ######################
|
|||
|
|
# 添加流程pipe tag
|
|||
|
|
$python3 ${add_pipe_tag} anno/${sampleID}.somatic.xlsx
|
|||
|
|
|
|||
|
|
#################### sanger_determine ####################
|
|||
|
|
# 确定germline突变是否需要验证
|
|||
|
|
$python3 ${sanger_determine} anno/${sampleID}.somatic.xlsx
|
|||
|
|
|
|||
|
|
#################### cis_trans ####################
|
|||
|
|
# 确定T790M和G797S共突变的顺式或反式
|
|||
|
|
$python3 ${cis_trans} anno/${sampleID}.somatic.xlsx
|
|||
|
|
|
|||
|
|
###################### core_filter ########################
|
|||
|
|
if [[ "$bedType" =~ (${core_panel}) ]];then
|
|||
|
|
$python3 ${core_filter} anno/${sampleID}.somatic.xlsx
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
###################### qc_bamdst ########################
|
|||
|
|
echo $'\nqc_bamdst'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ ^(${pcr_panel})$ || ${bedType} =~ ^(${umi_panel})$ ]]; then
|
|||
|
|
mkdir -p qc/bamdst
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst ${sampleID}.sorted.bam
|
|||
|
|
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
|
|||
|
|
|
|||
|
|
elif [[ ${bedType} =~ ^(${probe_panel})$ && ${sampleType} != "ctdna" ]]; then
|
|||
|
|
mkdir -p qc/bamdst
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst ${sampleID}.sorted.dedup.bam
|
|||
|
|
ls qc/bamdst/ | xargs -I{} mv qc/bamdst/{} qc/bamdst/${sampleID}.{}
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
mkdir -p qc/bamdst/raw
|
|||
|
|
if [[ "$sampleType" =~ (ctdna) ]]; then
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/raw ${sampleID}.sorted.bam
|
|||
|
|
else
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/raw ${sampleID}.sorted.bam
|
|||
|
|
fi
|
|||
|
|
ls qc/bamdst/raw/ | xargs -I{} mv qc/bamdst/raw/{} qc/bamdst/raw/${sampleID}.raw.{}
|
|||
|
|
|
|||
|
|
mkdir -p qc/bamdst/dedup
|
|||
|
|
if [[ "$sampleType" =~ (ctdna) ]]; then
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 1000 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
|
|||
|
|
else
|
|||
|
|
$bamdst --uncover 30 -p ${bedPath}/${bedType}.bed --cutoffdepth 500 -o qc/bamdst/dedup ${sampleID}.sorted.dedup.bam
|
|||
|
|
fi
|
|||
|
|
ls qc/bamdst/dedup | xargs -I{} mv qc/bamdst/dedup/{} qc/bamdst/dedup/${sampleID}.dedup.{}
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### chemical ########################
|
|||
|
|
|
|||
|
|
echo $'\nchemical'
|
|||
|
|
echo "$(date)"
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ ^(DA)$ ]]; then
|
|||
|
|
$samtools bedcov ${bedPath}/chemical_DA.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
|
|||
|
|
else
|
|||
|
|
$samtools bedcov ${bedPath}/chemical_DF.bed ${sampleID}.sorted.bam >${sampleID}.chemical.depth.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
if [[ "$bedType" =~ (${chemical_panel}) ]]; then
|
|||
|
|
$python3 ${germline_chemical} anno/${sampleID}.germline.anno.hg19_multianno.txt >anno/${sampleID}.chemical.txt
|
|||
|
|
$python3 ${check_sample_data} anno/${sampleID}.chemical.txt
|
|||
|
|
rm ${sampleID}.chemical.depth.txt anno/${sampleID}.chemical.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
###################### Copy Number Aberration ######################
|
|||
|
|
### hybrid capture: DG|DS|DH|DT|DL|DW|DR|DJ|DF|DQ|DE
|
|||
|
|
### targeted amplicon seq: DP|HH|HT|HW|HR|HC|HF|ZJ|ZG|YW|QJ|YR|YZ|YW|YF
|
|||
|
|
### 需要做cnv的panel:DF, DG, DJ, DQ, DR, DS, DW, DE
|
|||
|
|
|
|||
|
|
##################### CNVkit ########################
|
|||
|
|
if [[ ${bedType} =~ ^(${cnv_panel1})$ ]];
|
|||
|
|
then
|
|||
|
|
echo "CNA starts at -- `date` "
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ ^(${cnv_panel2})$ ]];
|
|||
|
|
then
|
|||
|
|
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
|||
|
|
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
|
|||
|
|
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
|||
|
|
|
|||
|
|
elif [[ ${bedType} =~ ^(${cnv_panel3})$ ]];
|
|||
|
|
then
|
|||
|
|
$python3 $cnvkit coverage ${sampleID}.sorted.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
|||
|
|
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${cnv_ref}/empty.cnn ${cnv_ref}/${bedType}.flat.cnn --no-edge -o ${sampleID}.cnr
|
|||
|
|
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
|||
|
|
|
|||
|
|
else
|
|||
|
|
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_target}/${bedType}.target.bed -p ${threads} -o ${sampleID}.targetcoverage.cnn
|
|||
|
|
$python3 $cnvkit coverage ${sampleID}.sorted.dedup.bam ${cnv_anti}/${bedType}.antitarget.bed -p ${threads} -o ${sampleID}.antitargetcoverage.cnn
|
|||
|
|
$python3 $cnvkit fix ${sampleID}.targetcoverage.cnn ${sampleID}.antitargetcoverage.cnn ${cnv_ref}/${bedType}.flat.cnn -o ${sampleID}.cnr
|
|||
|
|
$python3 $cnvkit segment -p ${threads} ${sampleID}.cnr --drop-low-coverage -o ${sampleID}.cns
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
$python3 $cnvkit call ${sampleID}.cns --center --drop-low-coverage -o ${sampleID}.call.cns
|
|||
|
|
$python3 $cnvkit export vcf ${sampleID}.call.cns -i "${sampleID}" -o ${sampleID}.cnv.vcf
|
|||
|
|
$python3 $annotate_vcf -i ${sampleID}.cnv.vcf -r $refFlat -o ${sampleID}.annotated.cnv.csv
|
|||
|
|
|
|||
|
|
$python3 ${cna_out} ${sampleID}.annotated.cnv.csv
|
|||
|
|
|
|||
|
|
##################### THeTA ########################
|
|||
|
|
# THeTA -- calculate sample purity (tumour/normal ratio)
|
|||
|
|
$python3 $cnvkit export theta ${sampleID}.cns -r ${cnv_ref}/${bedType}.flat.cnn
|
|||
|
|
${theta} ${sampleID}.interval_count --NUM_PROCESSES ${threads}
|
|||
|
|
|
|||
|
|
|
|||
|
|
echo "CNA analysis completed at: `date` "
|
|||
|
|
rm ${sampleID}.*coverage.cnn ${sampleID}.cnr ${sampleID}.cnv.vcf ${sampleID}.cns ${sampleID}.call.cns ${sampleID}.interval_count ${sampleID}.n2.graph.pdf ${sampleID}.n2.withBounds
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
################### Microsatelite instability ######################
|
|||
|
|
# ###################### MSIsensor2 ######################
|
|||
|
|
# # Detect msi on below panel using msisensor
|
|||
|
|
|
|||
|
|
if [[ ${bedType} =~ ^(${msi_panel})$ ]];
|
|||
|
|
then
|
|||
|
|
echo $'\nmsisensor2'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$msisensor2 msi -M ${msi_models} -t ${sampleID}.sorted.dedup.bam -o ${sampleID}.prefix
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
########################## Fusion of genes ########################
|
|||
|
|
##################### genefuse ########################
|
|||
|
|
# Detect potential gene fusion on below panels.
|
|||
|
|
if [[ ${bedType} =~ ^(${fusion_panel})$ ]]; then
|
|||
|
|
|
|||
|
|
echo $'\ngenefuse'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$python3 ${make_fusion_gene} ${sampleID}
|
|||
|
|
if [[ "$sampleType" =~ (ctdna) ]]; then
|
|||
|
|
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.umi_consensus.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
|
|||
|
|
rm ${sampleID}.umi_consensus.fq.gz
|
|||
|
|
else
|
|||
|
|
$genefuse -t ${threads} -r ${hg19fa} -f ${sampleID}.fusion_genes.csv -1 ${sampleID}.R1_001.trim.fq.gz -2 ${sampleID}.R2_001.trim.fq.gz -h ${sampleID}.fusion.html -j ${sampleID}.fusion.json 1> ${sampleID}.fusion.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
echo $'\ndelly'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$delly call -x ${delly_low_regions} -g ${hg19fa} -o ${sampleID}.bcf ${sampleID}.sorted.dedup.bam
|
|||
|
|
$bcftools view ${sampleID}.bcf > ${sampleID}.sv.raw.vcf
|
|||
|
|
rm ${sampleID}.bcf*
|
|||
|
|
$python3 $delly_sv_filter ${sampleID}.sv.raw.vcf
|
|||
|
|
rm ${sampleID}.delly*
|
|||
|
|
|
|||
|
|
$python3 $fusion_filter ${sampleID}.fusion.json -o ${sampleID}.fusion_filtered.json
|
|||
|
|
$python3 ${fusion_out} ${sampleID}.fusion_filtered.json
|
|||
|
|
$python3 ${check_fusion_result} ${sampleID}.fusion.xlsx
|
|||
|
|
rm ${sampleID}.fusion.txt
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
########################## Quality control #########################
|
|||
|
|
##################### Fastp ########################
|
|||
|
|
# This step is used for quality control by fastqc.
|
|||
|
|
|
|||
|
|
mkdir -p qc/fastp
|
|||
|
|
echo $'\nQC_fastp'
|
|||
|
|
echo "$(date)"
|
|||
|
|
mv ${sampleID}.jion ${sampleID}.html qc/fastp/
|
|||
|
|
|
|||
|
|
##################### samtools ########################
|
|||
|
|
# This step is used for quality control by samtools.
|
|||
|
|
|
|||
|
|
mkdir -p qc/samtools
|
|||
|
|
echo $'\nQC_samtools'
|
|||
|
|
echo "$(date)"
|
|||
|
|
$samtools stats -@ ${threads} ${sampleID}.sorted.bam >qc/samtools/${sampleID}.sorted.bam.stats.txt
|
|||
|
|
|
|||
|
|
|
|||
|
|
##################### hla_type_neo_predict ########################
|
|||
|
|
|
|||
|
|
# This step is used for hla_type_neo_predict by hlahd and MuPeXI.
|
|||
|
|
if [[ "$bedType" =~ ^(${hla_panel})$ ]]; then
|
|||
|
|
echo $'\nHLA'
|
|||
|
|
echo "$(date)"
|
|||
|
|
cp anno/${sampleID}.tmb.final.filter.vcf anno/${sampleID}.tmb_copy.vcf
|
|||
|
|
grep '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.header
|
|||
|
|
grep -v '#' anno/${sampleID}.tmb_copy.vcf > anno/${sampleID}.body
|
|||
|
|
|
|||
|
|
gunzip -k ${sampleID}.R1_001.trim.fq.gz
|
|||
|
|
gunzip -k ${sampleID}.R2_001.trim.fq.gz
|
|||
|
|
# get 300w reads, about 1GB to to hla typing
|
|||
|
|
$seqtk sample -s100 ${sampleID}.R1_001.trim.fq 3000000 > ${sampleID}.R1_001.trim.sub.fq
|
|||
|
|
$seqtk sample -s100 ${sampleID}.R2_001.trim.fq 3000000 > ${sampleID}.R2_001.trim.sub.fq
|
|||
|
|
|
|||
|
|
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -results_TMB_txt anno/${sampleID}.body -fq1 ${sampleID}.R1_001.trim.sub.fq -fq2 ${sampleID}.R2_001.trim.sub.fq
|
|||
|
|
|
|||
|
|
# 拼接vcf
|
|||
|
|
cat anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt > anno/${sampleID}.mupexi.vcf
|
|||
|
|
|
|||
|
|
$python2 ${hla_optitype} -sampleID $sampleID -outDir $outDir -vcf anno/${sampleID}.mupexi.vcf -determine_tnb anno/${sampleID}.body
|
|||
|
|
rm ${sampleID}.R*_001.trim.fq ${sampleID}.R*_001.trim.sub.fq \
|
|||
|
|
anno/${sampleID}.body anno/${sampleID}.tmb_copy.vcf anno/${sampleID}.header anno/${sampleID}.select_variant_from_TMB.txt \
|
|||
|
|
neo/${sampleID}_coverage_plot.pdf neo/${sampleID}.fasta neo/${sampleID}.g \
|
|||
|
|
neo/${sampleID}_netMHCpan* neo/${sampleID}_pepmatch_* neo/${sampleID}_peptide_netMHCinput.txt neo/${sampleID}_reference_peptide_* \
|
|||
|
|
neo/${sampleID}_vep_compatible_vcf.vcf neo/${sampleID}_vep.vep
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
|
|||
|
|
########################## merge_data #########################
|
|||
|
|
$python3 ${single_sample_qc} . ${sampleID} ${sampleType}
|
|||
|
|
$python3 ${parser_single_sample_data} . ${sampleID}
|
|||
|
|
$python3 ${collect_single_sample_data} . ${sampleID}
|
|||
|
|
|
|||
|
|
|
|||
|
|
rm ${sampleID}.R*_001.trim.fq.gz
|
|||
|
|
if [[ ${bedType} =~ ^(${probe_panel})$ ]]; then
|
|||
|
|
rm ${sampleID}.sorted.bam*
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
echo "$sampleID -- ending at: $(date)"
|
|||
|
|
echo "pipeline finished!" >finish.txt
|