#!/usr/bin/env perl use strict; #use warnings; use List::Util qw(sum); die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6; my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV; my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public"; print "$pipeline 过滤使用public路径:$public_path\n"; my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase"; print "$pipeline 过滤使用database路径:$database_path\n"; # open OUT, ">$**.hg19_multianno_filter.txt"; open OUT, "> $out"; # open OUT1, ">**.hg19_multianno_tag.txt"; open TAG_OUT, ">$tag_out"; if (-z $input) { print "文件 $input 为空 \n"; exit(0); # 正常退出脚本 } open IN, "$input"; my $head = ; chomp($head); my @columns = split("\t", $head); my $new_head = join("\t", "Validated", "ClinicalSign", @columns[0 .. 6], "Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "Oncogenic", "Mutation_Effect", "genetag", "process"); if (!($pipeline eq 'somatic' || $pipeline eq 'tmb' || $pipeline eq 'hotspot' || $pipeline eq 'germline')) { die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot or tmb'"; } print OUT "$new_head\n"; print TAG_OUT "TAG\t$head\n"; my %blacklist = blacklist(); my ($muts_ref, $hcs_ref, $mmr_ref, $hhr1_ref, $hhr2_ref) = info(); my @muts = @$muts_ref; my @hcs = @$hcs_ref; my @mmr = @$mmr_ref; my @hhr1 = @$hhr1_ref; my @hhr2 = @$hhr2_ref; my %transcript = transcript(); my %oncogenic = get_oncogenic(); while () { chomp; my @line = split(/\t/, $_); my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found"; my $filters = split(";", $line[101]); my $gene = (split(";", $line[6]))[0]; my @reason; # 黑名单 if (exists $blacklist{join("_", @line[0 .. 4])}) { push @reason, 'blacklist'; } # 是否检测 if ((@muts) and !(grep {$_ eq $gene} @muts)) { push @reason, 'notarget_gene'; } # 人群频率 if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) { push @reason, 'common_snp'; } # ExonicFunc.refGene 过滤同义突变,未知突变 if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") { push @reason, 'synonymous'; } # CLNSIG benign if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) { push @reason, 'benign'; } # cfdna 样本 if ($sample_type eq 'c') { if ($line[101] ne 'PASS' and ($freq < 0.005 or ($freq >= 0.005 and $freq < 0.01 and $filters >= 2))) { push @reason, 'byfilter'; } # 低频的 cosmic91 无记录的 if ($freq < 0.01 and $line[11] eq '.') { push @reason, 'cfdna_lowfreq_cosmic'; } # 低频的;cosmic91 只有1条以内的 if ($line[11] =~ /OCCURENCE=(\S+)/) { my $cosmic = $1; $cosmic =~ s/\(\S+?\)//g; my @cosmic = split(",", $cosmic); $cosmic = sum @cosmic; if ($freq < 0.01 and $cosmic <= 1) { push @reason, 'cfdna_lowfreq_cosmic1'; } if (($cosmic >= 10) and $pipeline eq 'tmb') { push @reason, 'cfdna_lowfreq_cosmic1'; } } } else { # 不是pass 低频的; 或者 0.02 < 0.05 之间 个数大于2 血液 0.005 - 0.01 if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) { push @reason, 'byfilter'; } } my $protein; # met 基因 在特定区域 if (($gene eq "MET") && ($line[0] eq "chr7") && (($line[1] > 116411872 && $line[1] <= 116411902) or ($line[2] > 116411872 && $line[2] <= 116411902) or ($line[1] > 116412043 && $line[1] <= 116412046) or ($line[2] > 116412043 && $line[2] <= 116412046))) { $line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx")); $line[8] = 'skipping'; $protein = 'Exon 14 Skipping Mutation'; } elsif ($line[9] eq '.') { # splicing 位点 if ($line[5] =~ /splicing/) { my @hgvs = split(/;/, $line[7]); my $hgvs = $hgvs[0]; my $transcript_gene; $transcript_gene = $transcript{$gene} if (exists $transcript{$gene}); if (grep {/$transcript_gene/} @hgvs) { $hgvs = (grep {/$transcript_gene/} @hgvs)[0]; } $hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/; my $spl = $3; my $exon = $2; # splicing 位点 的前后 1-2bp 的 if ($spl =~ /\d+\+[1|2]\D+/) { my $intron = $exon; $hgvs =~ s/exon(\d+)/exon$exon;intron$intron/; $line[9] = join(":", ($gene, $hgvs)); } elsif ($spl =~ /\d+\-[1|2]\D+/) { my $intron = $exon - 1; $hgvs =~ s/exon(\d+)/intron$intron;exon$exon/; $line[9] = join(":", ($gene, $hgvs)); } else { push @reason, 'not_need_spl'; } $protein = 'Truncating Mutations'; } else { push @reason, 'not_need_spl_inron'; } } else { my @hgvs = split(/,/, $line[9]); my $hgvs = $hgvs[0]; my $transcript_gene; $transcript_gene = $transcript{$gene} if (exists $transcript{$gene}); if (grep {/$transcript_gene/} @hgvs) { $hgvs = (grep {/$transcript_gene/} @hgvs)[0]; } $line[9] = $hgvs; $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/; $protein = $3; if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') { $protein = 'Truncating Mutations'; } } # hotspot 不过滤但是修改 hgvs if ($pipeline eq 'hotspot') { @reason = (); } # tmb 流程去掉 不过滤但是修改 hgvs if ($pipeline eq 'tmb') { @reason = grep(!/synonymous/, @reason); if (($freq < 0.05) and ($sample_type eq 't')) { push @reason, 'lowfreq_tissue_tmb'; } if (($freq < 0.005) and ($sample_type eq 'c')) { push @reason, 'lowfreq_cfdna_tmb'; } } if (@reason) { print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n"; next; } my ($oncogenic_col, $mut_effect_col); my $get_key = "$gene\_$protein"; if (exists $oncogenic{lc $get_key}) { my @get_values = split('&&', $oncogenic{lc $get_key}); $oncogenic_col = $get_values[0]; $mut_effect_col = $get_values[1]; } else { $oncogenic_col = '.'; $mut_effect_col = '.'; } my $clisig; if ($line[16] =~ /Likely_pathogenic|drug/i) { $clisig = '2'; } elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) { $clisig = '1'; } else { $clisig = '3'; } my @genetags; if (grep {$_ eq $gene} @hcs) { push @genetags, 'hcs'; } if (grep {$_ eq $gene} @mmr) { push @genetags, 'mmr'; } if (grep {$_ eq $gene} @hhr1) { push @genetags, 'hrr1'; } if (grep {$_ eq $gene} @hhr2) { push @genetags, 'hrr2'; } my $validated = '1'; $line[6] = $gene; my $genetag = join(";", @genetags); my $new_line = join("\t", $validated, $clisig, @line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], $oncogenic_col, $mut_effect_col, $genetag, $pipeline); print OUT "$new_line\n"; print TAG_OUT "PASS\t", join("\t", @line), "\n"; } # black list sub blacklist { open BKLT, "$public_path/blacklist.txt"; my %bk; ; while () { chomp; my @line = split("\t"); my $key = join("_", @line[0 .. 4]); $bk{$key} = 1; } return %bk; } sub info { open INFO, "$database_path/info.csv"; # 读取并解析表头 my $header = ; chomp($header); my @column_names = split(',', $header); while () { chomp; my @line = split(/,/, $_); # 将数据与表头对应 my %record; @record{@column_names} = @line; if ($record{'project'} eq $project) { if ($record{'mutation'} ne "NA") { @muts = split(/\//, $record{'mutation'}); } if ($record{'HCS'} ne "NA") { @hcs = split(/\//, $record{'HCS'}); } if ($record{'MMR'} ne "NA") { @mmr = split(/\//, $record{'MMR'}); } if ($record{'HRR_I'} ne "NA") { @hhr1 = split(/\//, $record{'HRR_I'}); } if ($record{'HRR_II'} ne "NA") { @hhr2 = split(/\//, $record{'HRR_II'}); } } } return \@muts, \@hcs, \@mmr, \@hhr1, \@hhr2 } sub transcript { open TR, "$database_path/oncokbgene.txt"; my %oncogene; while () { chomp; my @line = split; $oncogene{$line[0]} = $line[2]; $oncogene{$line[0]} =~ s/\.\d+//; } return %oncogene; } # oncokb snv_indel 临床意义定义 sub get_oncogenic { my %sig; open SNV_INDEL, "$database_path/snv_indel_mutation.csv"; ; while () { chomp; my @line = split(","); my $key = join("_", @line[0, 1]); $sig{lc $key} = join("&&", @line[2, 3]); } return %sig; }