#!/usr/bin/env perl
#
# Tag and filter the rows of a tab-separated variant annotation table (the
# legacy file names in this script refer to "*.hg19_multianno_*.txt").
#
# Usage:
#   perl SCRIPT input out tag_out project sample_type pipeline
#
#   input        annotated table; the first line is a header
#   out          receives only the rows that pass every filter
#   tag_out      receives every row, prefixed with "PASS" or with the
#                ";"-joined list of reasons it was rejected
#   project      project id looked up in column 0 of info.csv
#   sample_type  'c' enables the additional cfDNA low-frequency filters
#   pipeline     'somatic', 'hotspot' or 'germline'
#
# Environment:
#   PUBLIC    directory holding blacklist.txt and info.csv
#   DATABASE  directory holding oncokbgene.txt
#
# NOTE: `use utf8` is intentionally absent so the non-ASCII status message
# below is written out as the exact bytes stored in this file.
use strict;
use warnings;
use List::Util qw(sum);

die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;

# Column indices copied to the output after the first seven columns.  The
# same list is used for the header and for every data row so they cannot
# drift apart.
my @LEAD_COLUMNS = (0 .. 6);
my @KEPT_COLUMNS = (7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102);

# Columns compared against the 1% population-frequency threshold.
my @POP_FREQ_COLUMNS = (17, 18, 19, 20, 23, 28, 32);

# Name of the first output column for each supported pipeline.
my %head_key_for = (
    somatic  => 'Validated',
    hotspot  => 'Validated',
    germline => 'ClinicalSign',
);

# Legacy names: "**.hg19_multianno_filter.txt" / "**.hg19_multianno_tag.txt".
open my $out_fh, '>', $out
    or die "cannot open '$out' for writing: $!";
open my $tag_fh, '>', $tag_out
    or die "cannot open '$tag_out' for writing: $!";

# An empty input is not an error: both (empty) output files already exist
# at this point and the script exits normally.
if (-z $input) {
    print "文件 $input 为空 \n";
    exit(0);
}

open my $in_fh, '<', $input
    or die "cannot open '$input' for reading: $!";

my $head = <$in_fh>;
die "cannot read a header line from '$input'" unless defined $head;
chomp $head;    # keep a trailing newline out of the last header column
my @columns  = split("\t", $head);
my $new_head = join("\t", @columns[@LEAD_COLUMNS], "Freq", @columns[@KEPT_COLUMNS], "process");

die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot'"
    unless exists $head_key_for{$pipeline};
my $head_key = $head_key_for{$pipeline};

print {$out_fh} "$head_key\t$new_head\n";
print {$tag_fh} "TAG\t$new_head\n";

my $blacklist      = blacklist();          # hash ref keyed by columns 0..4 joined with "_"
my @mut_gene_list  = mut_gene($project);
my %is_target_gene = map { $_ => 1 } @mut_gene_list;
my %transcript     = transcript();

while (my $row = <$in_fh>) {
    chomp $row;
    my @line = split(/\t/, $row);

    # Allele frequency is taken from the "AF=" entry of column 102.
    my $freq;
    if (defined $line[102] and $line[102] =~ /AF=([\d.]+)/) {
        $freq = $1;
    }
    else {
        die "AF value not found";
    }
    my $freq_value = as_number($freq);

    # Number of ";"-separated entries in the filter column.
    my @filters      = split(";", $line[101]);
    my $filter_count = scalar @filters;

    # Only the first ";"-separated gene name is used.
    my $gene = (split(";", $line[6]))[0];
    $gene = '' unless defined $gene;

    my @reason;

    # Blacklisted variant.
    if (exists $blacklist->{ join("_", @line[0 .. 4]) }) {
        push @reason, 'blacklist';
    }

    # Gene is outside the project's target list (skipped when the list is empty).
    if (@mut_gene_list and !$is_target_gene{$gene}) {
        push @reason, 'notarget_gene';
    }

    # Population frequency above 1% in any of the frequency columns.
    if (grep { as_number($_) > 0.01 } @line[@POP_FREQ_COLUMNS]) {
        push @reason, 'common_snp';
    }

    # ExonicFunc.refGene: drop synonymous and unknown changes.
    if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
        push @reason, 'synonymous';
    }

    # CLNSIG: benign with no other, potentially relevant, annotation.
    if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
        push @reason, 'benign';
    }

    # Not PASS and either below 2%, or below 5% with at least two filters.
    if ($line[101] ne 'PASS'
        and ($freq_value < 0.02 or ($freq_value < 0.05 and $filter_count >= 2)))
    {
        push @reason, 'byfilter';
    }

    # cfDNA samples only.
    if ($sample_type eq 'c') {

        # Low frequency and no cosmic91 record.
        if ($freq_value < 0.01 and $line[11] eq '.') {
            push @reason, 'cfdna_lowfreq_cosmic';
        }

        # Low frequency and at most one cosmic91 occurrence in total.
        if ($line[11] =~ /OCCURENCE=(\S+)/) {
            my $occurrences = $1;

            # Drop the parenthesised annotation after each count so that
            # only the numbers are summed.
            $occurrences =~ s/\([^)]*\)//g;
            my $total = sum(0, map { as_number($_) } split(",", $occurrences));
            if ($freq_value < 0.01 and $total <= 1) {
                push @reason, 'cfdna_lowfreq_cosmic1';
            }
        }
    }

    if ($line[9] eq '.') {

        # No change description in column 9: keep only splicing sites, or
        # intronic sites of MET.
        if ($line[5] =~ /splicing/ or ($line[5] =~ /intron/ and $gene eq "MET")) {
            my $hgvs = pick_hgvs($transcript{$gene}, split(/;/, $line[7]));

            # List-context capture: $spl is undef (then '') when the entry
            # does not look like "<transcript>:exon<N>:c.<change>".
            my ($spl) = $hgvs =~ /\S+:exon\d+:c\.(\S+)$/;
            $spl = '' unless defined $spl;

            # Within 1-2 bp of the exon boundary, or any MET site.
            if ($spl =~ /\d+[+-][12]\D+/) {
                $line[9] = join(":", $gene, $hgvs);
            }
            elsif ($gene eq "MET") {
                $line[9] = join(":", $gene, "exon14", "c.xxx");
                $line[8] = 'skipping';
            }
            else {
                push @reason, 'not_need_spl';
            }
        }
        else {
            push @reason, 'not_need';
        }
    }
    else {

        # Reduce column 9 to a single entry, preferring the gene's
        # configured transcript.
        $line[9] = pick_hgvs($transcript{$gene}, split(/,/, $line[9]));
    }

    # The hotspot pipeline never filters; it only rewrites column 9.
    if ($pipeline eq 'hotspot') {
        @reason = ();
    }

    if (@reason) {

        # NOTE(review): rejected rows are written with ALL input columns,
        # which does not line up with the "TAG" header above -- confirm
        # whether downstream consumers rely on this layout before changing.
        print {$tag_fh} join(";", @reason), "\t", join("\t", @line), "\n";
        next;
    }

    $line[6] = $gene;

    # somatic / hotspot rows always get '1'; germline rows are tiered by
    # the CLNSIG text.  String comparison is required here: a numeric
    # comparison of pipeline names is always true.
    my $validated = '1';
    if ($pipeline eq 'germline') {
        if ($line[16] =~ /Likely_pathogenic|drug/i) {
            $validated = '2';
        }
        elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
            $validated = '1';
        }
        else {
            $validated = '3';
        }
    }

    my $new_line = join("\t", @line[@LEAD_COLUMNS], $freq, @line[@KEPT_COLUMNS], $pipeline);
    print {$out_fh} "$validated\t$new_line\n";
    print {$tag_fh} "PASS\t$new_line\n";
}

close $in_fh;
close $out_fh or die "error while writing '$out': $!";
close $tag_fh or die "error while writing '$tag_out': $!";

# Numeric value of a field, with the same coercion Perl applies implicitly
# ("." and other non-numeric text count as 0, a leading number is used).
# Keeping the coercion here lets the rest of the script run under
# `use warnings` without a warning for every "." placeholder.
sub as_number {
    my ($value) = @_;
    no warnings qw(numeric uninitialized);
    return 0 + (defined $value ? $value : 0);
}

# Choose one entry from @candidates: the first one containing
# $transcript_id when an id is given and found, otherwise the first entry.
# Returns '' for an empty candidate list.  The id is matched as a literal
# substring; an undefined id must never reach a regex because an empty
# pattern would silently reuse the last successful one.
sub pick_hgvs {
    my ($transcript_id, @candidates) = @_;
    if (defined $transcript_id and length $transcript_id) {
        my ($preferred) = grep { index($_, $transcript_id) >= 0 } @candidates;
        return $preferred if defined $preferred;
    }
    return defined $candidates[0] ? $candidates[0] : '';
}

# black list
# Load $PUBLIC/blacklist.txt and return a reference to a hash whose keys
# are the first five tab-separated columns of each row joined with "_".
# The first line of the file is discarded (presumably a header -- confirm
# against the actual file).  A missing file is reported and yields an
# empty blacklist.
sub blacklist {
    my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
    my $path        = "$public_path/blacklist.txt";
    my %bk;
    open my $fh, '<', $path or do {
        warn "cannot open '$path': $! -- continuing without a blacklist\n";
        return \%bk;
    };
    scalar <$fh>;    # skip the first line
    while (my $row = <$fh>) {
        chomp $row;
        my @field = split("\t", $row);
        next if @field < 5;
        $bk{ join("_", @field[0 .. 4]) } = 1;
    }
    close $fh;
    return \%bk;
}

# Return the target genes of a project: the "/"-separated entries of
# column 2 of the first $PUBLIC/info.csv row whose column 0 equals the
# project id and whose columns 2 and 9 are both different from "NA".
# Returns an empty list when there is no such row or the file is missing.
# The project id defaults to the one given on the command line.
sub mut_gene {
    my ($wanted) = @_;
    $wanted = $project unless defined $wanted;

    my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
    my $path        = "$public_path/info.csv";
    my @muts;
    open my $fh, '<', $path or do {
        warn "cannot open '$path': $! -- continuing without a target gene list\n";
        return @muts;
    };
    while (my $row = <$fh>) {
        chomp $row;
        my @field = split(/,/, $row);
        next unless defined $field[0] and $field[0] eq $wanted;
        my ($genes, $flag) = map { defined $_ ? $_ : '' } @field[2, 9];
        next if $genes eq "NA" or $flag eq "NA";
        @muts = split(/\//, $genes);
        last;
    }
    close $fh;
    return @muts;
}

# Load $DATABASE/oncokbgene.txt and return a flat hash mapping column 0 of
# each whitespace-separated row to column 2 with its first ".<digits>"
# suffix removed (used above as the preferred transcript id of a gene).
# Rows with fewer than three columns are ignored; a missing file is
# reported and yields an empty mapping.
sub transcript {
    my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase/";
    my $path          = "$database_path/oncokbgene.txt";
    my %oncogene;
    open my $fh, '<', $path or do {
        warn "cannot open '$path': $! -- continuing without preferred transcripts\n";
        return %oncogene;
    };
    while (my $row = <$fh>) {
        chomp $row;
        my @field = split(' ', $row);
        next unless defined $field[0] and defined $field[2];
        (my $id = $field[2]) =~ s/\.\d+//;
        $oncogene{ $field[0] } = $id;
    }
    close $fh;
    return %oncogene;
}