#!/usr/bin/env perl
#
# Split an annotated variant table into three tab-separated reports:
#
#   somtic_out    rows that pass every rule, prefixed with "1"
#   germline_out  passing rows whose sample frequency is above 0.1,
#                 prefixed with a tier (1, 2 or 3) derived from the
#                 clinical-significance column
#   tag_out       every input row, prefixed with the name of the rule
#                 that decided it (PASS, benign, blacklist, ...)
#
# Usage:
#   perl SCRIPT input project sample_type somtic_out germline_out tag_out
#
# sample_type 'c' additionally enables the cfdna_lowfreq_cosmic rule.
#
# Reference files are read from $PUBLIC (blacklist.txt, info.txt) and
# $DATABASE (oncokbgene.txt); both fall back to built-in default paths.
#
# The file is written so that it can also be loaded with require/do
# without running: main() is only invoked when executed as a script.

use strict;
# NOTE(review): warnings stay disabled, as in the original. Fields are
# compared numerically without validation, and '.' is used as a
# missing-value marker in at least some columns, so enabling warnings
# would presumably flood STDERR -- confirm before switching them on.
#use warnings;
use List::Util qw(sum);

# Zero-based column indexes of the input table. The names reflect how each
# column is used below; the layout looks like an ANNOVAR multianno table,
# but the real header is not visible here -- TODO confirm.
my $COL_REGION      = 5;      # tested for /splicing/
my $COL_GENE        = 6;      # ';'-separated gene names (splicing rows)
my $COL_GENE_DETAIL = 7;      # ';'-separated transcript:exonN:c.X entries
my $COL_EXONIC_FUNC = 8;      # "synonymous SNV" / "unknown" rows are dropped
my $COL_AA_CHANGE   = 9;      # ','-separated gene:tx:exonN:c.X:p.Y entries
my $COL_COSMIC      = 11;     # "...OCCURENCE=..." or '.'
my $COL_CLINSIG     = 16;     # clinical-significance text
my $COL_FILTER      = 101;    # 'PASS' or ';'-separated filter names

# A value >= $MAX_COMMON_FREQ in any of these columns tags the row
# "common_snp".
my @COMMON_FREQ_COLS = (17, 18, 19, 20, 23, 28, 32);
my $MAX_COMMON_FREQ  = 0.01;

my $CFDNA_LOW_FREQ    = 0.01;    # cfDNA rule applies below this frequency
my $GERMLINE_MIN_FREQ = 0.1;     # germline report needs more than this

# State filled by the load_* routines and read by the lookup helpers.
my %bk;                     # blacklist keys (first five columns joined by '_')
my %target_gene;            # genes configured for the requested project
my %oncogene;               # gene => transcript id, from oncokbgene.txt
my $oncogene_loaded = 0;    # oncokbgene.txt is read lazily, once

main(@ARGV) unless caller;

# Entry point: validate arguments, load the reference tables, then stream
# the input table through process_variant().
sub main {
    my @args = @_;
    die "useage:perl $0 input project sample_type somtic_out germline_out tag_out "
        unless @args == 6;
    my ($input, $project, $sample_type, $somtic_out, $germline_out, $tag_out)
        = @args;

    open my $in, '<', $input or die "cannot read input '$input': $!\n";
    my $head = <$in>;
    $head = '' unless defined $head;    # empty input: still emit header prefixes

    # Load reference data before creating any output file, so that a broken
    # setup does not leave half-written reports behind.
    my $public_path = defined $ENV{'PUBLIC'}
        ? $ENV{'PUBLIC'}
        : "/dataseq/jmdna/codes/public/";
    load_blacklist("$public_path/blacklist.txt");
    load_target_genes("$public_path/info.txt", $project);

    my %out = (
        somatic  => open_report($somtic_out,   "可信\t$head"),
        germline => open_report($germline_out, "临床意义\t$head"),
        tag      => open_report($tag_out,      "TAG\t$head"),
    );

    while (my $row = <$in>) {
        chomp $row;
        process_variant($row, $sample_type, \%out);
    }
    close $in;

    # Buffered write errors only surface at close time.
    for my $name (sort keys %out) {
        close $out{$name} or die "error closing $name report: $!\n";
    }
    return;
}

# Create a report file, write its header line and return the handle.
sub open_report {
    my ($path, $header) = @_;
    open my $fh, '>', $path or die "cannot write '$path': $!\n";
    print {$fh} $header;
    return $fh;
}

# Build the blacklist lookup key: the first five fields joined by '_'.
sub variant_key {
    my @fields = @_;
    return join('_', @fields[0 .. 4]);
}

# Read the blacklist file into %bk. The first line is skipped.
sub load_blacklist {
    my ($path) = @_;
    open my $fh, '<', $path or die "cannot read blacklist '$path': $!\n";
    %bk = ();
    my $skipped_first_line = <$fh>;
    while (my $row = <$fh>) {
        chomp $row;
        $bk{ variant_key(split /\t/, $row) } = 1;
    }
    close $fh;
    return;
}

# Return "1" when the given key is blacklisted, "" otherwise.
sub blacklist {
    my $pos = shift @_;
    return exists $bk{$pos} ? "1" : "";
}

# Read the project table and remember the '/'-separated gene list (third
# column) of the row whose first column equals $project. Rows whose third
# column is "NA" are ignored; if several rows match, the last one wins.
sub load_target_genes {
    my ($path, $project) = @_;
    open my $fh, '<', $path or die "cannot read project table '$path': $!\n";
    %target_gene = ();
    while (my $row = <$fh>) {
        chomp $row;
        my @field = split /\t/, $row;
        next unless $field[0] eq $project and $field[2] ne "NA";
        %target_gene = map { $_ => 1 } split m{/}, $field[2];
    }
    close $fh;
    return;
}

# True when $gene was parsed successfully and belongs to the project panel.
sub is_target_gene {
    my ($gene) = @_;
    return (defined $gene and exists $target_gene{$gene}) ? 1 : 0;
}

# Sum the counts listed after "OCCURENCE=" in the COSMIC column, ignoring
# the parenthesised text attached to each count, e.g.
# "OCCURENCE=1(lung),2(breast)" gives 3. Returns undef when the field has
# no OCCURENCE entry.
sub cosmic_occurrence_total {
    my ($field) = @_;
    return unless defined $field and $field =~ /OCCURENCE=(\S+)/;
    (my $counts = $1) =~ s/\([^)]*\)//g;
    my @counts = split /,/, $counts;
    my $total  = sum @counts;
    return defined $total ? $total : 0;
}

# Map the clinical-significance text to the germline report tier:
#   2  mentions Likely_pathogenic or drug
#   1  mentions pathogenic without Conflicting
#   3  anything else
sub germline_tier {
    my ($clinsig) = @_;
    return 2 if $clinsig =~ /Likely_pathogenic|drug/i;
    return 1 if $clinsig =~ /pathogenic/i and $clinsig !~ /Conflicting/i;
    return 3;
}

# True when a coding change looks like "<digits>+1..." / "<digits>-2...":
# digits, a sign, 1 or 2, then at least one non-digit.
sub is_splice_site_change {
    my ($change) = @_;
    return (defined $change and $change =~ /\d+[+-][12]\D+/) ? 1 : 0;
}

# Fill %oncogene (first column => third column) from oncokbgene.txt.
# A missing table is reported once and treated as empty, so callers fall
# back to the first listed transcript.
sub load_oncogene_table {
    my $data_path = defined $ENV{'DATABASE'}
        ? $ENV{'DATABASE'}
        : "/dataseq/jmdna/codes/reportbase/";
    $oncogene_loaded = 1;
    my $path = "$data_path/oncokbgene.txt";
    open my $fh, '<', $path or do {
        warn "cannot read '$path': $!\n";
        return;
    };
    while (my $row = <$fh>) {
        chomp $row;
        my @field = split ' ', $row;
        $oncogene{ $field[0] } = $field[2];
    }
    close $fh;
    return;
}

# Return the transcript id configured for $gene with its ".N" version
# suffix removed, or "" (after printing a notice) when the gene is unknown.
sub transcript {
    my $gene = shift @_;
    load_oncogene_table() unless $oncogene_loaded;
    if (exists $oncogene{$gene}) {
        # Strip on a copy so repeated calls always see the stored value.
        (my $nm = $oncogene{$gene}) =~ s/\.\d+//;
        return $nm;
    }
    print "$gene has no NM id in oncokbgene.txt\n";
    return "";
}

# Among @entries, return the first one that mentions the transcript
# configured for $gene; returns undef when there is no configured
# transcript or no entry mentions it.
sub preferred_entry {
    my ($gene, @entries) = @_;
    my $nm = transcript($gene) or return;
    my @hits = grep { defined $_ and /\Q$nm\E/ } @entries;
    return $hits[0];
}

# Classify one input row (already chomped) and write it to the reports.
#   $row          tab-separated input line
#   $sample_type  'c' enables the cfdna_lowfreq_cosmic rule
#   $out          hash ref with the 'somatic', 'germline', 'tag' handles
# Rules are applied in a fixed order; the first one that fires decides the
# tag written to the tag report.
sub process_variant {
    my ($row, $sample_type, $out) = @_;

    my @line = split /\t/, $row;
    # Fifth ':'-separated field of the last column; compared below as a
    # fraction (0.01, 0.02, ...).
    my $freq    = (split /:/, $line[-1])[4];
    my $clinsig = $line[$COL_CLINSIG];

    # Write the row, in its current state, to the tag report.
    my $tag = sub {
        my ($label) = @_;
        print { $out->{tag} } "$label\t", join("\t", @line), "\n";
        return;
    };

    # Write a passing row: tag report, somatic report and, for
    # high-frequency rows, the germline report.
    my $report = sub {
        my @cols   = @_;
        my $record = join("\t", @cols) . "\n";
        $tag->('PASS');
        print { $out->{somatic} } "1\t", $record;
        if ($freq > $GERMLINE_MIN_FREQ) {
            print { $out->{germline} } germline_tier($clinsig), "\t", $record;
        }
        return;
    };

    if (   $line[$COL_EXONIC_FUNC] eq "synonymous SNV"
        or $line[$COL_EXONIC_FUNC] eq "unknown")
    {
        return $tag->('synonymous');
    }

    if (grep { !($line[$_] < $MAX_COMMON_FREQ) } @COMMON_FREQ_COLS) {
        return $tag->('common_snp');
    }

    if (    $clinsig =~ /benign/i
        and $clinsig !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i)
    {
        return $tag->('benign');
    }

    if ($sample_type eq 'c' and $freq < $CFDNA_LOW_FREQ) {
        my $occurrences = cosmic_occurrence_total($line[$COL_COSMIC]);
        if (   (defined $occurrences and $occurrences <= 1)
            or $line[$COL_COSMIC] eq '.')
        {
            return $tag->('cfdna_lowfreq_cosmic');
        }
    }

    if (blacklist(variant_key(@line))) {
        return $tag->('blacklist');
    }

    if ($line[$COL_AA_CHANGE] ne '.') {
        my @entries = split /,/, $line[$COL_AA_CHANGE];
        my $entry   = $entries[0];

        # List assignment leaves $gene undefined when the entry does not
        # parse, instead of picking up a stale capture variable.
        my ($gene) = $entry =~ /(\S+):\S+:exon\d+:c\.\S+:p\.\S+$/;
        return $tag->('nontarget_gene') unless is_target_gene($gene);

        if ($line[$COL_FILTER] ne 'PASS') {
            my @failed = split /;/, $line[$COL_FILTER];
            # Below 2%: always rejected. From 2% up to 5%: rejected when
            # two or more filters are listed.
            if ($freq < 0.02 or ($freq < 0.05 and @failed >= 2)) {
                return $tag->('byfilter');
            }
        }

        my $preferred = preferred_entry($gene, @entries);
        $entry = $preferred if defined $preferred;
        $line[$COL_AA_CHANGE] = $entry;

        return $report->(
            @line[0 .. 4], "exonic", $gene, @line[7 .. $#line]
        );
    }

    if ($line[$COL_REGION] =~ /splicing/) {
        my $gene = (split /;/, $line[$COL_GENE])[0];
        return $tag->('nontarget_gene') unless is_target_gene($gene);

        my @entries   = split /;/, $line[$COL_GENE_DETAIL];
        my $entry     = $entries[0];
        my $preferred = preferred_entry($gene, @entries);
        $entry = $preferred if defined $preferred;

        my ($change) = $entry =~ /\S+:exon\d+:c\.(\S+)$/;
        return $tag->('unknow1') unless is_splice_site_change($change);

        $line[$COL_GENE_DETAIL] = join(":", $gene, $entry);
        return $report->(
            @line[0 .. 4], "splicing", $gene, '.', '.',
            @line[7, 10 .. $#line]
        );
    }

    return $tag->('unknow2');
}

1;