pipeline/script/filter_snpindel.pl

213 lines
7.7 KiB
Perl
Executable File

#!/usr/bin/env perl
use strict;
#use warnings;
use List::Util qw(sum);
# Command line: exactly six arguments are required.
#   input        - tab-separated annotated variant table; its first line is a header
#   project      - project name, matched against column 1 of info.txt
#   sample_type  - sample kind; the filter loop applies extra low-frequency rules for 'c'
#   somtic_out   - path of the filtered somatic table (written through OUT)
#   germline_out - path of the germline table (written through OUT2)
#   tag_out      - path of the per-row tag table (written through OUT3)
# Earlier revisions derived these paths from an output directory and a sample
# name; they are now passed in explicitly by the caller.
die "useage:perl $0 input project sample_type somtic_out germline_out tag_out " unless @ARGV == 6;
my ($input, $project, $sample_type, $somtic_out, $germline_out, $tag_out) = @ARGV;

# Every open is checked: an unreadable input or unwritable output used to be
# ignored silently, leaving empty or header-only result files behind.
open(IN, '<', $input) or die "cannot open input '$input': $!\n";
my $head = <IN>;

# Each output repeats the input header behind one extra leading column.
# The first two labels are Chinese: "credible" and "clinical significance".
open(OUT, '>', $somtic_out) or die "cannot write somatic output '$somtic_out': $!\n";
print OUT "可信\t$head";
open(OUT2, '>', $germline_out) or die "cannot write germline output '$germline_out': $!\n";
print OUT2 "临床意义\t$head";
open(OUT3, '>', $tag_out) or die "cannot write tag output '$tag_out': $!\n";
print OUT3 "TAG\t$head";

## Blacklist
# %bk is keyed by the first five tab-separated columns of each blacklist row,
# joined with "_"; blacklist() looks variants up with the same key layout.
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
my $blacklist_file = "$public_path/blacklist.txt";
open(BKLT, '<', $blacklist_file) or die "cannot open blacklist '$blacklist_file': $!\n";
my %bk;
while (my $entry = <BKLT>) {
    chomp $entry;
    my @line = split("\t", $entry);
    # The original tried to drop a header row with `<BKLS>`, a handle that is
    # never opened, so nothing was skipped. Skip the first line only when it
    # does not look like a data row, so a header-less file loses no entry.
    # NOTE(review): assumes data rows carry an integer in column 2 - confirm
    # against the real blacklist.txt.
    next if $. == 1 and !(defined $line[1] and $line[1] =~ /^\d+$/);
    $bk{ join("_", @line[0 .. 4]) } = 1;
}
close BKLT;
# Reports whether a variant key is present in the %bk blacklist table.
#   $pos - key built from a row's first five columns joined with "_"
# Returns the string "1" when the key is blacklisted, otherwise "".
sub blacklist {
    my ($variant_key) = @_;
    return exists $bk{$variant_key} ? "1" : "";
}
# Load the gene list for $project from info.txt.
# Rows are tab-separated: column 1 is the project name and column 3 is a
# "/"-separated gene list, or the literal "NA" when the project has none.
# If several rows match the project, the last row with a usable list wins.
my $info_file = "$public_path/info.txt";
# Fail loudly: with an unreadable info.txt @muts stays empty and the filter
# loop would tag every variant as nontarget_gene without any diagnostic.
open(INFO, '<', $info_file) or die "cannot open project table '$info_file': $!\n";
my @muts;
while (my $record = <INFO>) {
    chomp $record;
    my @line = split(/\t/, $record);
    next unless $line[0] eq $project;
    next if $line[2] eq "NA";
    @muts = split(/\//, $line[2]);
}
close INFO;
warn "no target genes found for project '$project' in $info_file; every variant will be tagged nontarget_gene\n"
    unless @muts;
# Main filter: classify every data row of the annotated table.
#
# Each row gets exactly one line in the tag file (OUT3), labelled with the
# first rule that applies, checked in this order:
#   synonymous           column 9 is "synonymous SNV" or "unknown"
#   common_snp           any of columns 18,19,20,21,24,29,33 is not < 0.01
#   benign               column 17 says benign and nothing else of interest
#   cfdna_lowfreq_cosmic sample_type 'c', frequency < 0.01, weak COSMIC support
#   blacklist            key of columns 1-5 is in the blacklist
#   nontarget_gene       gene is not in the project's gene list (@muts)
#   byfilter             exonic only: column 102 is not PASS and frequency is low
#   unknow1 / unknow2    splicing change not at +/-1 or +/-2 / neither exonic nor splicing
#   PASS                 row is also written to the somatic table (OUT) and,
#                        when frequency > 0.1, to the germline table (OUT2)
# (Column numbers above are 1-based; the code indexes @line from 0.)
# NOTE(review): the column meanings (population frequencies, clinical
# significance, COSMIC, caller filter) are inferred from the values tested
# here - confirm against the annotation header.

# Grade for the germline table, from the clinical-significance text:
# "2" for Likely_pathogenic or drug, "1" for pathogenic without Conflicting,
# otherwise "3".
my $clinical_grade = sub {
    my ($significance) = @_;
    return "2" if $significance =~ /Likely_pathogenic|drug/i;
    return "1" if $significance =~ /pathogenic/i and $significance !~ /Conflicting/i;
    return "3";
};

# Chooses the annotation to report for $gene: the entry naming the gene's
# preferred transcript (per transcript()) when there is one, else the first.
# The accession is quoted and must not be followed by another digit, so one
# id cannot match a longer id that merely starts with the same digits.
my $pick_annotation = sub {
    my ($gene, @candidates) = @_;
    my $transcript = transcript($gene) or return $candidates[0];
    my ($preferred) = grep { /\Q$transcript\E(?!\d)/ } @candidates;
    return defined $preferred ? $preferred : $candidates[0];
};

while (my $record = <IN>) {
    chomp $record;
    my @line = split(/\t/, $record);
    # Fifth ":"-separated field of the last column is used as the allele frequency.
    my $freq = (split(":", $line[-1]))[4];

    # Writes the row (in its current state) to the tag file under $_[0].
    my $tag = sub { print OUT3 "$_[0]\t", join("\t", @line), "\n" };
    # Writes a reshaped row to the somatic table, plus the germline table
    # when the frequency exceeds 0.1.
    my $report = sub {
        my @row = @_;
        print OUT "1\t", join("\t", @row), "\n";
        print OUT2 $clinical_grade->($line[16]), "\t", join("\t", @row), "\n" if $freq > 0.1;
    };

    if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
        $tag->("synonymous");
        next;
    }

    # Non-numeric values such as "." compare as 0 and therefore count as rare.
    if (grep { !($line[$_] < 0.01) } 17, 18, 19, 20, 23, 28, 32) {
        $tag->("common_snp");
        next;
    }

    if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
        $tag->("benign");
        next;
    }

    if ($sample_type eq 'c') {
        if (my ($occurrence) = $line[11] =~ /OCCURENCE=(\S+)/) {
            # "5(lung),2(breast)" -> 5 + 2: drop the parenthesised labels, sum the counts.
            $occurrence =~ s/\(\S+?\)//g;
            my $cosmic = sum(split(",", $occurrence));
            if ($freq < 0.01 and $cosmic <= 1) {
                $tag->("cfdna_lowfreq_cosmic");
                next;
            }
        }
        if ($freq < 0.01 and $line[11] eq '.') {
            $tag->("cfdna_lowfreq_cosmic");
            next;
        }
    }

    if (blacklist(join("_", @line[0 .. 4]))) {
        $tag->("blacklist");
        next;
    }

    if ($line[9] ne '.') {
        # Exonic change: column 10 holds comma-separated
        # gene:transcript:exonN:c.X:p.Y entries.
        my @hgvs = split(/,/, $line[9]);
        # Capture explicitly: on a failed match $gene is undef (and the row is
        # rejected below) instead of inheriting a stale $1.
        my ($gene) = $hgvs[0] =~ /(\S+):(\S+):exon(\d+):c\.(\S+):p\.(\S+)$/;
        if (!(@muts and grep { $gene eq $_ } @muts)) {
            $tag->("nontarget_gene");
            next;
        }
        if ($line[101] ne 'PASS') {
            # Number of ";"-separated filter flags raised for this call.
            my @flags = split(";", $line[101]);
            my $filter = @flags;
            if ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filter >= 2)) {
                $tag->("byfilter");
                next;
            }
        }
        $line[9] = $pick_annotation->($gene, @hgvs);
        $tag->("PASS");
        $report->(@line[0 .. 4], "exonic", $gene, @line[7 .. $#line]);
    }
    elsif ($line[5] =~ /splicing/) {
        my $gene = (split(";", $line[6]))[0];
        if (!(@muts and grep { $gene eq $_ } @muts)) {
            $tag->("nontarget_gene");
            next;
        }
        my $hgvs = $pick_annotation->($gene, split(/;/, $line[7]));
        my (undef, undef, $spl) = $hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
        # Keep only changes at intronic offset +1, -1, +2 or -2. The classes
        # used to be [\+|\-] and [1|2], which also accepted a literal "|".
        if ($spl =~ /\d+[+-][12]\D+/) {
            $line[7] = join(":", ($gene, $hgvs));
            $tag->("PASS");
            $report->(@line[0 .. 4], "splicing", $gene, '.', '.', @line[7, 10 .. $#line]);
        }
        else {
            $tag->("unknow1");
        }
    }
    else {
        $tag->("unknow2");
    }
}

# Buffered write errors (for example a full disk) only surface at close.
close IN;
close(OUT)  or die "error closing somatic output: $!\n";
close(OUT2) or die "error closing germline output: $!\n";
close(OUT3) or die "error closing tag output: $!\n";
# transcript($gene)
#
# Returns the transcript id listed for $gene in oncokbgene.txt with its
# first ".<digits>" version suffix removed, or "" (after printing a notice on
# STDOUT) when the gene is not listed.
#
# The table lives in $ENV{DATABASE}, or the built-in default directory when
# that variable is unset. Rows are whitespace-separated: column 1 is the gene
# symbol and column 3 the transcript id.
#
# The file is read once, on the first call, and kept in memory. It used to be
# reopened and reparsed for every variant that reached transcript selection.
# If it cannot be opened, a warning is issued once and every gene is treated
# as unlisted, which keeps the caller's fall-back to the first transcript.
{
    my %transcript_for;    # gene symbol => transcript id exactly as in the file
    my $table_loaded;      # true once a load has been attempted

    sub transcript {
        my $gene = shift @_;

        unless ($table_loaded) {
            $table_loaded = 1;
            my $data_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase/";
            my $table = "$data_path/oncokbgene.txt";
            if (open(my $fh, '<', $table)) {
                # A lexical line variable leaves the caller's $_ untouched.
                while (my $row = <$fh>) {
                    chomp $row;
                    my @line = split ' ', $row;
                    $transcript_for{$line[0]} = $line[2];
                }
                close $fh;
            }
            else {
                warn "cannot open transcript table '$table': $!\n";
            }
        }

        if (exists $transcript_for{$gene}) {
            # Strip the version on a copy so the cached value stays pristine.
            (my $accession = $transcript_for{$gene}) =~ s/\.\d+//;
            return $accession;
        }
        else {
            # Newline added so consecutive notices no longer run together.
            print "$gene has no NM id in oncokbgene.txt\n";
            return "";
        }
    }
}