pipeline/script/filter_snpindel.pl

219 lines
6.1 KiB
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env perl
use strict;
#use warnings;
use List::Util qw(sum);
die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;
# open OUT, ">$**.hg19_multianno_filter.txt";
open OUT, "> $out";
# open OUT1, ">**.hg19_multianno_tag.txt";
open TAG_OUT, ">$tag_out";
if (-z $input) {
print "文件 $input 为空 \n";
exit(0); # 正常退出脚本
}
open IN, "$input";
my $head = <IN>;
my @columns = split("\t", $head);
my $new_head = join("\t", @columns[0 .. 6], "Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "process");
my $head_key;
if ($pipeline eq 'somatic') {
$head_key = 'Validated';
}
elsif ($pipeline eq 'hotspot') {
$head_key = 'Validated';
}
elsif ($pipeline eq 'germline') {
$head_key = 'ClinicalSign';
}
else {
die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot'";
}
print OUT "$head_key\t$new_head\n";
print TAG_OUT "TAG\t$new_head\n";
my %blacklist = blacklist();
my @mut_gene_list = mut_gene();
my %transcript = transcript();
while (<IN>) {
chomp;
my @line = split(/\t/, $_);
my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found";
my $filters = split(";", $line[101]);
my $gene = (split(";", $line[6]))[0];
my @reason;
# 黑名单
if (exists $blacklist{join("_", @line[0 .. 4])}) {
push @reason, 'blacklist';
}
# 是否检测
if ((@mut_gene_list) and !(grep {$_ eq $gene} @mut_gene_list)) {
push @reason, 'notarget_gene';
}
# 人群频率
if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) {
push @reason, 'common_snp';
}
# ExonicFunc.refGene 过滤同义突变,未知突变
if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
push @reason, 'synonymous';
}
# CLNSIG benign
if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
push @reason, 'benign';
}
# 不是pass 低频的; 或者 0.02 < 0.05 之间 个数大于2
if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) {
push @reason, 'byfilter';
}
# cfdna 样本
if ($sample_type eq 'c') {
# 低频的 cosmic91 无记录的
if ($freq < 0.01 and $line[11] eq '.') {
push @reason, 'cfdna_lowfreq_cosmic';
}
# 低频的cosmic91 只有1条以内的
if ($line[11] =~ /OCCURENCE=(\S+)/) {
my $cosmic = $1;
$cosmic =~ s/\(\S+?\)//g;
my @cosmic = split(",", $cosmic);
$cosmic = sum @cosmic;
if ($freq < 0.01 and $cosmic <= 1) {
push @reason, 'cfdna_lowfreq_cosmic1';
}
}
}
if ($line[9] eq '.') {
# splicing 位点 或者 MET的 intron 位点
if ($line[5] =~ /splicing/ or ($line[5] =~ /intron/ and $gene eq "MET")) {
my @hgvs = split(/;/, $line[7]);
my $hgvs = $hgvs[0];
my $transcript = $transcript{$gene} if (exists $transcript{$gene});
if (grep {/$transcript/} @hgvs) {
$hgvs = (grep {/$transcript/} @hgvs)[0];
}
$hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
my $spl = $3;
# splicing 位点 的前后 1-2bp 的 或者 MET的基因
if ($spl =~ /\d+[\+|\-][1|2]\D+/) {
$line[9] = join(":", ($gene, $hgvs));
}
elsif ($gene eq "MET") {
$line[9] = join(":", ($gene, "exon14", "c.xxx"));
$line[8] = 'skipping'
}
else {
push @reason, 'not_need_spl';
}
}
else {
push @reason, 'not_need';
}
}
else {
my @hgvs = split(/,/, $line[9]);
my $hgvs = $hgvs[0];
my $transcript_gene;
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
if (grep {/$transcript_gene/} @hgvs) {
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
}
$line[9] = $hgvs;
}
# hotspot 不过滤但是修改 hgvs
if ($pipeline eq 'hotspot') {
@reason = ();
}
if (@reason) {
print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n";
next;
}
$line[6] = $gene;
my $validated;
if ($pipeline == 'somatic' || $pipeline == 'hotspot') {
$validated = '1';
}
elsif ($pipeline == 'germline') {
if ($line[16] =~ /Likely_pathogenic|drug/i) {
$validated = '2';
}
elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
$validated = '1';
}
else {
$validated = '3';
}
}
my $new_line = join("\t", @line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], $pipeline);
print OUT "$validated\t$new_line\n";
print TAG_OUT "PASS\t$new_line\n";
}
# black list
sub blacklist {
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
open BKLT, "$public_path/blacklist.txt";
my %bk;
<BKLS>;
while (<BKLT>) {
chomp;
my @line = split("\t");
my $key = join("_", @line[0 .. 4]);
$bk{$key} = 1;
}
return \%bk;
}
sub mut_gene {
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
open INFO, "$public_path/info.csv";
my @muts;
while (<INFO>) {
chomp;
my @line = split(/,/, $_);
if ($line[0] eq $project) {
if (($line[2] ne "NA") and ($line[9] ne "NA")) {
@muts = split(/\//, $line[2]);
last;
}
}
}
return @muts
}
sub transcript {
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase/";
open TR, "$database_path/oncokbgene.txt";
my %oncogene;
while (<TR>) {
chomp;
my @line = split;
$oncogene{$line[0]} = $line[2];
$oncogene{$line[0]} =~ s/\.\d+//;
}
return %oncogene;
}