pipeline/codes/filter_snpindel.pl

#!/usr/bin/env perl

use strict;
#use warnings;
use List::Util qw(sum);

die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;

my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public";
print "$pipeline 过滤使用public路径：$public_path\n";

my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";

print "$pipeline 过滤使用database路径：$database_path\n";

# open OUT, ">$**.hg19_multianno_filter.txt";
open OUT, "> $out";
# open OUT1, ">**.hg19_multianno_tag.txt";
open TAG_OUT, ">$tag_out";

if (-z $input) {
    print "文件 $input 为空 \n";
    exit(0); # 正常退出脚本
}

open IN, "$input";
my $head = <IN>;
chomp($head);

my @columns = split("\t", $head);
my $new_head = join("\t", "Validated", "ClinicalSign", @columns[0 .. 6],
    "Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "Oncogenic", "Mutation_Effect", "genetag", "process");

if (!($pipeline eq 'somatic' || $pipeline eq 'tmb' || $pipeline eq 'hotspot' || $pipeline eq 'germline')) {
    die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot or tmb'";
}

print OUT "$new_head\n";
print TAG_OUT "TAG\t$head\n";

my %blacklist = blacklist();
my ($muts_ref, $hcs_ref, $mmr_ref, $hhr1_ref, $hhr2_ref) = info();

my @muts = @$muts_ref;
my @hcs = @$hcs_ref;
my @mmr = @$mmr_ref;
my @hhr1 = @$hhr1_ref;
my @hhr2 = @$hhr2_ref;

my %transcript = transcript();
my %oncogenic = get_oncogenic();

while (<IN>) {
    chomp;
    my @line = split(/\t/, $_);
    my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found";
    my $filters = split(";", $line[101]);
    my $gene = (split(";", $line[6]))[0];

    if ($pipeline eq 'germline') {
        $freq = (split(";", $line[-1]))[6];
    }

    my @reason;
    # 黑名单
    if (exists $blacklist{join("_", @line[0 .. 4])}) {
        push @reason, 'blacklist';
    }

    # 是否检测
    if ((@muts) and !(grep {$_ eq $gene} @muts)) {
        push @reason, 'notarget_gene';
    }

    # 人群频率
    if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) {
        push @reason, 'common_snp';
    }

    # ExonicFunc.refGene 过滤同义突变，未知突变
    if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
        push @reason, 'synonymous';
    }

    # CLNSIG benign
    if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
        push @reason, 'benign';
    }

    # cfdna 样本
    if ($sample_type eq 'c') {

        if ($line[101] ne 'PASS' and ($freq < 0.005 or ($freq >= 0.005 and $freq < 0.01 and $filters >= 2))) {
            push @reason, 'byfilter';
        }

        # 低频的 cosmic91 无记录的
        if ($freq < 0.01 and $line[11] eq '.') {
            push @reason, 'cfdna_lowfreq_cosmic';
        }

        # 低频的；cosmic91 只有1条以内的
        if ($line[11] =~ /OCCURENCE=(\S+)/) {
            my $cosmic = $1;
            $cosmic =~ s/\(\S+?\)//g;
            my @cosmic = split(",", $cosmic);
            $cosmic = sum @cosmic;
            if ($freq < 0.01 and $cosmic <= 1) {
                push @reason, 'cfdna_lowfreq_cosmic1';
            }
            if (($cosmic >= 10) and $pipeline eq 'tmb') {
                push @reason, 'cfdna_lowfreq_cosmic1';
            }
        }
    }
    else {
        # 不是pass 低频的； 或者 0.02  < 0.05 之间 个数大于2 血液 0.005 - 0.01
        if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) {
            push @reason, 'byfilter';
        }

    }

    my $protein;
    # met 基因 在特定区域
    if (($gene eq "MET") &&
        ($line[0] eq "chr7") &&
        (($line[1] > 116411872 && $line[1] <= 116411902) or ($line[2] > 116411872 && $line[2] <= 116411902) or
            ($line[1] > 116412043 && $line[1] <= 116412046) or ($line[2] > 116412043 && $line[2] <= 116412046))) {
        $line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx"));
        $line[8] = 'skipping';
        $protein = 'Exon 14 Skipping Mutation';

    }
    elsif ($line[9] eq '.') {
        # splicing 位点
        if ($line[5] =~ /splicing/) {
            my @hgvs = split(/;/, $line[7]);
            my $hgvs = $hgvs[0];
            my $transcript_gene;
            $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
            if (grep {/$transcript_gene/} @hgvs) {
                $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
            }
            $hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
            my $spl = $3;
            my $exon = $2;
            # splicing 位点 的前后 1-2bp 的
            if ($spl =~ /\d+\+[1|2]\D+/) {
                my $intron = $exon;
                $hgvs =~ s/exon(\d+)/exon$exon;intron$intron/;
                $line[9] = join(":", ($gene, $hgvs));
            }
            elsif ($spl =~ /\d+\-[1|2]\D+/) {
                my $intron = $exon - 1;
                $hgvs =~ s/exon(\d+)/intron$intron;exon$exon/;
                $line[9] = join(":", ($gene, $hgvs));
            }
            else {
                push @reason, 'not_need_spl';
            }
            $protein = 'Truncating Mutations';
        }
        else {
            push @reason, 'not_need_spl_inron';
        }

    }
    else {
        my @hgvs = split(/,/, $line[9]);
        my $hgvs = $hgvs[0];
        my $transcript_gene;
        $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
        if (grep {/$transcript_gene/} @hgvs) {
            $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
        }
        $line[9] = $hgvs;

        $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
        $protein = $3;
        if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
            $protein = 'Truncating Mutations';
        }

    }

    # hotspot 不过滤但是修改 hgvs
    if ($pipeline eq 'hotspot') {
        @reason = ();
    }

    # tmb 流程去掉 不过滤但是修改 hgvs
    if ($pipeline eq 'tmb') {
        @reason = grep(!/synonymous/, @reason);
        if (($freq < 0.05) and ($sample_type eq 't')) {
            push @reason, 'lowfreq_tissue_tmb';
        }
        if (($freq < 0.005) and ($sample_type eq 'c')) {
            push @reason, 'lowfreq_cfdna_tmb';
        }
    }

    if (@reason) {
        print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n";
        next;
    }

    my ($oncogenic_col, $mut_effect_col);
    my $get_key = "$gene\_$protein";
    if (exists $oncogenic{lc $get_key}) {
        my @get_values = split('&&', $oncogenic{lc $get_key});
        $oncogenic_col = $get_values[0];
        $mut_effect_col = $get_values[1];
    }
    else {
        $oncogenic_col = '.';
        $mut_effect_col = '.';
    }

    my $clisig;
    if ($line[16] =~ /Likely_pathogenic|drug/i) {
        $clisig = '2';
    }
    elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
        $clisig = '1';
    }
    else {
        $clisig = '3';
    }

    my @genetags;
    if (grep {$_ eq $gene} @hcs) {
        push @genetags, 'hcs';
    }
    if (grep {$_ eq $gene} @mmr) {
        push @genetags, 'mmr';
    }
    if (grep {$_ eq $gene} @hhr1) {
        push @genetags, 'hrr1';
    }
    if (grep {$_ eq $gene} @hhr2) {
        push @genetags, 'hrr2';
    }

    my $validated = '1';

    $line[6] = $gene;
    my $genetag = join(";", @genetags);
    my $new_line = join("\t", $validated, $clisig, @line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], $oncogenic_col, $mut_effect_col, $genetag, $pipeline);
    print OUT "$new_line\n";
    print TAG_OUT "PASS\t", join("\t", @line), "\n";

}

# black list
sub blacklist {
    open BKLT, "$public_path/blacklist.txt";
    my %bk;
    <BKLS>;
    while (<BKLT>) {
        chomp;
        my @line = split("\t");
        my $key = join("_", @line[0 .. 4]);
        $bk{$key} = 1;
    }
    return %bk;
}

sub info {

    open INFO, "$database_path/info.csv";
    # 读取并解析表头
    my $header = <INFO>;
    chomp($header);
    my @column_names = split(',', $header);
    while (<INFO>) {
        chomp;
        my @line = split(/,/, $_);
        # 将数据与表头对应
        my %record;
        @record{@column_names} = @line;
        if ($record{'project'} eq $project) {
            if ($record{'mutation'} ne "NA") {
                @muts = split(/\//, $record{'mutation'});
            }
            if ($record{'HCS'} ne "NA") {
                @hcs = split(/\//, $record{'HCS'});
            }
            if ($record{'MMR'} ne "NA") {
                @mmr = split(/\//, $record{'MMR'});
            }
            if ($record{'HRR_I'} ne "NA") {
                @hhr1 = split(/\//, $record{'HRR_I'});
            }
            if ($record{'HRR_II'} ne "NA") {
                @hhr2 = split(/\//, $record{'HRR_II'});
            }
        }
    }
    return \@muts, \@hcs, \@mmr, \@hhr1, \@hhr2
}

sub transcript {

    open TR, "$database_path/oncokbgene.txt";
    my %oncogene;
    while (<TR>) {
        chomp;
        my @line = split;
        $oncogene{$line[0]} = $line[2];
        $oncogene{$line[0]} =~ s/\.\d+//;
    }
    return %oncogene;
}

# oncokb snv_indel 临床意义定义
sub get_oncogenic {
    my %sig;
    open SNV_INDEL, "$database_path/snv_indel_mutation.csv";
    <SNV_INDEL>;
    while (<SNV_INDEL>) {
        chomp;
        my @line = split(",");
        my $key = join("_", @line[0, 1]);
        $sig{lc $key} = join("&&", @line[2, 3]);
    }
    return %sig;
}