pipeline/codes/filter_snpindel.pl

399 lines
12 KiB
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env perl
use strict;
#use warnings;
use List::Util qw(sum);
die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public";
print "$pipeline 过滤使用public路径$public_path\n";
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";
print "$pipeline 过滤使用database路径$database_path\n";
# open OUT, ">$**.hg19_multianno_filter.txt";
open OUT, "> $out";
# open OUT1, ">**.hg19_multianno_tag.txt";
open TAG_OUT, ">$tag_out";
if (-z $input) {
print "文件 $input 为空 \n";
exit(0); # 正常退出脚本
}
open IN, "$input";
my $head = <IN>;
chomp($head);
my @columns = split("\t", $head);
my $new_head = join("\t", "Validated", "ClinicalSign", @columns[0 .. 6],
"Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "Oncogenic", "Mutation_Effect", "is_oncogene",
"is_tumor_suppressor_gene", "genetag", "process");
if (!($pipeline eq 'somatic' || $pipeline eq 'tmb' || $pipeline eq 'hotspot' || $pipeline eq 'germline')) {
die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot or tmb'";
}
print OUT "$new_head\n";
print TAG_OUT "TAG\t$head\n";
my %blacklist = blacklist();
my ($muts_ref, $hcs_ref, $mmr_ref, $hhr1_ref, $hhr2_ref, $promoter_ref) = info();
my @muts = @$muts_ref;
my @hcs = @$hcs_ref;
my @mmr = @$mmr_ref;
my @hhr1 = @$hhr1_ref;
my @hhr2 = @$hhr2_ref;
my @promoter = @$promoter_ref;
my %transcript = transcript();
my ($oncogenic, $is_oncogene) = get_oncogenic();
my %oncogenic = %$oncogenic;
my %is_oncogene = %$is_oncogene;
while (<IN>) {
chomp;
my @line = split(/\t/, $_);
my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found";
my $filters = split(";", $line[101]);
my $gene = (split(";", $line[6]))[0];
if ($pipeline eq 'germline') {
# GT:DP:VD:AD:AF:RD:ALD
# GT:DP:VD:ALD:RD:AD:AF:BIAS:PMEAN:PSTD:QUAL:QSTD:SBF:ODDRATIO:MQ:SN:HIAF:ADJAF:NM
# 信息提取
my @content_name = split(/:/, $line[103]);
my @content = split(/:/, $line[-1]);
# 将数据与表头对应
my %record_content;
@record_content{@content_name} = @content;
$freq = $record_content{'AF'};
}
my @reason;
# 黑名单
if (exists $blacklist{join("_", @line[0 .. 4])}) {
push @reason, 'blacklist';
}
# 是否检测
if ((@muts) and !(grep {$_ eq $gene} @muts)) {
push @reason, 'notarget_gene';
}
# 人群频率
if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) {
push @reason, 'common_snp';
}
# ExonicFunc.refGene 过滤同义突变,未知突变
if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
push @reason, 'synonymous';
}
# CLNSIG benign
if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
push @reason, 'benign';
}
# cfdna 样本
if ($sample_type eq 'c') {
if ($line[101] ne 'PASS' and ($freq < 0.005 or ($freq >= 0.005 and $freq < 0.01 and $filters >= 2))) {
push @reason, 'byfilter';
}
# 低频的 cosmic91 无记录的
if ($freq < 0.01 and $line[11] eq '.') {
push @reason, 'cfdna_lowfreq_cosmic';
}
# 低频的cosmic91 只有1条以内的
if ($line[11] =~ /OCCURENCE=(\S+)/) {
my $cosmic = $1;
$cosmic =~ s/\(\S+?\)//g;
my @cosmic = split(",", $cosmic);
$cosmic = sum @cosmic;
if ($freq < 0.01 and $cosmic <= 1) {
push @reason, 'cfdna_lowfreq_cosmic1';
}
if (($cosmic >= 10) and $pipeline eq 'tmb') {
push @reason, 'cfdna_lowfreq_cosmic1';
}
}
}
else {
# 不是pass 低频的; 或者 0.02 < 0.05 之间 个数大于2 血液 0.005 - 0.01
if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) {
push @reason, 'byfilter';
}
}
my $protein;
# met 基因 在特定区域
if (($gene eq "MET") &&
($line[0] eq "chr7") &&
(($line[1] > 116411872 && $line[1] <= 116411902) or ($line[2] > 116411872 && $line[2] <= 116411902) or
($line[1] > 116412043 && $line[1] <= 116412046) or ($line[2] > 116412043 && $line[2] <= 116412046))) {
$line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx"));
$line[8] = 'skipping';
$protein = 'Exon 14 Skipping Mutation';
}
elsif ($line[9] eq '.') {
# splicing 位点
if (($line[5] =~ /splicing/) or ($pipeline eq 'hotspot')) {
my @hgvs = split(/;/, $line[7]);
my $hgvs = $hgvs[0];
my $transcript_gene;
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
if (grep {/$transcript_gene/} @hgvs) {
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
}
$hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
my $spl = $3;
my $exon = $2;
# splicing 位点 的前后 1-2bp 的
if ($spl =~ /\d+\+[1|2]\D+/) {
my $intron = $exon;
$hgvs =~ s/exon(\d+)/exon$exon;intron$intron/;
$line[9] = join(":", ($gene, $hgvs));
}
elsif ($spl =~ /\d+\-[1|2]\D+/) {
my $intron = $exon - 1;
$hgvs =~ s/exon(\d+)/intron$intron;exon$exon/;
$line[9] = join(":", ($gene, $hgvs));
}
# 不是前面2种情况hotspot强制转换hgvs
elsif ($pipeline eq 'hotspot') {
print "$hgvs\n";
$line[9] = join(":", ($gene, $hgvs));
}
else {
push @reason, 'not_need_spl';
}
$protein = 'Truncating Mutations';
}
else {
push @reason, 'not_need_spl_inron';
}
}
else {
if ($line[8] eq 'intron') {
push @reason, 'not_need_spl_inron';
}
my @hgvs = split(/,/, $line[9]);
my $hgvs = $hgvs[0];
my $transcript_gene;
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
if (grep {/$transcript_gene/} @hgvs) {
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
}
$line[9] = $hgvs;
$hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
$protein = $3;
if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
$protein = 'Truncating Mutations';
}
}
# hotspot 不过滤但是修改 hgvs
if ($pipeline eq 'hotspot') {
@reason = ();
}
# tmb 流程去掉 不过滤但是修改 hgvs
if ($pipeline eq 'tmb') {
@reason = grep(!/synonymous/, @reason);
if (($freq < 0.05) and ($sample_type eq 't')) {
push @reason, 'lowfreq_tissue_tmb';
}
if (($freq < 0.005) and ($sample_type eq 'c')) {
push @reason, 'lowfreq_cfdna_tmb';
}
}
# TERT 突变特殊处理 不过滤
if ((grep {$_ eq $gene} @promoter) and ($pipeline eq 'somatic') and ($gene eq 'TERT')
and ($line[1] eq '1295228' and $line[4] eq 'A') or ($line[1] eq '1295250' and $line[4] eq 'A')) {
@reason = ();
if ($line[1] eq '1295228') {
$line[9] = 'TERT:NM_198253:/:c.-124C>T (C228T)';
}
else {
$line[9] = 'TERT:NM_198253:/:c.-146C>T (C250T)';
}
$line[8] = 'promoter';
}
if (@reason) {
print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n";
next;
}
my ($oncogenic_col, $mut_effect_col, $is_oncogene_gene, $is_tumor_suppressor_gene);
my $get_key = "$gene\_$protein";
if (exists $oncogenic{lc $get_key}) {
my @get_values = split('&&', $oncogenic{lc $get_key});
$oncogenic_col = $get_values[0];
$mut_effect_col = $get_values[1];
}
else {
$oncogenic_col = '.';
$mut_effect_col = '.';
}
if (exists $is_oncogene{lc $gene}) {
my @get_values = split('&&', $is_oncogene{lc $gene});
$is_oncogene_gene = $get_values[0];
$is_tumor_suppressor_gene = $get_values[1];
}
else {
$is_oncogene_gene = '.';
$is_tumor_suppressor_gene = '.';
}
my $clisig;
# somatic 依据oncogenic改变等级
if (($pipeline eq 'somatic') | ($pipeline eq 'hotspot')) {
if ($oncogenic_col =~ /Likely Oncogenic/i) {
$clisig = '2';
}
elsif ($oncogenic_col =~ /Oncogenic/i) {
$clisig = '1';
}
else {
$clisig = '3';
}
}
else {
if ($line[16] =~ /Likely_pathogenic|drug/i) {
$clisig = '2';
}
elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
$clisig = '1';
}
else {
$clisig = '3';
}
}
my @genetags;
if (grep {$_ eq $gene} @hcs) {
push @genetags, 'hcs';
}
if (grep {$_ eq $gene} @mmr) {
push @genetags, 'mmr';
}
if (grep {$_ eq $gene} @hhr1) {
push @genetags, 'hrr1';
}
if (grep {$_ eq $gene} @hhr2) {
push @genetags, 'hrr2';
}
if (grep {$_ eq $gene} @promoter) {
@genetags = ();
push @genetags, 'promoter';
}
my $validated = '1';
$line[6] = $gene;
my $genetag = join(";", @genetags);
my $new_line = join("\t", $validated, $clisig,
@line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102],
$oncogenic_col, $mut_effect_col, $is_oncogene_gene, $is_tumor_suppressor_gene, $genetag, $pipeline);
print OUT "$new_line\n";
print TAG_OUT "PASS\t", join("\t", @line), "\n";
}
# black list
sub blacklist {
open BKLT, "$public_path/blacklist.txt";
my %bk;
<BKLS>;
while (<BKLT>) {
chomp;
my @line = split("\t");
my $key = join("_", @line[0 .. 4]);
$bk{$key} = 1;
}
return %bk;
}
sub info {
open INFO, "$database_path/info.csv";
# 读取并解析表头
my $header = <INFO>;
chomp($header);
my @column_names = split(',', $header);
while (<INFO>) {
chomp;
my @line = split(/,/, $_);
# 将数据与表头对应
my %record;
@record{@column_names} = @line;
if ($record{'project'} eq $project) {
if ($record{'mutation'} ne "NA") {
@muts = split(/\//, $record{'mutation'});
}
if ($record{'HCS'} ne "NA") {
@hcs = split(/\//, $record{'HCS'});
}
if ($record{'MMR'} ne "NA") {
@mmr = split(/\//, $record{'MMR'});
}
if ($record{'HRR_I'} ne "NA") {
@hhr1 = split(/\//, $record{'HRR_I'});
}
if ($record{'HRR_II'} ne "NA") {
@hhr2 = split(/\//, $record{'HRR_II'});
}
if ($record{'promoter'} ne "NA") {
@promoter = split(/\//, $record{'promoter'});
}
}
}
return \@muts, \@hcs, \@mmr, \@hhr1, \@hhr2, \@promoter,
}
sub transcript {
open TR, "$database_path/oncokbgene.txt";
my %oncogene;
while (<TR>) {
chomp;
my @line = split;
$oncogene{$line[0]} = $line[2];
$oncogene{$line[0]} =~ s/\.\d+//;
}
return %oncogene;
}
# oncokb snv_indel 临床意义定义
sub get_oncogenic {
my %sig;
my %sig_gene;
open SNV_INDEL, "$database_path/snv_indel_mutation.csv";
<SNV_INDEL>;
while (<SNV_INDEL>) {
chomp;
$_ =~ s/\r//g;
my @line = split(",");
my $key = join("_", @line[0, 1]);
$sig{lc $key} = join("&&", @line[2, 3, 7, 8]);
$sig_gene{lc $line[0]} = join("&&", @line[7, 8]);
}
return (\%sig, \%sig_gene);
}