399 lines
12 KiB
Perl
Executable File
399 lines
12 KiB
Perl
Executable File
#!/usr/bin/env perl
|
||
|
||
use strict;
|
||
#use warnings;
|
||
use List::Util qw(sum);
|
||
|
||
die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
|
||
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;
|
||
|
||
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public";
|
||
print "$pipeline 过滤使用public路径:$public_path\n";
|
||
|
||
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";
|
||
|
||
print "$pipeline 过滤使用database路径:$database_path\n";
|
||
|
||
# open OUT, ">$**.hg19_multianno_filter.txt";
|
||
open OUT, "> $out";
|
||
# open OUT1, ">**.hg19_multianno_tag.txt";
|
||
open TAG_OUT, ">$tag_out";
|
||
|
||
if (-z $input) {
|
||
print "文件 $input 为空 \n";
|
||
exit(0); # 正常退出脚本
|
||
}
|
||
|
||
open IN, "$input";
|
||
my $head = <IN>;
|
||
chomp($head);
|
||
|
||
my @columns = split("\t", $head);
|
||
my $new_head = join("\t", "Validated", "ClinicalSign", @columns[0 .. 6],
|
||
"Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "Oncogenic", "Mutation_Effect", "is_oncogene",
|
||
"is_tumor_suppressor_gene", "genetag", "process");
|
||
|
||
if (!($pipeline eq 'somatic' || $pipeline eq 'tmb' || $pipeline eq 'hotspot' || $pipeline eq 'germline')) {
|
||
die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot or tmb'";
|
||
}
|
||
|
||
print OUT "$new_head\n";
|
||
print TAG_OUT "TAG\t$head\n";
|
||
|
||
my %blacklist = blacklist();
|
||
my ($muts_ref, $hcs_ref, $mmr_ref, $hhr1_ref, $hhr2_ref, $promoter_ref) = info();
|
||
|
||
my @muts = @$muts_ref;
|
||
my @hcs = @$hcs_ref;
|
||
my @mmr = @$mmr_ref;
|
||
my @hhr1 = @$hhr1_ref;
|
||
my @hhr2 = @$hhr2_ref;
|
||
my @promoter = @$promoter_ref;
|
||
|
||
my %transcript = transcript();
|
||
my ($oncogenic, $is_oncogene) = get_oncogenic();
|
||
my %oncogenic = %$oncogenic;
|
||
my %is_oncogene = %$is_oncogene;
|
||
|
||
while (<IN>) {
|
||
chomp;
|
||
my @line = split(/\t/, $_);
|
||
my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found";
|
||
my $filters = split(";", $line[101]);
|
||
my $gene = (split(";", $line[6]))[0];
|
||
|
||
if ($pipeline eq 'germline') {
|
||
# GT:DP:VD:AD:AF:RD:ALD
|
||
# GT:DP:VD:ALD:RD:AD:AF:BIAS:PMEAN:PSTD:QUAL:QSTD:SBF:ODDRATIO:MQ:SN:HIAF:ADJAF:NM
|
||
# 信息提取
|
||
my @content_name = split(/:/, $line[103]);
|
||
my @content = split(/:/, $line[-1]);
|
||
# 将数据与表头对应
|
||
my %record_content;
|
||
@record_content{@content_name} = @content;
|
||
$freq = $record_content{'AF'};
|
||
}
|
||
|
||
my @reason;
|
||
# 黑名单
|
||
if (exists $blacklist{join("_", @line[0 .. 4])}) {
|
||
push @reason, 'blacklist';
|
||
}
|
||
|
||
# 是否检测
|
||
if ((@muts) and !(grep {$_ eq $gene} @muts)) {
|
||
push @reason, 'notarget_gene';
|
||
}
|
||
|
||
# 人群频率
|
||
if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) {
|
||
push @reason, 'common_snp';
|
||
}
|
||
|
||
# ExonicFunc.refGene 过滤同义突变,未知突变
|
||
if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
|
||
push @reason, 'synonymous';
|
||
}
|
||
|
||
# CLNSIG benign
|
||
if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
|
||
push @reason, 'benign';
|
||
}
|
||
|
||
# cfdna 样本
|
||
if ($sample_type eq 'c') {
|
||
|
||
if ($line[101] ne 'PASS' and ($freq < 0.005 or ($freq >= 0.005 and $freq < 0.01 and $filters >= 2))) {
|
||
push @reason, 'byfilter';
|
||
}
|
||
|
||
# 低频的 cosmic91 无记录的
|
||
if ($freq < 0.01 and $line[11] eq '.') {
|
||
push @reason, 'cfdna_lowfreq_cosmic';
|
||
}
|
||
|
||
# 低频的;cosmic91 只有1条以内的
|
||
if ($line[11] =~ /OCCURENCE=(\S+)/) {
|
||
my $cosmic = $1;
|
||
$cosmic =~ s/\(\S+?\)//g;
|
||
my @cosmic = split(",", $cosmic);
|
||
$cosmic = sum @cosmic;
|
||
if ($freq < 0.01 and $cosmic <= 1) {
|
||
push @reason, 'cfdna_lowfreq_cosmic1';
|
||
}
|
||
if (($cosmic >= 10) and $pipeline eq 'tmb') {
|
||
push @reason, 'cfdna_lowfreq_cosmic1';
|
||
}
|
||
}
|
||
}
|
||
else {
|
||
# 不是pass 低频的; 或者 0.02 < 0.05 之间 个数大于2 血液 0.005 - 0.01
|
||
if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) {
|
||
push @reason, 'byfilter';
|
||
}
|
||
|
||
}
|
||
|
||
my $protein;
|
||
# met 基因 在特定区域
|
||
if (($gene eq "MET") &&
|
||
($line[0] eq "chr7") &&
|
||
(($line[1] > 116411872 && $line[1] <= 116411902) or ($line[2] > 116411872 && $line[2] <= 116411902) or
|
||
($line[1] > 116412043 && $line[1] <= 116412046) or ($line[2] > 116412043 && $line[2] <= 116412046))) {
|
||
$line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx"));
|
||
$line[8] = 'skipping';
|
||
$protein = 'Exon 14 Skipping Mutation';
|
||
|
||
}
|
||
elsif ($line[9] eq '.') {
|
||
# splicing 位点
|
||
if (($line[5] =~ /splicing/) or ($pipeline eq 'hotspot')) {
|
||
my @hgvs = split(/;/, $line[7]);
|
||
my $hgvs = $hgvs[0];
|
||
my $transcript_gene;
|
||
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
|
||
if (grep {/$transcript_gene/} @hgvs) {
|
||
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
|
||
}
|
||
$hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
|
||
my $spl = $3;
|
||
my $exon = $2;
|
||
# splicing 位点 的前后 1-2bp 的
|
||
if ($spl =~ /\d+\+[1|2]\D+/) {
|
||
my $intron = $exon;
|
||
$hgvs =~ s/exon(\d+)/exon$exon;intron$intron/;
|
||
$line[9] = join(":", ($gene, $hgvs));
|
||
}
|
||
elsif ($spl =~ /\d+\-[1|2]\D+/) {
|
||
my $intron = $exon - 1;
|
||
$hgvs =~ s/exon(\d+)/intron$intron;exon$exon/;
|
||
$line[9] = join(":", ($gene, $hgvs));
|
||
}
|
||
# 不是前面2种情况,hotspot强制转换hgvs
|
||
elsif ($pipeline eq 'hotspot') {
|
||
print "$hgvs\n";
|
||
$line[9] = join(":", ($gene, $hgvs));
|
||
}
|
||
else {
|
||
push @reason, 'not_need_spl';
|
||
}
|
||
$protein = 'Truncating Mutations';
|
||
}
|
||
else {
|
||
push @reason, 'not_need_spl_inron';
|
||
}
|
||
|
||
}
|
||
else {
|
||
if ($line[8] eq 'intron') {
|
||
push @reason, 'not_need_spl_inron';
|
||
}
|
||
my @hgvs = split(/,/, $line[9]);
|
||
my $hgvs = $hgvs[0];
|
||
my $transcript_gene;
|
||
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
|
||
if (grep {/$transcript_gene/} @hgvs) {
|
||
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
|
||
}
|
||
$line[9] = $hgvs;
|
||
|
||
$hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
|
||
$protein = $3;
|
||
if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
|
||
$protein = 'Truncating Mutations';
|
||
}
|
||
|
||
}
|
||
|
||
# hotspot 不过滤但是修改 hgvs
|
||
if ($pipeline eq 'hotspot') {
|
||
@reason = ();
|
||
}
|
||
|
||
# tmb 流程去掉 不过滤但是修改 hgvs
|
||
if ($pipeline eq 'tmb') {
|
||
@reason = grep(!/synonymous/, @reason);
|
||
if (($freq < 0.05) and ($sample_type eq 't')) {
|
||
push @reason, 'lowfreq_tissue_tmb';
|
||
}
|
||
if (($freq < 0.005) and ($sample_type eq 'c')) {
|
||
push @reason, 'lowfreq_cfdna_tmb';
|
||
}
|
||
}
|
||
|
||
# TERT 突变特殊处理 不过滤
|
||
if ((grep {$_ eq $gene} @promoter) and ($pipeline eq 'somatic') and ($gene eq 'TERT')
|
||
and ($line[1] eq '1295228' and $line[4] eq 'A') or ($line[1] eq '1295250' and $line[4] eq 'A')) {
|
||
@reason = ();
|
||
if ($line[1] eq '1295228') {
|
||
$line[9] = 'TERT:NM_198253:/:c.-124C>T (C228T)';
|
||
}
|
||
else {
|
||
$line[9] = 'TERT:NM_198253:/:c.-146C>T (C250T)';
|
||
}
|
||
$line[8] = 'promoter';
|
||
}
|
||
|
||
if (@reason) {
|
||
print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n";
|
||
next;
|
||
}
|
||
|
||
my ($oncogenic_col, $mut_effect_col, $is_oncogene_gene, $is_tumor_suppressor_gene);
|
||
my $get_key = "$gene\_$protein";
|
||
if (exists $oncogenic{lc $get_key}) {
|
||
my @get_values = split('&&', $oncogenic{lc $get_key});
|
||
$oncogenic_col = $get_values[0];
|
||
$mut_effect_col = $get_values[1];
|
||
|
||
}
|
||
else {
|
||
$oncogenic_col = '.';
|
||
$mut_effect_col = '.';
|
||
}
|
||
if (exists $is_oncogene{lc $gene}) {
|
||
my @get_values = split('&&', $is_oncogene{lc $gene});
|
||
$is_oncogene_gene = $get_values[0];
|
||
$is_tumor_suppressor_gene = $get_values[1];
|
||
}
|
||
else {
|
||
$is_oncogene_gene = '.';
|
||
$is_tumor_suppressor_gene = '.';
|
||
}
|
||
|
||
my $clisig;
|
||
# somatic 依据oncogenic改变等级
|
||
if (($pipeline eq 'somatic') | ($pipeline eq 'hotspot')) {
|
||
if ($oncogenic_col =~ /Likely Oncogenic/i) {
|
||
$clisig = '2';
|
||
}
|
||
elsif ($oncogenic_col =~ /Oncogenic/i) {
|
||
$clisig = '1';
|
||
}
|
||
else {
|
||
$clisig = '3';
|
||
}
|
||
}
|
||
else {
|
||
if ($line[16] =~ /Likely_pathogenic|drug/i) {
|
||
$clisig = '2';
|
||
}
|
||
elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
|
||
$clisig = '1';
|
||
}
|
||
else {
|
||
$clisig = '3';
|
||
}
|
||
}
|
||
|
||
my @genetags;
|
||
if (grep {$_ eq $gene} @hcs) {
|
||
push @genetags, 'hcs';
|
||
}
|
||
if (grep {$_ eq $gene} @mmr) {
|
||
push @genetags, 'mmr';
|
||
}
|
||
if (grep {$_ eq $gene} @hhr1) {
|
||
push @genetags, 'hrr1';
|
||
}
|
||
if (grep {$_ eq $gene} @hhr2) {
|
||
push @genetags, 'hrr2';
|
||
}
|
||
if (grep {$_ eq $gene} @promoter) {
|
||
@genetags = ();
|
||
push @genetags, 'promoter';
|
||
}
|
||
|
||
my $validated = '1';
|
||
|
||
$line[6] = $gene;
|
||
my $genetag = join(";", @genetags);
|
||
my $new_line = join("\t", $validated, $clisig,
|
||
@line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102],
|
||
$oncogenic_col, $mut_effect_col, $is_oncogene_gene, $is_tumor_suppressor_gene, $genetag, $pipeline);
|
||
print OUT "$new_line\n";
|
||
print TAG_OUT "PASS\t", join("\t", @line), "\n";
|
||
|
||
}
|
||
|
||
# black list
|
||
sub blacklist {
|
||
open BKLT, "$public_path/blacklist.txt";
|
||
my %bk;
|
||
<BKLS>;
|
||
while (<BKLT>) {
|
||
chomp;
|
||
my @line = split("\t");
|
||
my $key = join("_", @line[0 .. 4]);
|
||
$bk{$key} = 1;
|
||
}
|
||
return %bk;
|
||
}
|
||
|
||
sub info {
|
||
|
||
open INFO, "$database_path/info.csv";
|
||
# 读取并解析表头
|
||
my $header = <INFO>;
|
||
chomp($header);
|
||
my @column_names = split(',', $header);
|
||
while (<INFO>) {
|
||
chomp;
|
||
my @line = split(/,/, $_);
|
||
# 将数据与表头对应
|
||
my %record;
|
||
@record{@column_names} = @line;
|
||
if ($record{'project'} eq $project) {
|
||
if ($record{'mutation'} ne "NA") {
|
||
@muts = split(/\//, $record{'mutation'});
|
||
}
|
||
if ($record{'HCS'} ne "NA") {
|
||
@hcs = split(/\//, $record{'HCS'});
|
||
}
|
||
if ($record{'MMR'} ne "NA") {
|
||
@mmr = split(/\//, $record{'MMR'});
|
||
}
|
||
if ($record{'HRR_I'} ne "NA") {
|
||
@hhr1 = split(/\//, $record{'HRR_I'});
|
||
}
|
||
if ($record{'HRR_II'} ne "NA") {
|
||
@hhr2 = split(/\//, $record{'HRR_II'});
|
||
}
|
||
if ($record{'promoter'} ne "NA") {
|
||
@promoter = split(/\//, $record{'promoter'});
|
||
}
|
||
}
|
||
}
|
||
return \@muts, \@hcs, \@mmr, \@hhr1, \@hhr2, \@promoter,
|
||
}
|
||
|
||
sub transcript {
|
||
|
||
open TR, "$database_path/oncokbgene.txt";
|
||
my %oncogene;
|
||
while (<TR>) {
|
||
chomp;
|
||
my @line = split;
|
||
$oncogene{$line[0]} = $line[2];
|
||
$oncogene{$line[0]} =~ s/\.\d+//;
|
||
}
|
||
return %oncogene;
|
||
}
|
||
|
||
# oncokb snv_indel 临床意义定义
|
||
sub get_oncogenic {
|
||
my %sig;
|
||
my %sig_gene;
|
||
open SNV_INDEL, "$database_path/snv_indel_mutation.csv";
|
||
<SNV_INDEL>;
|
||
while (<SNV_INDEL>) {
|
||
chomp;
|
||
$_ =~ s/\r//g;
|
||
my @line = split(",");
|
||
my $key = join("_", @line[0, 1]);
|
||
$sig{lc $key} = join("&&", @line[2, 3, 7, 8]);
|
||
$sig_gene{lc $line[0]} = join("&&", @line[7, 8]);
|
||
}
|
||
return (\%sig, \%sig_gene);
|
||
}
|