2023-10-18 15:59:11 +08:00
|
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
use strict;
|
|
|
|
|
|
#use warnings;
|
|
|
|
|
|
use List::Util qw(sum);
|
|
|
|
|
|
|
2023-11-01 10:09:29 +08:00
|
|
|
|
die "useage:perl $0 input out tag_out project sample_type pipeline" unless @ARGV == 6;
|
|
|
|
|
|
my ($input, $out, $tag_out, $project, $sample_type, $pipeline) = @ARGV;
|
2023-10-18 15:59:11 +08:00
|
|
|
|
|
2023-11-01 10:09:29 +08:00
|
|
|
|
# open OUT, ">$**.hg19_multianno_filter.txt";
|
|
|
|
|
|
open OUT, "> $out";
|
|
|
|
|
|
# open OUT1, ">**.hg19_multianno_tag.txt";
|
|
|
|
|
|
open TAG_OUT, ">$tag_out";
|
|
|
|
|
|
|
|
|
|
|
|
if (-z $input) {
|
|
|
|
|
|
print "文件 $input 为空 \n";
|
|
|
|
|
|
exit(0); # 正常退出脚本
|
|
|
|
|
|
}
|
2023-10-18 15:59:11 +08:00
|
|
|
|
|
|
|
|
|
|
open IN, "$input";
|
|
|
|
|
|
my $head = <IN>;
|
2023-11-01 10:09:29 +08:00
|
|
|
|
|
|
|
|
|
|
my @columns = split("\t", $head);
|
|
|
|
|
|
my $new_head = join("\t", @columns[0 .. 6], "Freq", @columns[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], "process");
|
|
|
|
|
|
|
|
|
|
|
|
my $head_key;
|
|
|
|
|
|
if ($pipeline eq 'somatic') {
|
|
|
|
|
|
$head_key = 'Validated';
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
elsif ($pipeline eq 'hotspot') {
|
|
|
|
|
|
$head_key = 'Validated';
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
elsif ($pipeline eq 'germline') {
|
|
|
|
|
|
$head_key = 'ClinicalSign';
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
die "useage: pipeline must be 'somatic' or 'germline' or 'hotspot'";
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-11-01 10:09:29 +08:00
|
|
|
|
print OUT "$head_key\t$new_head\n";
|
|
|
|
|
|
print TAG_OUT "TAG\t$new_head\n";
|
|
|
|
|
|
|
|
|
|
|
|
my %blacklist = blacklist();
|
|
|
|
|
|
my @mut_gene_list = mut_gene();
|
|
|
|
|
|
my %transcript = transcript();
|
|
|
|
|
|
|
2023-10-18 15:59:11 +08:00
|
|
|
|
while (<IN>) {
|
|
|
|
|
|
chomp;
|
|
|
|
|
|
my @line = split(/\t/, $_);
|
2023-11-01 10:09:29 +08:00
|
|
|
|
my $freq = ($line[102] =~ /AF=([\d.]+)/) ? $1 : die "AF value not found";
|
|
|
|
|
|
my $filters = split(";", $line[101]);
|
|
|
|
|
|
my $gene = (split(";", $line[6]))[0];
|
|
|
|
|
|
|
|
|
|
|
|
my @reason;
|
|
|
|
|
|
# 黑名单
|
|
|
|
|
|
if (exists $blacklist{join("_", @line[0 .. 4])}) {
|
|
|
|
|
|
push @reason, 'blacklist';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 是否检测
|
|
|
|
|
|
if ((@mut_gene_list) and !(grep {$_ eq $gene} @mut_gene_list)) {
|
|
|
|
|
|
push @reason, 'notarget_gene';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 人群频率
|
|
|
|
|
|
if (grep {$_ > 0.01} @line[17, 18, 19, 20, 23, 28, 32]) {
|
|
|
|
|
|
push @reason, 'common_snp';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# ExonicFunc.refGene 过滤同义突变,未知突变
|
|
|
|
|
|
if ($line[8] eq "synonymous SNV" or $line[8] eq "unknown") {
|
|
|
|
|
|
push @reason, 'synonymous';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# CLNSIG benign
|
|
|
|
|
|
if ($line[16] =~ /benign/i and $line[16] !~ /pathogenic|Affects|association|Conflicting|sensitivity|drug|other|risk|protective|Uncertain|not_provided|\./i) {
|
|
|
|
|
|
push @reason, 'benign';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 不是pass 低频的; 或者 0.02 < 0.05 之间 个数大于2
|
|
|
|
|
|
if ($line[101] ne 'PASS' and ($freq < 0.02 or ($freq >= 0.02 and $freq < 0.05 and $filters >= 2))) {
|
|
|
|
|
|
push @reason, 'byfilter';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# cfdna 样本
|
|
|
|
|
|
if ($sample_type eq 'c') {
|
|
|
|
|
|
# 低频的 cosmic91 无记录的
|
|
|
|
|
|
if ($freq < 0.01 and $line[11] eq '.') {
|
|
|
|
|
|
push @reason, 'cfdna_lowfreq_cosmic';
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 低频的;cosmic91 只有1条以内的
|
|
|
|
|
|
if ($line[11] =~ /OCCURENCE=(\S+)/) {
|
|
|
|
|
|
my $cosmic = $1;
|
|
|
|
|
|
$cosmic =~ s/\(\S+?\)//g;
|
|
|
|
|
|
my @cosmic = split(",", $cosmic);
|
|
|
|
|
|
$cosmic = sum @cosmic;
|
|
|
|
|
|
if ($freq < 0.01 and $cosmic <= 1) {
|
|
|
|
|
|
push @reason, 'cfdna_lowfreq_cosmic1';
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if ($line[9] eq '.') {
|
|
|
|
|
|
# splicing 位点 或者 MET的 intron 位点
|
|
|
|
|
|
if ($line[5] =~ /splicing/ or ($line[5] =~ /intron/ and $gene eq "MET")) {
|
|
|
|
|
|
my @hgvs = split(/;/, $line[7]);
|
|
|
|
|
|
my $hgvs = $hgvs[0];
|
|
|
|
|
|
my $transcript = $transcript{$gene} if (exists $transcript{$gene});
|
|
|
|
|
|
if (grep {/$transcript/} @hgvs) {
|
|
|
|
|
|
$hgvs = (grep {/$transcript/} @hgvs)[0];
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
$hgvs =~ /(\S+):exon(\d+):c\.(\S+)$/;
|
|
|
|
|
|
my $spl = $3;
|
|
|
|
|
|
# splicing 位点 的前后 1-2bp 的 或者 MET的基因
|
|
|
|
|
|
if ($spl =~ /\d+[\+|\-][1|2]\D+/) {
|
|
|
|
|
|
$line[9] = join(":", ($gene, $hgvs));
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
elsif ($gene eq "MET") {
|
|
|
|
|
|
$line[9] = join(":", ($gene, "exon14", "c.xxx"));
|
|
|
|
|
|
$line[8] = 'skipping'
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
else {
|
2023-11-01 10:09:29 +08:00
|
|
|
|
push @reason, 'not_need_spl';
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
2023-11-01 10:09:29 +08:00
|
|
|
|
push @reason, 'not_need';
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
else {
|
2023-11-01 10:09:29 +08:00
|
|
|
|
my @hgvs = split(/,/, $line[9]);
|
|
|
|
|
|
my $hgvs = $hgvs[0];
|
|
|
|
|
|
my $transcript_gene;
|
|
|
|
|
|
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
|
|
|
|
|
|
if (grep {/$transcript_gene/} @hgvs) {
|
|
|
|
|
|
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
|
|
|
|
|
|
}
|
|
|
|
|
|
$line[9] = $hgvs;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# hotspot 不过滤但是修改 hgvs
|
|
|
|
|
|
if ($pipeline eq 'hotspot') {
|
|
|
|
|
|
@reason = ();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (@reason) {
|
|
|
|
|
|
print TAG_OUT join(";", @reason), "\t", join("\t", @line), "\n";
|
|
|
|
|
|
next;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
$line[6] = $gene;
|
|
|
|
|
|
|
|
|
|
|
|
my $validated;
|
|
|
|
|
|
if ($pipeline == 'somatic' || $pipeline == 'hotspot') {
|
|
|
|
|
|
$validated = '1';
|
|
|
|
|
|
}
|
|
|
|
|
|
elsif ($pipeline == 'germline') {
|
|
|
|
|
|
if ($line[16] =~ /Likely_pathogenic|drug/i) {
|
|
|
|
|
|
$validated = '2';
|
|
|
|
|
|
}
|
|
|
|
|
|
elsif ($line[16] =~ /pathogenic/i and $line[16] !~ /Conflicting/i) {
|
|
|
|
|
|
$validated = '1';
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
$validated = '3';
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
my $new_line = join("\t", @line[0 .. 6], $freq, @line[7 .. 20, 23, 28, 32, 50, 56, 62, 101, 102], $pipeline);
|
|
|
|
|
|
print OUT "$validated\t$new_line\n";
|
|
|
|
|
|
print TAG_OUT "PASS\t$new_line\n";
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# black list
|
|
|
|
|
|
sub blacklist {
|
|
|
|
|
|
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
|
|
|
|
|
|
open BKLT, "$public_path/blacklist.txt";
|
|
|
|
|
|
my %bk;
|
|
|
|
|
|
<BKLS>;
|
|
|
|
|
|
while (<BKLT>) {
|
|
|
|
|
|
chomp;
|
|
|
|
|
|
my @line = split("\t");
|
|
|
|
|
|
my $key = join("_", @line[0 .. 4]);
|
|
|
|
|
|
$bk{$key} = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
return \%bk;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sub mut_gene {
|
|
|
|
|
|
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
|
|
|
|
|
|
open INFO, "$public_path/info.csv";
|
|
|
|
|
|
my @muts;
|
|
|
|
|
|
while (<INFO>) {
|
|
|
|
|
|
chomp;
|
|
|
|
|
|
my @line = split(/,/, $_);
|
|
|
|
|
|
if ($line[0] eq $project) {
|
|
|
|
|
|
if (($line[2] ne "NA") and ($line[9] ne "NA")) {
|
|
|
|
|
|
@muts = split(/\//, $line[2]);
|
|
|
|
|
|
last;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
return @muts
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sub transcript {
|
2023-11-01 10:09:29 +08:00
|
|
|
|
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase/";
|
|
|
|
|
|
open TR, "$database_path/oncokbgene.txt";
|
2023-10-18 15:59:11 +08:00
|
|
|
|
my %oncogene;
|
|
|
|
|
|
while (<TR>) {
|
|
|
|
|
|
chomp;
|
|
|
|
|
|
my @line = split;
|
|
|
|
|
|
$oncogene{$line[0]} = $line[2];
|
2023-11-01 10:09:29 +08:00
|
|
|
|
$oncogene{$line[0]} =~ s/\.\d+//;
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|
2023-11-01 10:09:29 +08:00
|
|
|
|
return %oncogene;
|
2023-10-18 15:59:11 +08:00
|
|
|
|
}
|