pipeline/codes/target_therapy_snpindel.pl

311 lines
13 KiB
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/perl
use strict;
use warnings;
# Command line: <input table> <output prefix> <cancer_type>; all three are
# required. The trailing newline keeps Perl from appending "at ... line N"
# to the usage text.
die "usage: perl $0 input output cancer_type\n" unless @ARGV == 3;
my ($input, $output, $cancer_type) = @ARGV;

# Directory that holds the annotation tables. The DATABASE environment
# variable overrides the built-in default; an empty value is treated as
# unset so the table paths never silently become root-relative.
my $database_path = "/dataseq/jmdna/codes/reportbase";
if (defined $ENV{'DATABASE'} and length $ENV{'DATABASE'}) {
    $database_path = $ENV{'DATABASE'};
}
print "SnpIndel药物注释使用路径$database_path\n";
## Load the point-mutation table into %mut:
##   $mut{lc column 0}{lc column 1} = column 2
## The main loop looks entries up by lower-cased gene and protein change and
## appends the stored value to the output rows as "fun_change".
## NOTE(review): fields are split on bare commas, so quoted CSV fields that
## contain a comma are not supported -- confirm the table never has them.
my $mut_file = "$database_path/snv_indel_mutation.csv";
open(my $mut_fh, '<', $mut_file) or die "Cannot open $mut_file: $!\n";
my $mut_header = <$mut_fh>;    # header line, not used
my %mut;
while (my $row = <$mut_fh>) {
    # Strip the line ending so a value in the last column does not carry a
    # newline into the output rows.
    chomp $row;
    # LIMIT -1 keeps trailing empty fields, so an empty third column is still
    # stored as '' instead of being dropped by split.
    my @field = split(/,/, $row, -1);
    next if @field < 3;        # blank or truncated line: nothing to store
    $mut{lc $field[0]}{lc $field[1]} = $field[2];
}
close($mut_fh);
## Load the targeted-therapy table into %therapy:
##   $therapy{lc column 0}{lc column 1} = [ raw tab-separated rows, ... ]
## The header line is kept in $h1 because the positive-result header is built
## from its columns 0..9 and 14.
## Rows are dropped when column 9 is 'D' or when column 2 mentions
## Leukemia, Lymphoma or Myeloid (case-insensitive).
my $therapy_file = "$database_path/targetTherapy.txt";
open(my $therapy_fh, '<', $therapy_file)
    or die "Cannot open $therapy_file: $!\n";
my $h1 = <$therapy_fh>;
defined $h1 or die "$therapy_file is empty: a header line is required\n";
chomp $h1;
my %therapy;
while (my $row = <$therapy_fh>) {
    chomp $row;
    my @field = split(/\t/, $row);
    next if $field[9] eq 'D';
    next if $field[2] =~ /Leukemia|Lymphoma|Myeloid/i;
    # Store the whole row; it is re-split when a variant matches.
    push @{ $therapy{lc $field[0]}{lc $field[1]} }, $row;
}
close($therapy_fh);
## Drug-name translation table.
## Column 0 may list several aliases separated by '|'; every alias is stored
## lower-cased in %drug and maps to the translated name in column 1. Rows
## with an empty (false) column 1 are skipped.
my $drug_file = "$database_path/target_drug.txt";
open(my $drug_fh, '<', $drug_file) or die "Cannot open $drug_file: $!\n";
my %drug;
my $drug_header = <$drug_fh>;    # header line, not used
while (my $row = <$drug_fh>) {
    chomp $row;
    my @field = split(/\t/, $row);
    next unless $field[1];
    foreach my $alias (split(/\|/, $field[0])) {
        $drug{lc $alias} = $field[1];
    }
}
close($drug_fh);
## Disease translation table.
## Each row supplies an ID (column 0) and two name/translation pairs
## (columns 2/3 and 4/5):
##   %dis  : lower-cased name -> translation
##   %dis2 : ID -> list of lower-cased names carried by that ID
##   @id   : every ID in file order
## The ID hierarchy is matched in two directions further down: upward
## (prefixes of the requested ID) and downward (IDs that start with it).
my $dis_file = "$database_path/oncotree.cancertype.20230801.txt";
open(my $dis_fh, '<', $dis_file) or die "Cannot open $dis_file: $!\n";
my $dis_header = <$dis_fh>;    # header line, not used
my (%dis, @id, %dis2);
while (my $row = <$dis_fh>) {
    chomp $row;
    my @field = split(/\t/, $row);
    $dis{lc $field[2]} = $field[3];
    $dis{lc $field[4]} = $field[5];
    push @{ $dis2{$field[0]} }, lc $field[2], lc $field[4];
    push @id, $field[0];
}
close($dis_fh);
# Widen the disease-name list of the requested cancer type with the names of
# its relatives in the ID hierarchy:
#   * upward   - every prefix of the ID of length 2, 4, 6, ...
#                (assumes each hierarchy level adds two characters -- TODO confirm)
#   * downward - every known ID that starts with the requested ID
foreach my $ID ($cancer_type) {
    my @family;
    for (my $len = 2; $len <= length $ID; $len += 2) {
        push @family, substr($ID, 0, $len);
    }
    # \Q..\E: the ID is matched literally, never interpreted as a pattern.
    push @family, grep { /^\Q$ID\E/ } @id;

    $dis2{$ID} ||= [];
    foreach my $member (@family) {
        # A prefix need not be an ID of its own; dereferencing a missing
        # entry would abort the script under "strict refs".
        next unless exists $dis2{$member};
        # The ID's own names are already in the list.
        next if $member eq $ID;
        push @{ $dis2{$ID} }, @{ $dis2{$member} };
    }
    warn "No disease names found for cancer type '$ID'\n"
        unless @{ $dis2{$ID} };
}

# Remove duplicate names from every list, keeping first occurrences.
foreach my $key (keys %dis2) {
    my %seen;
    @{ $dis2{$key} } = grep { !$seen{$_}++ } @{ $dis2{$key} };
}
# Open the variant table and the three result files, failing loudly when any
# of them cannot be opened. The handle names IN/POS/NEG/VUS are used by the
# rest of the script.
open(IN,  '<', $input)            or die "Cannot open $input: $!\n";
open(POS, '>', "$output.pos.txt") or die "Cannot write $output.pos.txt: $!\n";
open(NEG, '>', "$output.neg.txt") or die "Cannot write $output.neg.txt: $!\n";
open(VUS, '>', "$output.vus.txt") or die "Cannot write $output.vus.txt: $!\n";

# Header of the input table; an empty input file yields an empty header.
my $h2 = <IN>;
$h2 = '' unless defined $h2;
chomp $h2;

# Positive results carry the input columns, the functional change, columns
# 0..9 and 14 of the therapy table and four extra annotation columns.
# Negative and VUS results only add the functional change.
my $h = $h2 . "\tfun_change\t" . join("\t", (split(/\t/, $h1))[0 .. 9, 14]) . "\tLabel\tDrugCn\tIndication\treport_need_note";
print POS "$h\n";
print NEG $h2 . "\tfun_change\n";
print VUS $h2 . "\tfun_change\n";

# Column names used to turn each input row into a name => value record.
my @column_names = split(/\t/, $h2);
# Result rows, collected in memory and written at the end of the script.
my (@pos, @neg, @vus);
# Return $value, or '' when it is undefined (missing column / short row).
sub _or_empty {
    my ($value) = @_;
    return defined $value ? $value : '';
}

# File a variant for which no therapy row was reported.
# It goes to @neg when CLNSIG says "benign" without any of
# sensitivity / pathogenic / uncertain / "." and cosmic91 is not ".";
# otherwise it goes to @vus. $fun_change is appended as the last column.
sub _file_without_therapy {
    my ($row, $fun_change, $record) = @_;
    my $clnsig = _or_empty($record->{'CLNSIG'});
    my $cosmic = _or_empty($record->{'cosmic91'});
    my $looks_benign = ($clnsig =~ /benign/i
            and $clnsig !~ /sensitivity|pathogenic|uncertain|\./i
            and $cosmic ne '.');
    if ($looks_benign) {
        push @neg, "$row\t$fun_change";
    }
    else {
        push @vus, "$row\t$fun_change";
    }
    return;
}

# Add one @pos row for every therapy entry that applies to this variant.
#   $row        - the raw input line
#   $fun_change - functional change taken from %mut
#   $entries    - array ref of raw therapy rows for one gene/alteration key
# Returns the number of rows added.
sub _file_therapy_entries {
    my ($row, $fun_change, $entries) = @_;
    my @cancer_names = $dis2{$cancer_type} ? @{ $dis2{$cancer_type} } : ();
    my $added = 0;
    foreach my $entry (@{$entries}) {
        my @field = split(/\t/, $entry);
        # Short rows: make every column that is read below defined.
        for my $i (0 .. 16) {
            $field[$i] = '' unless defined $field[$i];
        }
        my $disease   = lc $field[2];
        my $in_family = grep { $disease eq lc $_ } @cancer_names;

        my $label;
        if ($field[14] eq 'A' and $in_family) {
            $label = "适应症";      # indication
        }
        elsif ($field[14] eq 'A' and @cancer_names) {
            # Same outcome as the former "some known name differs" test:
            # once the disease is not in the list, that test is true exactly
            # when the list is non-empty.
            # NOTE(review): with an empty name list such rows are not
            # reported at all -- confirm that this is intended.
            $label = "非适应症";    # non-indication
        }
        elsif ($in_family) {
            $label = '.';
        }
        else {
            next;
        }
        push @pos, join("\t",
            $row, $fun_change, @field[0 .. 9, 14], $label,
            drug($field[3]), _or_empty($dis{$disease}), $field[16]);
        $added++;
    }
    return $added;
}

# Annotate every variant row of the input table and sort it into
# @pos (therapy evidence found), @neg (benign/neutral) or @vus (unknown).
while (my $row = <IN>) {
    chomp $row;
    my @value = split(/\t/, $row);
    # Map the row onto the header names.
    my %record;
    @record{@column_names} = @value;

    my $gene      = _or_empty($record{'Gene_refGene'});
    my $aa_change = _or_empty($record{'AAChange_refGene'});
    my $exonic    = _or_empty($record{'ExonicFunc_refGene'});
    my $func      = _or_empty($record{'Func_refGene'});

    # Derive the protein-level alteration and the exon-level mutation type.
    # Both stay '' when the row cannot be classified, so the lookups below
    # never operate on undefined values.
    my ($protein, $mut_type) = ('', '');
    if ($aa_change =~ /(\w+):(\w+):exon(\d+):c\.(\S+):p\.(\S+)$/) {
        # Copy the captures before any other match can reset them.
        my ($exon, $codon, $aa) = ($3, $4, $5);
        my $truncating = ($aa =~ /\d+X$|\d+\*$/
                or $exonic eq 'stopgain'
                or $exonic eq 'frameshift deletion'
                or $exonic eq 'frameshift insertion');
        $protein = $truncating ? 'Truncating Mutations' : $aa;
        # Duplications are reported as insertions.
        $mut_type = ($codon =~ /del/) ? "Exon $exon deletion"
                  : ($codon =~ /ins/) ? "Exon $exon insertion"
                  : ($codon =~ /dup/) ? "Exon $exon insertion"
                  :                     "Exon $exon mutation";
    }
    elsif ($exonic =~ /skipping/) {
        $protein = 'Exon 14 Skipping Mutation';
    }
    elsif ($func =~ /splicing/) {
        $protein = 'Truncating Mutations';
    }
    else {
        # Message: no valid protein change or mutation type was found.
        print "药物注释未匹配到正确的protein或者mut_type";
    }

    my $gene_key    = lc $gene;
    my $protein_key = lc $protein;

    # Alteration unknown to %mut: benign goes to @neg, everything else to @vus.
    if (not(exists $mut{$gene_key} and exists $mut{$gene_key}{$protein_key})) {
        _file_without_therapy($row, '.', \%record);
        next;
    }

    my $fun_change = _or_empty($mut{$gene_key}{$protein_key});
    if ($fun_change =~ /neutral/i) {
        push @neg, "$row\t$fun_change";
        next;
    }
    if ($fun_change =~ /Inconclusive/i) {
        push @vus, "$row\t$fun_change";
        next;
    }
    # Gene has no therapy rows at all.
    if (not exists $therapy{$gene_key}) {
        _file_without_therapy($row, $fun_change, \%record);
        next;
    }

    # Therapy keys tried for this variant, in order:
    #   1. the exact protein change
    #   2. the generic "mutation" entry
    #   3. the change without its last character (V600E -> V600)
    #   4. that stem followed by "X"            (V600E -> V600X)
    #   5. the exon-level mutation type
    my @alteration_keys = ($protein_key, 'mutation');
    if ($protein =~ /^(\w\d+)\w$/) {
        my $stem = $1;
        push @alteration_keys, lc $stem, lc($stem . 'X');
    }
    push @alteration_keys, lc $mut_type;

    my $matched = 0;
    foreach my $key (@alteration_keys) {
        next unless exists $therapy{$gene_key}{$key};
        $matched = 1
            if _file_therapy_entries($row, $fun_change, $therapy{$gene_key}{$key});
    }

    # No therapy row applied.
    _file_without_therapy($row, $fun_change, \%record) unless $matched;
}
close(IN);
# Translate a comma-separated drug list through the %drug lookup table.
#   $drugs - e.g. "DrugA,DrugB + DrugC"
# Every name is looked up lower-cased; names without an entry are returned
# unchanged. Items containing '+' are combinations: their parts (separated
# by whitespace-surrounded '+') are translated one by one and re-joined
# with " + ". Returns the translated items joined by ','.
sub drug {
    my ($drug_list) = @_;

    # Look one name up, falling back to the name itself.
    my $lookup = sub {
        my ($name) = @_;
        my $key = lc $name;
        return exists $drug{$key} ? $drug{$key} : $name;
    };

    my @translated;
    for my $item (split(/,/, $drug_list)) {
        if (index($item, '+') >= 0) {
            my @parts = map { $lookup->($_) } split(/\s+\+\s+/, $item);
            push @translated, join(' + ', @parts);
        }
        else {
            push @translated, $lookup->($item);
        }
    }
    return join(',', @translated);
}
# Write the collected rows, one per line. A file with no rows keeps only the
# header that was printed when it was opened.
if (@pos) {
    print POS join("\n", @pos) . "\n" or die "Cannot write positive results: $!\n";
}
if (@neg) {
    print NEG join("\n", @neg) . "\n" or die "Cannot write negative results: $!\n";
}
if (@vus) {
    print VUS join("\n", @vus) . "\n" or die "Cannot write VUS results: $!\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close(POS) or die "Cannot close positive result file: $!\n";
close(NEG) or die "Cannot close negative result file: $!\n";
close(VUS) or die "Cannot close VUS result file: $!\n";