pipeline/script/target_therapy_snpindel.pl

#!/usr/bin/perl
use strict;
use warnings;

# Command line: <input table> <output prefix> <cancer type id>.
die "useage:perl $0 input output cancer_type" unless @ARGV == 3;

my ($input, $output, $cancer_type) = @ARGV;

# Directory holding the annotation tables: taken from the DATABASE environment
# variable when it is set, otherwise the built-in default location.
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";

print "SnpIndel药物注释使用路径：$database_path\n";

## Record the point-mutation table in %mut.
# %mut is keyed by the first two CSV columns and stores the third column:
#   $mut{<column 0>}{<column 1>} = <column 2>
# (later code looks it up as $mut{$gene}{$protein}).
my %mut;
{
    my $mut_file = "$database_path/snv_indel_mutation.csv";

    # Three-argument open with an explicit check: a missing or unreadable table
    # must stop the run instead of silently producing an empty %mut.
    open(my $mut_fh, '<', $mut_file)
        or die "Cannot open mutation table '$mut_file': $!\n";

    <$mut_fh>;    # discard the header line

    while (my $row = <$mut_fh>) {
        # Strip the line ending so that a value taken from the last column
        # does not carry a newline into the output rows.
        chomp $row;
        my @line = split(/,/, $row);
        $mut{$line[0]}{$line[1]} = $line[2];
    }
    close($mut_fh);
}

## Record the therapy table in %therapy.
# $h1 keeps the (chomped) header line of the table; it is reused later to build
# the header of the positive-result file.
# %therapy maps <column 0> -> <column 1> -> list of complete raw rows
# (later code looks it up as $therapy{$gene}{<alteration key>}).
my $h1;
my %therapy;
{
    my $therapy_file = "$database_path/targetTherapy.txt";

    # Three-argument open with an explicit check: without the table no variant
    # could ever be matched to a therapy, so failing loudly is the safe choice.
    open(my $therapy_fh, '<', $therapy_file)
        or die "Cannot open therapy table '$therapy_file': $!\n";

    $h1 = <$therapy_fh>;
    defined $h1
        or die "Therapy table '$therapy_file' is empty (no header line)\n";
    chomp $h1;

    while (my $row = <$therapy_fh>) {
        chomp $row;
        my @line = split("\t", $row);

        # Keep a row unless column 9 is exactly 'D' or column 2 mentions
        # leukemia / lymphoma / myeloid (case-insensitive).
        # NOTE(review): the meaning of the 'D' code is not visible here --
        # confirm against the table's documentation.
        push @{$therapy{$line[0]}{$line[1]}}, $row
            if ($line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
    }
    close($therapy_fh);
}

## Drug translation table.
# %drug maps a lower-cased drug name to its translation (column 1).  Column 0
# may list several alternative names separated by '|'; each of them gets the
# same translation.
my %drug;
{
    my $drug_file = "$database_path/target_drug.txt";

    # Three-argument open with an explicit check instead of an unchecked
    # two-argument open on a bareword handle.
    open(my $drug_fh, '<', $drug_file)
        or die "Cannot open drug translation table '$drug_file': $!\n";

    <$drug_fh>;    # discard the header line

    while (my $row = <$drug_fh>) {
        chomp $row;
        my @line = split(/\t/, $row);

        # Rows without a (true) translation are ignored.
        next unless $line[1];

        foreach my $alias (split(/\|/, $line[0])) {
            $drug{lc $alias} = $line[1];
        }
    }
    close($drug_fh);
}

## Disease translation table.
# Two parts: match upwards (ancestors) and match downwards (descendants).
#   %dis  : lower-cased disease name -> translation
#           (column 2 -> column 3 and column 4 -> column 5)
#   %dis2 : id (column 0) -> list of lower-cased disease names (columns 2 and 4)
#   @id   : every id seen, in file order
my (%dis, @id, %dis2);
{
    my $dis_file = "$database_path/oncotree.cancertype.20230801.txt";

    # Three-argument open with an explicit check instead of an unchecked
    # two-argument open on a bareword handle.
    open(my $dis_fh, '<', $dis_file)
        or die "Cannot open cancer type table '$dis_file': $!\n";

    <$dis_fh>;    # discard the header line

    while (my $row = <$dis_fh>) {
        chomp $row;
        my @line = split(/\t/, $row);
        $dis{lc $line[2]} = $line[3];
        $dis{lc $line[4]} = $line[5];
        push @{$dis2{$line[0]}}, lc $line[2], lc $line[4];
        push @id, $line[0];
    }
    close($dis_fh);
}

# Extend the disease-name list of the requested cancer type with the names of
# its "family".
foreach my $ID ($cancer_type) {
    my @family;

    # Upwards: every prefix of the id with an even length (2, 4, ...).
    # NOTE(review): presumably ids encode the hierarchy with two characters per
    # level -- confirm against the table.
    for (my $len = 2; $len <= length($ID); $len += 2) {
        push @family, substr($ID, 0, $len);
    }

    # Downwards: every known id that starts with the requested id.  \Q...\E
    # makes the id match literally even if it contains regex metacharacters.
    push @family, grep { /^\Q$ID\E/ } @id;

    # Make sure the entry exists even when nothing can be merged into it.
    $dis2{$ID} ||= [];

    foreach my $t (@family) {
        # A prefix that is not an id of the table has nothing to contribute;
        # dereferencing it would die under "use strict".
        next unless exists $dis2{$t};
        push @{$dis2{$ID}}, @{$dis2{$t}};
    }
}

# Remove duplicate names from every list, keeping the first occurrence.
foreach my $key (keys(%dis2)) {
    my %seen;
    @{$dis2{$key}} = grep { !$seen{$_}++ } @{$dis2{$key}};
}

# Open the input table and the three result files (<output>.pos.txt,
# <output>.neg.txt, <output>.vus.txt).
# Three-argument open keeps characters such as '>' or '|' in a user-supplied
# path from being interpreted as an open mode, and every open is checked so the
# script cannot run on with an unusable handle.  The handle names IN, POS, NEG
# and VUS are kept because the rest of the script reads and writes through them.
open(IN, '<', $input)
    or die "Cannot open input file '$input': $!\n";
open(POS, '>', "$output.pos.txt")
    or die "Cannot create '$output.pos.txt': $!\n";
open(NEG, '>', "$output.neg.txt")
    or die "Cannot create '$output.neg.txt': $!\n";
open(VUS, '>', "$output.vus.txt")
    or die "Cannot create '$output.vus.txt': $!\n";

# Header line of the input; an entirely empty input is treated as an empty header.
my $h2 = <IN>;
$h2 = '' unless defined $h2;
chomp $h2;

# Header of the positive file: input header, the function-change column,
# columns 0-9 and 14 of the therapy table header, then the three columns
# added by this script.
my $h = join("\t",
    $h2,
    "fun_change",
    (split("\t", $h1))[0 .. 9, 14],
    "Label", "DrugCn", "Indication");
print POS "$h\n";

# The negative and VUS files only get the function-change column appended.
print NEG $h2 . "\tfun_change\n";
print VUS $h2 . "\tfun_change\n";

# Column names of the input, used to address the fields of each row by name.
my @column_names = split('\t', $h2);

# Result rows, collected while the input is read and written out at the end.
my (@pos, @neg, @vus);
# Lower-cased disease names that count as matching the requested cancer type
# (the list stored in %dis2 for $cancer_type).  Built once as a hash because it
# does not change while the input is read.
my %on_label_disease;
$on_label_disease{lc $_} = 1 for @{ $dis2{$cancer_type} || [] };

# Classify every variant row of the input:
#   @pos - a therapy entry was matched (one output row per matched entry)
#   @neg - the mutation is recorded as neutral, or the record looks benign
#   @vus - everything else
while (my $row = <IN>) {
    chomp $row;

    # Map the row's fields onto the column names of the header.
    my %record;
    @record{@column_names} = split("\t", $row);

    my $gene = $record{'Gene_refGene'};
    my ($protein, $mut_type);
    if ($record{'AAChange_refGene'} =~ /(\w+):(\w+):exon(\d+):c\.(\S+):p\.(\S+)$/) {
        # Copy the captures right away so that later matches cannot clobber them.
        my ($exon, $codon, $aa_change) = ($3, $4, $5);
        my $exonic_func = $record{'ExonicFunc_refGene'};

        if ($aa_change =~ /\d+X$|\d+\*$/
            or $exonic_func eq 'stopgain'
            or $exonic_func eq 'frameshift deletion'
            or $exonic_func eq 'frameshift insertion') {
            $protein = 'Truncating Mutations';
        }
        else {
            $protein = $aa_change;
        }

        # Exon-level key; duplications are looked up as insertions.
        $mut_type = ($codon =~ /del/) ? "Exon $exon deletion"
                  : ($codon =~ /ins/) ? "Exon $exon insertion"
                  : ($codon =~ /dup/) ? "Exon $exon insertion"
                  :                     "Exon $exon mutation";
    }
    elsif ($record{'AAChange_refGene'} =~ /splicing/) {
        $protein  = 'Truncating Mutations';
        $mut_type = '';
    }
    elsif ($record{'ExonicFunc_refGene'} =~ /skipping/) {
        $protein  = 'Exon 14 skipping Mutations';
        $mut_type = '';
    }
    else {
        # Unrecognised annotation: echo the row to STDOUT; it is still
        # classified below (with no protein / exon key).
        print $row;
    }

    ## A mutation missing from %mut goes to @vus (or @neg when the record looks
    ## benign); a mutation recorded as neutral goes to @neg; a gene missing from
    ## %therapy goes to @vus (or @neg when the record looks benign).
    my $known      = exists $mut{$gene}{$protein};
    my $fun_change = $known ? $mut{$gene}{$protein} : '.';
    my $annotated  = "$row\t$fun_change";

    if ($known and $fun_change =~ /neutral/i) {
        push @neg, $annotated;
        next;
    }
    if ($known and $fun_change =~ /Inconclusive/i) {
        push @vus, $annotated;
        next;
    }

    my $hits = 0;
    if ($known and exists $therapy{$gene}) {
        # Therapy keys are tried in this order:
        #   1. the protein change itself (p. notation)
        #   2. the generic 'Mutation' entry
        #   3. the protein change without its last character (V600E -> V600)
        #   4. the same with "X" appended (V600E -> V600X)
        #   5. the exon-level key
        my @lookup_keys = ($protein, 'Mutation');
        if ($protein =~ /^(\w\d+)\w$/) {
            push @lookup_keys, $1, $1 . "X";
        }
        push @lookup_keys, $mut_type;

        foreach my $key (@lookup_keys) {
            next unless exists $therapy{$gene}{$key};
            $hits += _collect_therapy_hits($annotated, $therapy{$gene}{$key});
        }
    }

    # Nothing matched: fall back to the benign / VUS decision.
    if (not $hits) {
        if (_looks_benign(\%record)) {
            push @neg, $annotated;
        }
        else {
            push @vus, $annotated;
        }
    }
}

# Returns true when the record's CLNSIG mentions "benign" without also
# mentioning sensitivity / pathogenic / uncertain or containing a '.', and its
# cosmic91 field is not '.'.
sub _looks_benign {
    my ($record) = @_;
    return ($record->{'CLNSIG'} =~ /benign/i
        and $record->{'CLNSIG'} !~ /sensitivity|pathogenic|uncertain|\./i
        and $record->{'cosmic91'} ne '.');
}

# Appends one row to @pos for every reportable therapy entry and returns the
# number of rows added.
#   $annotated - input row with the function-change column already appended
#   $entries   - array ref of raw therapy table rows
# An entry is reported when its column 14 is 'A' (label "适应症" = indication
# when its disease, column 2, matches the requested cancer type, otherwise
# "非适应症" = non-indication), or when its disease matches without an 'A'
# (label "."); all other entries are skipped.
# NOTE(review): 'A' presumably denotes the top evidence level -- confirm.
# The original non-indication test was grep { ... ne ... }, i.e. "some disease
# name differs", which silently dropped 'A' entries when the disease list of the
# cancer type was empty; it now means "no disease name matches".
sub _collect_therapy_hits {
    my ($annotated, $entries) = @_;
    my $added = 0;

    foreach my $entry (@{$entries}) {
        my @line     = split("\t", $entry);
        my $on_label = exists $on_label_disease{lc $line[2]};

        my $label;
        if ($line[14] eq 'A') {
            $label = $on_label ? "适应症" : "非适应症";
        }
        elsif ($on_label) {
            $label = ".";
        }
        else {
            next;
        }

        push @pos, join("\t",
            $annotated,
            @line[0 .. 9, 14],
            $label,
            drug($line[3]),
            $dis{lc $line[2]});
        $added++;
    }
    return $added;
}

# Translate a comma-separated list of drug names with the %drug table.
#
# Every comma-separated item is looked up by its lower-cased form in %drug;
# a name without an entry is kept unchanged.  An item containing '+' is treated
# as a combination: it is split on '+' surrounded by whitespace, each component
# is translated on its own, and the components are re-joined with " + ".
#
# Takes the comma-separated drug string and returns the translated items
# joined with ",".
sub drug {
    my ($drug_list) = @_;

    # Translation of a single name, falling back to the name itself.
    my $lookup = sub {
        my ($name) = @_;
        my $key = lc $name;
        return exists $drug{$key} ? $drug{$key} : $name;
    };

    my @translation;
    for my $item (split(/,/, $drug_list)) {
        if ($item =~ /\+/) {
            my @components = map { $lookup->($_) } split(/\s+\+\s+/, $item);
            push @translation, join(" + ", @components);
        }
        else {
            push @translation, $lookup->($item);
        }
    }
    return join(",", @translation);
}

# Write the collected rows, one per line, to the three result files.  Nothing
# is printed for an empty list, so such a file contains only its header line.
print POS join("\n", @pos) . "\n" if @pos;
print NEG join("\n", @neg) . "\n" if @neg;
print VUS join("\n", @vus) . "\n" if @vus;

# Output is buffered, so a write error (e.g. a full disk) may only surface when
# the handle is closed; check the close so that a truncated result file cannot
# go unnoticed.
close(POS) or die "Failed to write '$output.pos.txt': $!\n";
close(NEG) or die "Failed to write '$output.neg.txt': $!\n";
close(VUS) or die "Failed to write '$output.vus.txt': $!\n";