pipeline/codes/filter_fusion.pl

214 lines
7.7 KiB
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/perl
use strict;
use warnings;
die "usage:perl $0 depth_file input out project" unless @ARGV == 4;
my ($depth_file, $input, $out, $project) = @ARGV;
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";
print "Fusion过滤使用public路径$public_path\n";
print "Fusion过滤使用database路径$database_path\n";
open IN, "$input";
open OUT1, "> $out";
open SD, "$public_path/gene_strand.txt";
my %strand;
while (<SD>) {
chomp;
my @line = split;
$strand{$line[1]} = $line[0];
}
my $fusion = info();
my @fusions = @$fusion;
while (<IN>) {
if (/^##/) {
next;
}
if (/^#CHROM/) {
chomp;
print OUT1 "$_\tCHROM2\tPOS2\tINFO2\tGENE1\tGENE2\tFUSION\tFREQ1\tFREQ2\tref_gene\n";
next;
}
chomp;
my @line = split(/\t/);
$line[7] =~ /SVTYPE=(.*?);/;
my $SVTYPE = $1;
if ($SVTYPE eq 'INV') {
$line[7] =~ /SVLEN=(.*?);/;
if ($1 > 1000) {
my $freq_inv = sprintf("%.2f", (split(/:/, $line[9]))[1] / (split(/:/, $line[9]))[7] * 100);
next if $freq_inv < 0.3;
my $pt_inv = $_;
my $gene1 = &site2gene($line[0], $line[1]);
$line[7] =~ /END=(.*?);/;
my $end = $1;
my $gene2 = &site2gene($line[0], $end);
# if (!((grep {$_ eq $gene1} @fusions) or (grep {$_ eq $gene2} @fusions))) {
# next;
# }
my $ref_gene;
if (grep {$_ eq $gene1} @fusions) {
$ref_gene = $gene1;
}
if (grep {$_ eq $gene2} @fusions) {
$ref_gene = $gene2;
}
if (!(defined $ref_gene)) {
next;
}
my $depth = &depth($line[0], $line[1], $line[0], $end);
my $freq_inv2;
if (defined $depth) {$freq_inv2 = sprintf("%.2f", (split(/:/, $line[9]))[1] / $depth * 100)}
else {$freq_inv2 = "-"};
print OUT1 "$pt_inv\t$line[0]\t$end\t-\t$gene1\t$gene2\t$gene1-$gene2\t$freq_inv\t$freq_inv2\t$ref_gene\n";
print OUT1 "$pt_inv\t$line[0]\t$end\t-\t$gene2\t$gene1\t$gene2-$gene1\t$freq_inv\t$freq_inv2\t$ref_gene\n";
next;
}
}
elsif ($SVTYPE eq 'BND') {
if ($line[2] =~ /(\d+)_1/) {
$line[7] =~ /EVENT=(.*?);\S+Func.refGene=(.*?);Gene.refGene=(.*?);/;
my ($event, $region1, $gene1) = ($1, $2, $3);
my $pt = "$_";
my $mate = <IN>;
chomp $mate;
my @line2 = split(/\t/, $mate);
$line2[7] =~ /EVENT=(.*?);\S+Func.refGene=(.*?);Gene.refGene=(.*?);/;
my ($event2, $region2, $gene2) = ($1, $2, $3);
$gene1 = (split /\\x3b/, $gene1)[0];
$gene2 = (split /\\x3b/, $gene2)[0];
# if (!(grep {$_ eq $gene1} @fusions or grep {$_ eq $gene2} @fusions)) {
# next;
# }
my $ref_gene;
if (grep {$_ eq $gene1} @fusions) {
$ref_gene = $gene1;
}
if (grep {$_ eq $gene2} @fusions) {
$ref_gene = $gene2;
}
if (!(defined $ref_gene)) {
next;
}
if ($event eq $event2 and $line2[2] =~ /_2/ and $gene1 ne $gene2 and $region1 !~ /upstream|downstream|intergenic|UTR/ and $region2 !~ /upstream|downstream|intergenic|UTR/) {
my $freq = sprintf("%.2f", (split(/:/, $line[9]))[1] / (split(/:/, $line[9]))[7] * 100);
next if $freq < 0.3;
my $strand1 = &gene2strand($gene1);
my $strand2 = &gene2strand($gene2);
my $depth = &depth($line[0], $line[1], $line2[0], $line2[1]);
my $freq_bnd;
if (defined $depth) {$freq_bnd = sprintf("%.2f", (split(/:/, $line[9]))[1] / $depth * 100)}
else {$freq_bnd = "-"};
if ($line[4] =~ /^N\[/) {
if ($strand1 eq '+' and $strand2 eq '+') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene1\t$gene2\t$gene1-$gene2\t$freq\t$freq_bnd\t$ref_gene\n";
}
elsif ($strand1 eq '-' and $strand2 eq '-') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene2\t$gene1\t$gene2-$gene1\t$freq\t$freq_bnd\t$ref_gene\n";
}
else {
print($gene1, '.....\n')
}
}
elsif ($line[4] =~ /^N\]/) {
if ($strand1 eq '+' and $strand2 eq '-') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene1\t$gene2\t$gene1-$gene2\t$freq\t$freq_bnd\t$ref_gene\n";
}
elsif ($strand1 eq '-' and $strand2 eq '+') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene2\t$gene1\t$gene2-$gene1\t$freq\t$freq_bnd\t$ref_gene\n";
}
}
elsif ($line[4] =~ /\]N$/) {
if ($strand1 eq '+' and $strand2 eq '+') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene2\t$gene1\t$gene2-$gene1\t$freq\t$freq_bnd\t$ref_gene\n";
}
elsif ($strand1 eq '-' and $strand2 eq '-') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene1\t$gene2\t$gene1-$gene2\t$freq\t$freq_bnd\t$ref_gene\n";
}
}
elsif ($line[4] =~ /\[N$/) {
if ($strand1 eq '+' and $strand2 eq '-') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene2\t$gene1\t$gene2-$gene1\t$freq\t$freq_bnd\t$ref_gene\n";
}
elsif ($strand1 eq '-' and $strand2 eq '+') {
print OUT1 "$pt\t$line2[0]\t$line2[1]\t$line2[7]\t$gene1\t$gene2\t$gene1-$gene2\t$freq\t$freq_bnd\t$ref_gene\n";
}
}
else {
print($gene1 'NNN.......\n')
}
}
}
}
}
sub site2gene {
my ($chr, $pos) = @_;
my %gene;
open REFGENE, "$public_path/hg19_refGene.txt";
my @gene;
while (<REFGENE>) {
my @line = split(/\t/, $_);
if ($chr eq $line[2] and $pos >= $line[4] and $pos <= $line[5]) {
$gene{$line[-4]}++;
push @gene, $line[-4] if $gene{$line[-4]} < 2;
}
}
return join("|", @gene);
}
sub depth {
my ($chr1, $pos1, $chr2, $pos2) = @_;
my $depth1 = `less $depth_file|grep -w $chr1|grep -w $pos1`;
my $depth2 = `less $depth_file|grep -w $chr2|grep -w $pos2`;
$depth1 = (split(/\t/, $depth1))[3] if defined $depth1;
$depth2 = (split(/\t/, $depth2))[3] if defined $depth2;
my $depth;
if ($depth1 and $depth2) {
$depth = ($depth1 + $depth2) / 2
}
elsif ($depth1 and !$depth2) {
$depth = $depth1;
}
elsif (!$depth1 and $depth2) {
$depth = $depth2;
}
return $depth;
}
sub gene2strand {
my $gene = shift @_;
return $strand{$gene};
}
sub info {
open INFO, "$database_path/info.csv";
# 读取并解析表头
my $header = <INFO>;
chomp($header);
my @column_names = split(',', $header);
my (@fusion);
while (<INFO>) {
chomp;
my @line = split(/,/, $_);
# 将数据与表头对应
my %record;
@record{@column_names} = @line;
if ($record{'project'} eq $project) {
if ($record{'fusion'} ne "NA") {
@fusion = split(/\//, $record{'fusion'});
}
}
}
return \@fusion
}