pipeline/codes/filter_cnv.pl

62 lines
1.6 KiB
Perl
Raw Normal View History

2023-10-18 15:59:11 +08:00
#!/usr/bin/perl
# filter_cnv.pl - keep only the CNV segments that hit a project's target genes.
#
# Usage: perl filter_cnv.pl <input> <output> <project>
#
# <input> is a tab-separated table with a header row. Column index 3 holds a
# comma-separated gene list and column index 4 a numeric value (presumably a
# log2 ratio - confirm with the upstream caller) that is converted to a copy
# number as 2 ** (1 + value), rounded to one decimal place.
#
# For every segment whose rounded copy number is <= 1 or >= 3.5, one row is
# written to <output> per distinct gene that also appears in the gene list
# returned by info() for <project>. Each output row is the first five input
# columns, the gene, the copy number, then input columns 5..9.
use strict;
use warnings;

die "usage:perl $0 input output project\n" unless @ARGV == 3;

my ($input_path, $output_path) = @ARGV[0, 1];

# Three-argument open with lexical handles: the file names are taken
# literally (no mode/pipe injection) and a failure stops the run instead of
# silently producing an empty result.
open my $in_fh, '<', $input_path
    or die "Cannot open input file '$input_path': $!\n";
open my $out_fh, '>', $output_path
    or die "Cannot open output file '$output_path': $!\n";

# File-scoped on purpose: info() reads $project and $database_path.
my $project = $ARGV[2];

# my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public/";
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";

print "Cnv过滤使用database路径$database_path\n";

my $cnv = info();
my @cnv_list = @$cnv;

# Hash lookup replaces a linear scan of @cnv_list for every gene of every row.
my %is_target_gene = map { $_ => 1 } @cnv_list;

my $head = <$in_fh>;
if (!defined $head) {
    warn "Input file '$input_path' is empty; only a header row will be written\n";
    $head = '';
}
chomp $head;

# LIMIT -1 keeps trailing empty columns so the slices below see '' rather
# than undef for them; the joined output is the same either way.
my @head = split(/\t/, $head, -1);
print {$out_fh} join("\t", (@head[0 .. 4], "ref_gene", "copy", @head[5 .. 9])), "\n";

while (my $row = <$in_fh>) {
    chomp $row;
    my @line = split(/\t/, $row, -1);

    # Rows without the value column can never pass the copy-number filter
    # (a missing value evaluates to copy number 2.0), so skip them up front.
    if (@line < 5) {
        warn "Skipping line $. of '$input_path': fewer than 5 tab-separated columns\n"
            unless $row eq '';
        next;
    }

    # The threshold is applied to the ROUNDED value, e.g. 3.46 -> "3.5" passes.
    my $cn = sprintf("%.1f", 2 ** (1 + $line[4]));
    next unless ($cn <= 1 || $cn >= 3.5);

    # Report each target gene at most once per segment.
    my %seen;
    for my $gene (split(/,/, $line[3])) {
        next unless $is_target_gene{$gene};
        next if $seen{$gene}++;
        print {$out_fh} join("\t", (@line[0 .. 4], $gene, $cn, @line[5 .. 9])), "\n";
    }
}

close $in_fh;
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close $out_fh or die "Failed to finish writing '$output_path': $!\n";
# Look up the CNV gene list configured for the current project.
#
# Reads "$database_path/info.csv": a comma-separated table whose first row
# names the columns. Fields are split on every comma, so quoted fields that
# contain commas are not supported. For each data row whose `project` column
# equals the file-scoped $project and whose `cnv` column is not "NA", the
# `cnv` value is split on "/" into gene names. When several rows match, the
# last one wins.
#
# Takes no arguments; depends on the file-scoped $database_path and $project.
# Returns: array reference of gene names (empty when no row matches or the
#          file has no header).
# Dies:    when info.csv cannot be opened - otherwise every segment would be
#          filtered out silently.
sub info {
    my $info_path = "$database_path/info.csv";
    open my $info_fh, '<', $info_path
        or die "Cannot open project table '$info_path': $!\n";

    my @cnvs;

    # Read and parse the header row.
    my $header = <$info_fh>;
    if (defined $header) {
        # Strip CR as well as LF: with a plain chomp a CRLF file leaves "\r"
        # glued to the last column name/value, and the lookups below miss.
        $header =~ s/[\r\n]+\z//;
        my @column_names = split(/,/, $header);

        while (my $row = <$info_fh>) {
            $row =~ s/[\r\n]+\z//;
            my @fields = split(/,/, $row);

            # Map each field to its column name.
            my %record;
            @record{@column_names} = @fields;

            # Missing or empty trailing fields come back undef; treat them as ''.
            my $row_project = defined $record{'project'} ? $record{'project'} : '';
            my $row_cnv     = defined $record{'cnv'}     ? $record{'cnv'}     : '';

            next unless $row_project eq $project;
            next if $row_cnv eq "NA";

            @cnvs = split(/\//, $row_cnv);
        }
    }

    close $info_fh;
    return \@cnvs;
}