Compare commits

...

3 Commits

Author SHA1 Message Date
chaopower fb36b97329 遗传分解hgvs,没有p. 采用c. 2024-02-26 10:07:00 +08:00
chaopower 778d55ed5b hgvs 采用标准转录本 2024-02-26 10:06:23 +08:00
chaopower 157fa94ef8 污染添加17和160图 2024-02-26 10:05:47 +08:00
4 changed files with 73 additions and 16 deletions

View File

@ -189,12 +189,12 @@ while (<IN>) {
push @reason, 'not_need_spl_inron';
}
my @hgvs = split(/,/, $line[9]);
my $hgvs = $hgvs[0];
# my $hgvs = $hgvs[0];
my $transcript_gene;
$transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
my $hgvs;
if (grep {/$transcript_gene/} @hgvs) {
$hgvs = (grep {/$transcript_gene/} @hgvs)[0];
}
$line[9] = $hgvs;
$hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
@ -202,6 +202,10 @@ while (<IN>) {
if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
$protein = 'Truncating Mutations';
}
}
else {
push @reason, 'not_correct_hgvs';
}
}

View File

@ -6,6 +6,32 @@ import re
import pandas as pd
def split_hgvs(hgvs):
hgvs_split = hgvs.split(':')
if len(hgvs_split) == 4:
gene, position, transcript_version, coordinate_type = hgvs_split
# pattern = r'c\.\d+([\+\-])[12]\D+>\D+'
# match = re.search(pattern, coordinate_type)
# # if match:
# # transcript_version =
# # if match.group(1) == '-':X
variant_version = None
elif len(hgvs_split) == 5:
gene, position, transcript_version, coordinate_type, variant_version = hgvs_split
else:
raise ValueError(f'Invalid HGVS format{hgvs}')
return {
'gene': gene,
'transcript': position,
'exon': transcript_version,
'nacid': coordinate_type,
'aacid': variant_version
}
class HereditaryRun:
def __init__(self, database, project, output_dir, name, file):
@ -30,21 +56,19 @@ class HereditaryRun:
result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
for _, rows in data.iterrows():
matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
gene, mutation = '', ''
if matches:
gene = matches.group(1)
mutation = matches.group(2)
else:
raise UserWarning('HGVS 解析错误!')
# matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
matches = split_hgvs(rows['AAChange_refGene'])
gene = matches['gene']
aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
row_df = pd.DataFrame(
columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
row_df['inheritance'] = selected_rows['inheritance']
row_df['Gene'] = gene
row_df['mutation'] = mutation
row_df['mutation'] = aacid
row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
row_df['ClinicalSign'] = str(rows['ClinicalSign'])

View File

@ -46,9 +46,13 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
bed_regions = load_bed_regions(bed_file)
vcf = pysam.VariantFile(vcf_file)
res_pos = list()
count_normal = 0
count_exception = 0
vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
for line in str(vcf.header).strip().split('\n'):
vcf_out.write(line + '\n')
for record in vcf:
contig = record.contig
@ -79,7 +83,9 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
freq=freq,
res=res
))
vcf_out.write(str(record) + '\n')
break
count_all = count_exception + count_normal
if count_all == 0:
z_score = 0

View File

@ -61,6 +61,19 @@ task run_generate_png {
}
}
task run_single_generate_png {
String name
String probe
String cnvkit_tumor_vcf
String cnv_cnr
String cnv_cns
String output_dir
command {
cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
}
}
workflow call_pollution {
Boolean run=true
@ -114,6 +127,16 @@ workflow call_pollution {
probe=probe,
vcf=initial_vcf
}
call run_single_generate_png {
input:
name=tumor,
probe=probe,
cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
cnv_cnr=cnv_cnr,
cnv_cns=cnv_cns,
output_dir=output_dir
}
}
}