From fb36b973296eee51ed19d60690c58a20abe294ce Mon Sep 17 00:00:00 2001 From: chaopower Date: Mon, 26 Feb 2024 10:07:00 +0800 Subject: [PATCH] =?UTF-8?q?=E9=81=97=E4=BC=A0=E5=88=86=E8=A7=A3hgvs?= =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E6=9C=89p.=20=E9=87=87=E7=94=A8c.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- codes/hereditary.py | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/codes/hereditary.py b/codes/hereditary.py index de4f6ae..c120250 100755 --- a/codes/hereditary.py +++ b/codes/hereditary.py @@ -6,6 +6,32 @@ import re import pandas as pd +def split_hgvs(hgvs): + hgvs_split = hgvs.split(':') + if len(hgvs_split) == 4: + gene, position, transcript_version, coordinate_type = hgvs_split + + # pattern = r'c\.\d+([\+\-])[12]\D+>\D+' + # match = re.search(pattern, coordinate_type) + # # if match: + # # transcript_version = + # # if match.group(1) == '-':X + + variant_version = None + elif len(hgvs_split) == 5: + gene, position, transcript_version, coordinate_type, variant_version = hgvs_split + else: + raise ValueError(f'Invalid HGVS format{hgvs}') + + return { + 'gene': gene, + 'transcript': position, + 'exon': transcript_version, + 'nacid': coordinate_type, + 'aacid': variant_version + } + + class HereditaryRun: def __init__(self, database, project, output_dir, name, file): @@ -30,21 +56,19 @@ class HereditaryRun: result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation']) for _, rows in data.iterrows(): - matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene']) - row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign']) - gene, mutation = '', '' - if matches: - gene = matches.group(1) - mutation = matches.group(2) - else: - raise UserWarning('HGVS 解析错误!') + # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene']) + matches = split_hgvs(rows['AAChange_refGene']) + gene = matches['gene'] + aacid = matches['aacid'] if matches['aacid'] else matches['nacid'] + row_df = pd.DataFrame( + columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign']) selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)] row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn'] row_df['inheritance'] = selected_rows['inheritance'] row_df['Gene'] = gene - row_df['mutation'] = mutation + row_df['mutation'] = aacid row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合' row_df['ClinicalSign'] = str(rows['ClinicalSign'])