遗传模块：拆分 HGVS 注释字段；当没有 p.（氨基酸变化）时，改用 c.（核苷酸变化）。

2024-02-26 10:07:00 +08:00 · 2024-02-26 10:07:00 +08:00 · fb36b97329
parent 778d55ed5b
commit fb36b97329
1 changed files with 33 additions and 9 deletions
--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@@ -6,6 +6,32 @@ import re
 import pandas as pd
 def split_hgvs(hgvs):
    """Split a colon-delimited variant annotation into named fields.

    The input is split on ``':'`` and must yield four or five parts, which
    are taken positionally as gene, transcript, exon, nucleotide change and,
    when a fifth part exists, amino-acid change.

    NOTE(review): the field order looks like ANNOVAR's ``AAChange`` layout
    (``GENE:NM_xxx:exon3:c.123A>G:p.T41A``) -- confirm against the input data.

    Args:
        hgvs: Annotation string, e.g. ``'GENE:NM_1:exon2:c.5A>G:p.K2R'``.

    Returns:
        A dict with the keys ``'gene'``, ``'transcript'``, ``'exon'``,
        ``'nacid'`` and ``'aacid'``. ``'aacid'`` is ``None`` when the input
        has only four parts.

    Raises:
        ValueError: If the string does not split into four or five parts.
    """
    parts = hgvs.split(':')
    if len(parts) == 4:
        # Four parts: there is no fifth (amino-acid) field, so report None
        # and let the caller decide what to fall back to.
        gene, transcript, exon, nacid = parts
        aacid = None
    elif len(parts) == 5:
        gene, transcript, exon, nacid, aacid = parts
    else:
        # Quote the offending value so it is clearly separated from the text.
        raise ValueError(f'Invalid HGVS format: {hgvs!r}')
    return {
        'gene': gene,
        'transcript': transcript,
        'exon': exon,
        'nacid': nacid,
        'aacid': aacid,
    }
 class HereditaryRun:
    def __init__(self, database, project, output_dir, name, file):
@@ -30,21 +56,19 @@ class HereditaryRun:
        result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
        for _, rows in data.iterrows():
-            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
+            matches = split_hgvs(rows['AAChange_refGene'])
-            gene, mutation = '', ''
+            gene = matches['gene']
-            if matches:
+            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
-                gene = matches.group(1)
+            row_df = pd.DataFrame(
-                mutation = matches.group(2)
+                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
            else:
                raise UserWarning('HGVS 解析错误！')
            selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
            row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
            row_df['inheritance'] = selected_rows['inheritance']
            row_df['Gene'] = gene
-            row_df['mutation'] = mutation
+            row_df['mutation'] = aacid
            row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
            row_df['ClinicalSign'] = str(rows['ClinicalSign'])