From fb36b973296eee51ed19d60690c58a20abe294ce Mon Sep 17 00:00:00 2001
From: chaopower <chao.zhang@jmdna.com>
Date: Mon, 26 Feb 2024 10:07:00 +0800
Subject: [PATCH] =?UTF-8?q?=E9=81=97=E4=BC=A0=E5=88=86=E8=A7=A3hgvs?=
 =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E6=9C=89p.=20=E9=87=87=E7=94=A8c.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 codes/hereditary.py | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/codes/hereditary.py b/codes/hereditary.py
index de4f6ae..c120250 100755
--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@@ -6,6 +6,32 @@ import re
 import pandas as pd
 
 
+def split_hgvs(hgvs):
+    """Split a colon-delimited HGVS annotation string into named fields.
+
+    The expected layout is ``gene:transcript:exon:nacid[:aacid]``; the
+    trailing amino-acid (``p.``) field is optional and ``None`` if absent.
+
+    :param hgvs: annotation string with four or five ``:``-separated parts.
+    :return: dict keyed by gene, transcript, exon, nacid and aacid.
+    :raises ValueError: if the string does not have four or five parts.
+    """
+    parts = hgvs.split(':')
+    if len(parts) not in (4, 5):
+        raise ValueError(f'Invalid HGVS format: {hgvs}')
+    if len(parts) == 4:
+        # No amino-acid change present; pad so the unpacking is uniform.
+        parts.append(None)
+    gene, transcript, exon, nacid, aacid = parts
+    return {
+        'gene': gene,
+        'transcript': transcript,
+        'exon': exon,
+        'nacid': nacid,
+        'aacid': aacid
+    }
+
+
 class HereditaryRun:
 
     def __init__(self, database, project, output_dir, name, file):
@@ -30,21 +56,19 @@ class HereditaryRun:
 
         result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
         for _, rows in data.iterrows():
-            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
-            gene, mutation = '', ''
-            if matches:
-                gene = matches.group(1)
-                mutation = matches.group(2)
-            else:
-                raise UserWarning('HGVS 解析错误！')
+            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            matches = split_hgvs(rows['AAChange_refGene'])
+            gene = matches['gene']
+            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
+            row_df = pd.DataFrame(
+                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
 
             selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
 
             row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
             row_df['inheritance'] = selected_rows['inheritance']
             row_df['Gene'] = gene
-            row_df['mutation'] = mutation
+            row_df['mutation'] = aacid
             row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
             row_df['ClinicalSign'] = str(rows['ClinicalSign'])