遗传:分解 HGVS;没有 p. 时采用 c.

master
chaopower 2024-02-26 10:07:00 +08:00
parent 778d55ed5b
commit fb36b97329
1 changed files with 33 additions and 9 deletions

View File

@ -6,6 +6,32 @@ import re
import pandas as pd import pandas as pd
def split_hgvs(hgvs):
    """Split a colon-separated HGVS annotation string into named fields.

    The string must contain four or five ``:``-separated fields, mapped in
    order to ``gene``, ``transcript``, ``exon``, ``nacid`` and ``aacid``.
    The fifth field is optional; when it is missing, ``aacid`` is ``None``
    so the caller can fall back to ``nacid``.

    NOTE(review): presumably the input is a single ANNOVAR
    ``AAChange_refGene`` entry such as ``GENE:NM_x:exonN:c.X:p.Y`` -- confirm
    against the caller. Comma-joined multi-transcript values are not split
    here and are rejected as invalid because they contain more than five
    fields.

    Args:
        hgvs: The annotation string to split.

    Returns:
        A dict with the keys ``gene``, ``transcript``, ``exon``, ``nacid``
        and ``aacid``. Every value is a string, except ``aacid`` which is
        ``None`` for four-field input.

    Raises:
        ValueError: If ``hgvs`` is not a string, or does not contain exactly
            four or five fields.
    """
    # Missing cells (None / NaN) have no ``split``; report them with the same
    # exception type as any other malformed value.
    if not isinstance(hgvs, str):
        raise ValueError(f'Invalid HGVS format: {hgvs!r}')

    fields = hgvs.split(':')
    if len(fields) == 4:
        # No fifth (protein-level) field present.
        fields.append(None)
    elif len(fields) != 5:
        raise ValueError(f'Invalid HGVS format: {hgvs}')

    gene, transcript, exon, nacid, aacid = fields
    return {
        'gene': gene,
        'transcript': transcript,
        'exon': exon,
        'nacid': nacid,
        'aacid': aacid,
    }
class HereditaryRun: class HereditaryRun:
def __init__(self, database, project, output_dir, name, file): def __init__(self, database, project, output_dir, name, file):
@ -30,21 +56,19 @@ class HereditaryRun:
result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation']) result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
for _, rows in data.iterrows(): for _, rows in data.iterrows():
matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene']) # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign']) matches = split_hgvs(rows['AAChange_refGene'])
gene, mutation = '', '' gene = matches['gene']
if matches: aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
gene = matches.group(1) row_df = pd.DataFrame(
mutation = matches.group(2) columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
else:
raise UserWarning('HGVS 解析错误!')
selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)] selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn'] row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
row_df['inheritance'] = selected_rows['inheritance'] row_df['inheritance'] = selected_rows['inheritance']
row_df['Gene'] = gene row_df['Gene'] = gene
row_df['mutation'] = mutation row_df['mutation'] = aacid
row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合' row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
row_df['ClinicalSign'] = str(rows['ClinicalSign']) row_df['ClinicalSign'] = str(rows['ClinicalSign'])