#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""Remove redundant off-label drug rows from per-sample annotation tables.

Usage: python3 <script> output_dir name

For each of the SNV/indel, fusion and CNV tables of sample ``name`` under
``output_dir``, the rows are sorted and filtered (see ``dedup_table``) and the
result is written to a sibling ``*.dedup.txt`` file.  Every output file is
created up front, so an empty input yields an empty output file.
"""
import os
import re
import sys

import pandas as pd

# Sort priority of the 'Evidence_Source_C' column (lower rank sorts first).
EVIDENCE_RANK = {
    source: rank
    for rank, source in enumerate([
        'FDA', 'NMPA', 'NCCN', '临床III期', '临床II期', '临床I期',
        '临床试验', '回顾性研究', '个案', '临床前研究',
    ])
}

# Sort priority of the '标签' (label) column (lower rank sorts first).
LABEL_RANK = {label: rank for rank, label in enumerate(['适应症', '非适应症', '.'])}

ON_LABEL = '适应症'
SENSITIVE_PATTERN = re.compile(r'敏感')


def dedup_table(src, dst, key_column):
    """Sort the table at *src*, drop redundant rows and write it to *dst*.

    Rows are ordered by ``key_column``, then by label rank, then by evidence
    rank.  A row whose 'Response_Type_C' contains '敏感' and whose '标签' is
    not '适应症' is dropped when an earlier row with the same
    (``key_column``, 'Drug') pair had a '敏感' response and the '适应症'
    label.  A constant column '可信' with value 1 is inserted as the first
    column of the output.

    Nothing is read or written when *src* is empty (zero bytes); the caller
    is expected to have created *dst* already.

    Args:
        src: Path of the tab-separated input table.
        dst: Path of the tab-separated output table.
        key_column: Name of the column identifying the alteration.

    Raises:
        OSError: If *src* does not exist or *dst* cannot be written.
        KeyError: If a required column is missing from the input.
    """
    if os.path.getsize(src) == 0:
        return

    data = pd.read_table(src, sep="\t")
    # Temporary rank columns; values absent from the mappings become NaN and
    # therefore sort last.
    data['level1'] = data['Evidence_Source_C'].map(EVIDENCE_RANK)
    data['level2'] = data['标签'].map(LABEL_RANK)
    data = data.sort_values(by=[key_column, 'level2', 'level1'], ascending=True)
    data = data.drop(['level1', 'level2'], axis=1)

    # NOTE(review): the source formatting left the nesting of the original
    # 'else' ambiguous; it is read here as the alternative to the on-label
    # test, i.e. only sensitive off-label rows are candidates for removal.
    seen_on_label = set()
    redundant = []
    rows = zip(data.index, data['Response_Type_C'], data['标签'],
               data[key_column], data['Drug'])
    for idx, response, label, alteration, drug in rows:
        # Missing/non-text responses cannot be "sensitive"; skipping them
        # avoids a TypeError from the regex on NaN cells.
        if not isinstance(response, str) or not SENSITIVE_PATTERN.search(response):
            continue
        # A tuple key cannot collide the way concatenated strings can and
        # tolerates missing drug names.
        key = (alteration, drug)
        if label == ON_LABEL:
            seen_on_label.add(key)
        elif key in seen_on_label:
            redundant.append(idx)

    data = data.drop(redundant)
    data.insert(0, '可信', 1)
    data.to_csv(dst, index=False, sep='\t')


def main(argv=None):
    """Run the deduplication for one sample.

    Args:
        argv: Argument vector ``[program, output_dir, name]``; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: With status 1 when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print(" ".join(['usage:python3', argv[0], 'output_dir', 'name']))
        # A usage error must not look like success to the calling pipeline.
        sys.exit(1)

    output_dir = argv[1]
    name = argv[2]

    snv_file = os.path.join(
        output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.txt')
    snv_file_new = os.path.join(
        output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.dedup.txt')
    fusion_file = os.path.join(
        output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.txt')
    fusion_file_new = os.path.join(
        output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.dedup.txt')
    cnv_file = os.path.join(output_dir, 'cnv', f'{name}.rmdup.cns.filter.pos.txt')
    # NOTE(review): the CNV output goes to 'cnvkit' while the input is read
    # from 'cnv'; kept as-is, confirm this is intended.
    cnv_file_new = os.path.join(output_dir, 'cnvkit', f'{name}.rmdup.cns.filter.pos.dedup.txt')

    # (input, output, key column) for every table to process, in order.
    # Germline SNV/indel handling is currently disabled; to re-enable it add
    # an entry for '<name>.snvindel.Germline.pos.txt' under 'mutation' with
    # key column 'AAChange.refGene'.
    tables = [
        (snv_file, snv_file_new, 'AAChange.refGene'),
        (fusion_file, fusion_file_new, 'FUSION'),
        (cnv_file, cnv_file_new, 'Gene_Symbol'),
    ]

    # Create/truncate every output first so that each one exists even when
    # its input is empty.
    for _, dst, _ in tables:
        with open(dst, "w"):
            pass

    for src, dst, key_column in tables:
        dedup_table(src, dst, key_column)


if __name__ == "__main__":
    main()