122 lines
4.9 KiB
Python
122 lines
4.9 KiB
Python
|
|
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""De-duplicate the per-sample drug-evidence tables.

Usage: python3 <script> output_dir name

For each of the four tab-separated inputs below, a ``*.pos.dedup.txt`` file
is written next to the input:

    <output_dir>/mutation/<name>.snvindel.pos.txt
    <output_dir>/mutation/<name>.snvindel.Germline.pos.txt
    <output_dir>/fusion/<name>.fusion.pos.txt
    <output_dir>/cnvkit/<name>.cnv.pos.txt

A zero-byte input yields a zero-byte output.  Otherwise the rows are sorted
by (alteration, label rank, evidence rank), redundant sensitivity rows are
removed (see ``_dedup_table``) and a constant column ``可信`` = 1 is
prepended.
"""

import os
import re
import sys

import pandas as pd
from pandas import Series, DataFrame

# Sort order of the Evidence_Source_C column: earlier entries sort first,
# values not listed here sort last.
EVIDENCE_ORDER = ['FDA', 'NMPA', 'NCCN', '临床III期', '临床II期', '临床I期', '临床试验', '回顾性研究', '个案', '临床前研究']

# Sort order of the 标签 (label) column: earlier entries sort first.
LABEL_ORDER = ['适应症', '非适应症', '.']

# One entry per table: (sub-directory, file-name infix, alteration column).
TABLES = (
    ('mutation', 'snvindel', 'AAChange.refGene'),
    ('mutation', 'snvindel.Germline', 'AAChange.refGene'),
    ('fusion', 'fusion', 'FUSION'),
    ('cnvkit', 'cnv', 'Gene_Symbol'),
)


def _rank(values):
    """Return a ``{value: position}`` lookup for an ordered list of values."""
    return {value: position for position, value in enumerate(values)}


def _table_paths(output_dir, name, subdir, kind):
    """Return the ``(input, output)`` paths of one table."""
    source = '/'.join([output_dir, subdir, ''.join([name, '.', kind, '.pos.txt'])])
    target = '/'.join([output_dir, subdir, ''.join([name, '.', kind, '.pos.dedup.txt'])])
    return source, target


def _dedup_table(data, key_column):
    """Sort *data* and drop redundant non-indication sensitivity rows.

    Args:
        data: table with the columns ``Evidence_Source_C``, ``标签``,
            ``Response_Type_C``, ``Drug`` and *key_column*.
        key_column: name of the column that identifies the alteration.

    Returns:
        A new DataFrame, sorted by (key_column, label rank, evidence rank),
        without the redundant rows and with a leading ``可信`` column set
        to 1.  The input frame is not modified.
    """
    ranked = data.copy()
    ranked['level1'] = ranked['Evidence_Source_C'].map(_rank(EVIDENCE_ORDER))
    ranked['level2'] = ranked['标签'].map(_rank(LABEL_ORDER))
    ranked = ranked.sort_values(by=[key_column, 'level2', 'level1'], ascending=True)
    ranked = ranked.drop(columns=['level1', 'level2'])

    # NOTE(review): the nesting below assumes the label check sits inside the
    # "敏感" check and that the drop branch is the `else` of the label check
    # -- confirm against the indented original.
    approved = set()   # (alteration, drug) pairs seen on a '适应症' row
    redundant = []     # index labels of the rows to remove
    rows = zip(
        ranked.index,
        ranked['Response_Type_C'],
        ranked['标签'],
        ranked[key_column],
        ranked['Drug'],
    )
    for index, response, label, alteration, drug in rows:
        # Empty cells are read as NaN (a float); they cannot match '敏感'.
        if not isinstance(response, str) or '敏感' not in response:
            continue
        # A row without alteration or drug cannot be paired with another row.
        if pd.isna(alteration) or pd.isna(drug):
            continue
        # A tuple keeps ('A', 'BC') distinct from ('AB', 'C').
        pair = (alteration, drug)
        if label == '适应症':
            approved.add(pair)
        elif pair in approved:
            redundant.append(index)

    result = ranked.drop(index=redundant)
    result.insert(0, '可信', 1)
    return result


def _dedup_file(source, target, key_column):
    """De-duplicate the table in *source* into *target*.

    Nothing is written when *source* is a zero-byte file.  Raises ``OSError``
    when *source* does not exist.
    """
    if os.path.getsize(source) > 0:
        data = pd.read_table(source, sep="\t")
        _dedup_table(data, key_column).to_csv(target, index=False, sep='\t')


def main(argv=None):
    """Run the de-duplication for one sample.

    Args:
        argv: ``[program, output_dir, name]``; defaults to ``sys.argv``.

    Exits with status 1 after printing the usage line when the argument
    count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print(" ".join(['usage:python3', argv[0], 'output_dir', 'name']))
        sys.exit(1)

    output_dir = argv[1]
    name = argv[2]
    jobs = [
        _table_paths(output_dir, name, subdir, kind) + (key_column,)
        for subdir, kind, key_column in TABLES
    ]

    # Create (or truncate) every output first, so each one exists even when
    # its input turns out to be empty.
    for _, target, _ in jobs:
        with open(target, "w"):
            pass

    for source, target, key_column in jobs:
        _dedup_file(source, target, key_column)


if __name__ == '__main__':
    main()