pipeline/script/drug_dedup.py

122 lines
4.9 KiB
Python
Raw Normal View History

2023-08-25 10:06:31 +08:00
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import pandas as pd
from pandas import Series,DataFrame
import re
import os
import sys
# Ranking of Evidence_Source_C values: entries earlier in the list sort first.
EVIDENCE_SOURCE_ORDER = ['FDA', 'NMPA', 'NCCN', '临床III期', '临床II期', '临床I期', '临床试验', '回顾性研究', '个案', '临床前研究']
# Ranking of the '标签' column: '适应症' sorts ahead of '非适应症' and '.'.
LABEL_ORDER = ['适应症', '非适应症', '.']


def _rank_of(ordered_values):
    """Return a dict mapping each value to its position in *ordered_values*."""
    return {value: position for position, value in enumerate(ordered_values)}


def dedup_drug_table(table, variant_column):
    """Return a sorted, de-duplicated copy of a drug-evidence table.

    Rows are sorted by *variant_column*, then by the rank of '标签'
    (LABEL_ORDER), then by the rank of 'Evidence_Source_C'
    (EVIDENCE_SOURCE_ORDER); values missing from a ranking sort last.

    A row whose 'Response_Type_C' contains '敏感' and whose '标签' is not
    '适应症' is removed when an earlier row with the same variant and the
    same 'Drug' had '标签' == '适应症' and a '敏感' response. All other rows
    are kept. A constant column '可信' (value 1) is inserted as the first
    column of the result.

    Args:
        table: DataFrame with the columns *variant_column*, 'Drug',
            'Evidence_Source_C', '标签' and 'Response_Type_C'.
        variant_column: name of the column identifying the alteration.

    Returns:
        A new DataFrame; *table* itself is not modified.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if *table* already has a '可信' column.
    """
    ordered = table.copy()
    ordered['level1'] = ordered['Evidence_Source_C'].map(_rank_of(EVIDENCE_SOURCE_ORDER))
    ordered['level2'] = ordered['标签'].map(_rank_of(LABEL_ORDER))
    ordered = ordered.sort_values(by=[variant_column, 'level2', 'level1'], ascending=True)
    ordered = ordered.drop(['level1', 'level2'], axis=1)

    # (variant, drug) pairs already seen on an on-label, sensitive row.
    # A tuple key is used so that e.g. ('A', 'BC') and ('AB', 'C') stay distinct.
    on_label_pairs = set()
    kept_positions = []
    cells = zip(
        ordered[variant_column],
        ordered['Drug'],
        ordered['标签'],
        ordered['Response_Type_C'],
    )
    for position, (variant, drug, label, response) in enumerate(cells):
        # Missing (non-string) responses are treated as "not sensitive".
        sensitive = isinstance(response, str) and '敏感' in response
        # Rows lacking a variant or a drug can never match another row.
        if sensitive and not (pd.isna(variant) or pd.isna(drug)):
            pair = (variant, drug)
            # NOTE(review): the off-label check is paired with the label test
            # (not with the response test); confirm against the pipeline spec.
            if label == '适应症':
                on_label_pairs.add(pair)
            elif pair in on_label_pairs:
                continue  # off-label duplicate of an on-label entry
        kept_positions.append(position)

    # Positional selection keeps every column even when no rows remain.
    result = ordered.iloc[kept_positions].copy()
    result.insert(0, '可信', 1)
    return result


def dedup_file(source_path, target_path, variant_column):
    """De-duplicate one tab-separated annotation file.

    An empty *source_path* (0 bytes) is skipped and *target_path* is left
    untouched; otherwise the table is read, passed through
    ``dedup_drug_table`` and written to *target_path* as tab-separated text
    without the index.

    Returns:
        True if *target_path* was written, False if the input was empty.

    Raises:
        OSError: if *source_path* does not exist or cannot be read.
    """
    if os.path.getsize(source_path) == 0:
        return False
    table = pd.read_table(source_path, sep="\t")
    dedup_drug_table(table, variant_column).to_csv(target_path, index=False, sep='\t')
    return True


def main(argv):
    """Run the de-duplication for one sample.

    Args:
        argv: command line in ``sys.argv`` layout:
            ``[program, output_dir, name]``.

    For each of the four inputs (snv/indel, germline snv/indel, fusion, cnv)
    the matching ``*.pos.dedup.txt`` file is first created empty, then filled
    from the ``*.pos.txt`` input when that input is not empty.
    """
    if len(argv) != 3:
        print(" ".join(['usage:python3', argv[0], 'output_dir', 'name']))
        sys.exit()
    output_dir, name = argv[1], argv[2]

    # (sub-directory, file stem after the sample name, variant column)
    layout = [
        ('mutation', '.snvindel', 'AAChange.refGene'),
        ('mutation', '.snvindel.Germline', 'AAChange.refGene'),
        ('fusion', '.fusion', 'FUSION'),
        ('cnvkit', '.cnv', 'Gene_Symbol'),
    ]
    jobs = []
    for subdir, stem, variant_column in layout:
        prefix = '/'.join([output_dir, subdir, name + stem])
        jobs.append((prefix + '.pos.txt', prefix + '.pos.dedup.txt', variant_column))

    # Create/truncate every output up front so each exists even when its
    # input turns out to be empty; the handle is closed immediately.
    for _, target_path, _ in jobs:
        with open(target_path, "w"):
            pass

    for source_path, target_path, variant_column in jobs:
        dedup_file(source_path, target_path, variant_column)


if __name__ == "__main__":
    main(sys.argv)