pipeline/codes/drug_dedup.py

115 lines
5.0 KiB
Python
Executable File

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import os
import re
import sys
import pandas as pd
# Command-line contract: exactly two arguments -- the pipeline output
# directory and the sample name used as the file-name prefix.
if len(sys.argv) != 3:
    print(" ".join(['usage:python3', sys.argv[0], 'output_dir', 'name']))
    # Exit with a non-zero status so a calling pipeline can detect the
    # misuse; a bare sys.exit() reports success (status 0).
    sys.exit(1)

output_dir = sys.argv[1]
name = sys.argv[2]

# Input tables and the de-duplicated tables that are written for them.
snv_file = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.txt')
snv_file_new = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.dedup.txt')
fusion_file = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.txt')
fusion_file_new = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.dedup.txt')
cnv_file = os.path.join(output_dir, 'cnv', f'{name}.rmdup.cns.filter.pos.txt')
# NOTE(review): the CNV input is read from 'cnv' but the output is written to
# 'cnvkit', unlike the mutation/fusion pairs which share one directory --
# confirm that this difference is intentional.
cnv_file_new = os.path.join(output_dir, 'cnvkit', f'{name}.rmdup.cns.filter.pos.dedup.txt')
# Germline SNV/indel paths are currently disabled.
# gm_snv_file = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.txt')
# gm_snv_file_new = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.dedup.txt')

# Create (or truncate) every output file up front, so each one exists as an
# empty file even if nothing is written to it later. The handle is closed
# immediately instead of being left open until interpreter exit.
for _out_path in (snv_file_new, fusion_file_new, cnv_file_new):
    with open(_out_path, "w"):
        pass
# open(gm_snv_file_new, "w")
# Sort orders for the Evidence_Source_C and 标签 columns.
# A value's position in its list is its rank; the ascending sorts that use
# these tables therefore place earlier list entries first.
df_mapping_1 = pd.DataFrame(
    [
        'FDA',
        'NMPA',
        'NCCN',
        '临床III期',
        '临床II期',
        '临床I期',
        '临床试验',
        '回顾性研究',
        '个案',
        '临床前研究',
    ],
    columns=['Evidence_Source_C'],
)
df_mapping_2 = pd.DataFrame(['适应症', '非适应症', '.'], columns=['标签'])

# reset_index() exposes the positional rank as a column named 'index';
# set_index() then keys each table by the value being ranked, so that
# table['index'] can be passed straight to Series.map().
sort_mapping_1 = df_mapping_1.reset_index()
sort_mapping_1 = sort_mapping_1.set_index('Evidence_Source_C')
sort_mapping_2 = df_mapping_2.reset_index()
sort_mapping_2 = sort_mapping_2.set_index('标签')
# De-duplication of the drug-annotation tables (SNV/indel, fusion, CNV).
# All three tables go through the same routine; only the column that
# identifies the alteration differs.


def _redundant_row_labels(data, key_col):
    """Return the index labels of rows made redundant by a '适应症' row.

    Rows are scanned in their current order. Only rows whose
    ``Response_Type_C`` contains '敏感' ("sensitive") take part:

    * a row labelled '适应症' ("indication") in the ``标签`` column registers
      its ``(key_col, Drug)`` pair;
    * a row with any other label is reported as redundant when its pair has
      already been registered by an earlier row.

    Args:
        data: table with the columns ``Response_Type_C``, ``标签``, ``Drug``
            and ``key_col``.
        key_col: name of the column that identifies the alteration.

    Returns:
        A list of index labels, in scan order, of the rows to remove.
    """
    registered = set()
    redundant = []
    scan = zip(
        data.index,
        data['Response_Type_C'],
        data['标签'],
        data[key_col],
        data['Drug'],
    )
    for label, response, tag, alteration, drug in scan:
        # A missing response cannot contain the marker; treat it as "no match"
        # instead of letting a non-string value raise TypeError.
        if not (isinstance(response, str) and '敏感' in response):
            continue
        # A row without an alteration or a drug has no usable pair, so it is
        # neither registered nor removed.
        if pd.isna(alteration) or pd.isna(drug):
            continue
        # Compare the pair as a tuple: concatenating the two strings would let
        # e.g. 'AB' + 'C' collide with 'A' + 'BC'.
        pair = (alteration, drug)
        if tag == '适应症':
            registered.add(pair)
        elif pair in registered:
            redundant.append(label)
    return redundant


def _dedup_drug_file(src_path, dst_path, key_col):
    """Write a sorted, de-duplicated copy of one drug-annotation table.

    The tab-separated table at ``src_path`` is sorted by ``key_col``, then by
    the rank of ``标签`` and the rank of ``Evidence_Source_C`` (ranks come from
    ``sort_mapping_2`` and ``sort_mapping_1``). Redundant rows (see
    ``_redundant_row_labels``) are removed, a leading column ``可信`` filled
    with 1 is added, and the result is written tab-separated to ``dst_path``.

    A zero-byte input is skipped: nothing is read and nothing is written to
    ``dst_path``.

    Args:
        src_path: path of the input table.
        dst_path: path of the output table.
        key_col: name of the column that identifies the alteration.

    Raises:
        OSError: if ``src_path`` does not exist or cannot be inspected.
    """
    if os.path.getsize(src_path) == 0:
        return

    data = pd.read_table(src_path, sep="\t")

    # Temporary rank columns drive the sort and are discarded afterwards.
    # Because '适应症' has the lowest rank, its rows come first within each
    # key_col value, which is what the sequential scan below relies on.
    data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
    data['level2'] = data['标签'].map(sort_mapping_2['index'])
    data = data.sort_values(by=[key_col, 'level2', 'level1'], ascending=True)
    data = data.drop(columns=['level1', 'level2'])

    # Collect the labels first and drop them in one call, rather than
    # mutating the frame while it is being iterated.
    data = data.drop(index=_redundant_row_labels(data, key_col))

    data.insert(0, '可信', 1)
    data.to_csv(dst_path, index=False, sep='\t')


# Somatic SNV/indel table, keyed by the amino-acid change annotation.
_dedup_drug_file(snv_file, snv_file_new, 'AAChange.refGene')

# Germline SNV/indel de-duplication is disabled. To re-enable it, define
# gm_snv_file / gm_snv_file_new and call:
# _dedup_drug_file(gm_snv_file, gm_snv_file_new, 'AAChange.refGene')

# Fusion table, keyed by the fusion name.
_dedup_drug_file(fusion_file, fusion_file_new, 'FUSION')

# CNV table, keyed by the gene symbol.
_dedup_drug_file(cnv_file, cnv_file_new, 'Gene_Symbol')