pipeline/script/public/hotspot/hotspot.py

31 lines
1009 B
Python
Raw Normal View History

2023-11-01 10:09:29 +08:00
import sys
import pysam
# Load the (gene, protein) pairs to search for from the tab-separated file
# named by the first command-line argument: one "gene<TAB>protein" per line.
genes_and_proteins = set()
with open(sys.argv[1], 'r') as file:
    for line_number, line in enumerate(file, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. an empty line at the end of the file) carry
            # no pair and would otherwise break the two-field check below.
            continue
        fields = stripped.split('\t')
        if len(fields) != 2:
            # Still a ValueError, as the bare tuple-unpacking produced, but
            # it now says which line of which file is malformed.
            raise ValueError(
                f'{sys.argv[1]}:{line_number}: expected 2 tab-separated '
                f'fields (gene, protein), got {len(fields)}'
            )
        genes_and_proteins.add((fields[0], fields[1]))
# Copy every record of the COSMIC coding-mutation VCF whose GENE / AA INFO
# fields match one of the requested (gene, protein) pairs into
# hotspot_snv.vcf, then report the requested pairs that matched no record.
with pysam.VariantFile('/dataseq/jmdna/database/cosmic/v91_GRCh37/CosmicCodingMuts.vcf', 'r') as vcf_file:
    # Declare contigs 1-22, X, Y and MT on the header before it is handed to
    # the writer.
    # NOTE(review): presumably the input VCF ships without ##contig lines and
    # the writer needs them - confirm against the database file.
    for contig in [*range(1, 23), 'X', 'Y', 'MT']:
        vcf_file.header.add_line(f'##contig=<ID={contig}>')

    # Requested pairs that matched at least one record.
    record_list = set()

    # The context manager closes the output file even if iteration fails.
    with pysam.VariantFile('hotspot_snv.vcf', 'w', header=vcf_file.header) as output_vcf:
        for record in vcf_file:
            gene = record.info.get('GENE', '')
            protein = record.info.get('AA', '')
            # Containment match (`in`), not equality: a requested gene or
            # protein only has to occur within the record's field.
            matched = {
                (g, p)
                for (g, p) in genes_and_proteins
                if g in gene and p in protein
            }
            if matched:
                # Write the record once, however many requested pairs it
                # satisfies, so overlapping queries do not duplicate it.
                output_vcf.write(record)
                record_list.update(matched)

# Report the requested pairs for which no record was found (the message text
# is user-facing output and is kept verbatim).
no_record = genes_and_proteins.difference(record_list)
print('没有获取到记录的有:', no_record)