31 lines
961 B
Python
31 lines
961 B
Python
import sys
|
|
|
|
import pysam
|
|
|
|
# Collect the (gene, protein) pairs to search for. The file named by the first
# command-line argument is expected to hold one tab-separated pair per line.
genes_and_proteins = set()
with open(sys.argv[1], 'r') as file:
    for line_number, line in enumerate(file, start=1):
        fields = line.strip().split('\t')
        if fields == ['']:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no pair; skip them instead of failing to unpack.
            continue
        if len(fields) != 2:
            # Same exception type the bare tuple-unpacking raised, but with
            # enough context to locate the offending line.
            raise ValueError(
                f'{sys.argv[1]}:{line_number}: expected 2 tab-separated '
                f'fields (gene, protein), got {len(fields)}: {line!r}'
            )
        gene, protein = fields
        genes_and_proteins.add((gene, protein))
|
|
|
|
# Copy every record of the input VCF (second command-line argument) whose
# INFO 'GENE' and 'AA' values contain one of the requested (gene, protein)
# pairs into 'hotspot.filtered.vcf', then report the pairs that matched nothing.
with pysam.VariantFile(sys.argv[2], 'r') as vcf_file:
    # Declare contigs 1-22, X, Y and MT on the header the output file is
    # created from (presumably because the input header lacks contig lines
    # -- TODO confirm against the real input files).
    for contig in [*map(str, range(1, 23)), 'X', 'Y', 'MT']:
        vcf_file.header.add_line(f'##contig=<ID={contig}>')

    # Pairs from genes_and_proteins that matched at least one record.
    record_list = set()

    # The context manager closes the output file even if reading or writing
    # a record raises part-way through.
    with pysam.VariantFile('hotspot.filtered.vcf', 'w',
                           header=vcf_file.header) as output_vcf:
        for record in vcf_file:
            gene = record.info.get('GENE', '')
            protein = record.info.get('AA', '')
            # NOTE(review): `in` is a substring test when the INFO value is a
            # str and an element test when it is a tuple -- confirm which one
            # the input VCF's header declarations produce.
            matched = {
                (g, p)
                for (g, p) in genes_and_proteins
                if g in gene and p in protein
            }
            if matched:
                # Write the record once, even when several pairs match it;
                # writing per matching pair duplicated records in the output.
                output_vcf.write(record)
                record_list |= matched

    no_record = genes_and_proteins.difference(record_list)

    # The message reads: "pairs for which no record was found:".
    print('没有获取到记录的有:', no_record)
|