pipeline/script/correct_genome_3rule.py

77 lines
2.3 KiB
Python
Executable File

#! /usr/bin/env python3
# -*-coding:utf-8-*-
import re
import subprocess
import sys
# if len(sys.argv) < 2:
# print("usage:python genome_3rule.py input_vcf output_vcf reference")
# sys.exit()
def getseq(ref, chr, start, end):
cmd = f'samtools faidx {ref} {chr}:{start}-{end}'
refseq = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8")
refseq = re.sub(f">\w+:\d+-\d+|\n", "", refseq.stdout)
return refseq
def find_index(query, refseq):
if len(list(set(query))) == 1:
query = list(set(query))[0]
bool = 1
index = 0
pattern = re.compile(query)
dup = pattern.match(refseq)
if dup:
index = len(query) * bool
while dup:
dup = pattern.match(refseq, index)
if dup:
bool += 1
index = len(query) * bool
return index
if len(sys.argv) < 4:
print("usage:python genome_3rule.py input_vcf output_vcf reference")
sys.exit()
else:
input = sys.argv[1]
output = sys.argv[2]
ref = sys.argv[3]
out = open(output, 'w')
with open(input, 'r') as file:
for line in file:
if re.match(r'^#', line):
out.write(line)
else:
lines = line.split('\t')
if len(lines[3]) == 1 and len(lines[4]) == 1:
out.write(line)
else:
query = lines[3] if len(lines[3]) > 1 else lines[4]
query = "".join(list(query)[1:])
(chr, pos) = lines[0:2]
start = int(pos) + 1
end = int(pos) + 100
refseq = getseq(ref, chr, start, end)
index = find_index(query, refseq)
if index:
if len(lines[3]) > 1:
lines[1] = str(int(pos) + index - len(query))
refbase = getseq(ref, chr, lines[1], lines[1])
lines[3] = str(refbase + query)
lines[4] = str(refbase)
else:
lines[1] = str(int(pos) + index)
refbase = getseq(ref, chr, lines[1], lines[1])
lines[4] = str(refbase + query)
lines[3] = str(refbase)
out.write('\t'.join(lines))
else:
out.write(line)