70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
|
|
# -*-coding:utf-8-*-
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
|
||
|
|
if len(sys.argv) < 2:
|
||
|
|
print("usage:python genome_3rule.py input_vcf output_vcf reference")
|
||
|
|
sys.exit()
|
||
|
|
|
||
|
|
ref = sys.argv[3]
|
||
|
|
|
||
|
|
|
||
|
|
def getseq(ref, chr, start, end):
|
||
|
|
cmd = f'samtools faidx {ref} {chr}:{start}-{end}'
|
||
|
|
refseq = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8")
|
||
|
|
refseq = re.sub(f">\w+:\d+-\d+|\n", "", refseq.stdout)
|
||
|
|
return refseq
|
||
|
|
|
||
|
|
|
||
|
|
def find_index(query, refseq):
|
||
|
|
if len(list(set(query))) == 1:
|
||
|
|
query = list(set(query))[0]
|
||
|
|
bool = 1
|
||
|
|
index = 0
|
||
|
|
pattern = re.compile(query)
|
||
|
|
dup = pattern.match(refseq)
|
||
|
|
if dup:
|
||
|
|
index = len(query) * bool
|
||
|
|
while dup:
|
||
|
|
dup = pattern.match(refseq, index)
|
||
|
|
if dup:
|
||
|
|
bool += 1
|
||
|
|
index = len(query) * bool
|
||
|
|
return index
|
||
|
|
|
||
|
|
|
||
|
|
input = sys.argv[1]
|
||
|
|
output = sys.argv[2]
|
||
|
|
out = open(output, 'w')
|
||
|
|
with open(input, 'r') as file:
|
||
|
|
for line in file:
|
||
|
|
if re.match(r'^#', line):
|
||
|
|
out.write(line)
|
||
|
|
else:
|
||
|
|
lines = line.split('\t')
|
||
|
|
if len(lines[3]) == 1 and len(lines[4]) == 1:
|
||
|
|
out.write(line)
|
||
|
|
else:
|
||
|
|
query = lines[3] if len(lines[3]) > 1 else lines[4]
|
||
|
|
query = "".join(list(query)[1:])
|
||
|
|
(chr, pos) = lines[0:2]
|
||
|
|
start = int(pos) + 1
|
||
|
|
end = int(pos) + 100
|
||
|
|
refseq = getseq(ref, chr, start, end)
|
||
|
|
index = find_index(query, refseq)
|
||
|
|
if index:
|
||
|
|
if len(lines[3]) > 1:
|
||
|
|
lines[1] = str(int(pos) + index - len(query))
|
||
|
|
refbase = getseq(ref, chr, lines[1], lines[1])
|
||
|
|
lines[3] = str(refbase + query)
|
||
|
|
lines[4] = str(refbase)
|
||
|
|
else:
|
||
|
|
lines[1] = str(int(pos) + index)
|
||
|
|
refbase = getseq(ref, chr, lines[1], lines[1])
|
||
|
|
lines[4] = str(refbase + query)
|
||
|
|
lines[3] = str(refbase)
|
||
|
|
out.write('\t'.join(lines))
|
||
|
|
else:
|
||
|
|
out.write(line)
|