#! /usr/bin/env python3 # -*-coding:utf-8-*- import re import subprocess import sys if len(sys.argv) < 2: print("usage:python genome_3rule.py input_vcf output_vcf reference") sys.exit() ref = sys.argv[3] def getseq(ref, chr, start, end): cmd = f'samtools faidx {ref} {chr}:{start}-{end}' refseq = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8") refseq = re.sub(f">\w+:\d+-\d+|\n", "", refseq.stdout) return refseq def find_index(query, refseq): if len(list(set(query))) == 1: query = list(set(query))[0] bool = 1 index = 0 pattern = re.compile(query) dup = pattern.match(refseq) if dup: index = len(query) * bool while dup: dup = pattern.match(refseq, index) if dup: bool += 1 index = len(query) * bool return index input = sys.argv[1] output = sys.argv[2] out = open(output, 'w') with open(input, 'r') as file: for line in file: if re.match(r'^#', line): out.write(line) else: lines = line.split('\t') if len(lines[3]) == 1 and len(lines[4]) == 1: out.write(line) else: query = lines[3] if len(lines[3]) > 1 else lines[4] query = "".join(list(query)[1:]) (chr, pos) = lines[0:2] start = int(pos) + 1 end = int(pos) + 100 refseq = getseq(ref, chr, start, end) index = find_index(query, refseq) if index: if len(lines[3]) > 1: lines[1] = str(int(pos) + index - len(query)) refbase = getseq(ref, chr, lines[1], lines[1]) lines[3] = str(refbase + query) lines[4] = str(refbase) else: lines[1] = str(int(pos) + index) refbase = getseq(ref, chr, lines[1], lines[1]) lines[4] = str(refbase + query) lines[3] = str(refbase) out.write('\t'.join(lines)) else: out.write(line)