#!/usr/bin/env python3
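"""Contamination ("pollution") monitoring for panel VCFs.

Single-sample mode (probe 160/17) scores the fraction of heterozygous-band
variant frequencies that fall outside a panel-specific range; paired mode
(probe 682/624) compares somatic and germline genotype codes at shared
positions. Sample columns are parsed positionally, so the input VCFs are
assumed to come from a caller with a fixed, VarScan-like FORMAT layout in
which the frequency field may end with "%".
"""
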
import argparse
import os

import pandas as pd


def single_monitoring(name, vcf, bed, freq_range, output_dir):
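    """Single-sample contamination check.

    Keeps single-base variants at the BED positions whose frequency lies in
    [0.1, 0.9], writes them to <name>_select.vcf, and reports the fraction
    of those frequencies outside freq_range as z_score in
    <name>_pollution.csv.
    """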
    vcf_header = []  # stores the VCF header lines
    vcf_data = []  # stores the VCF data records
    p_value_list = []  # stores the frequencies that pass the filter

    # Read the VCF line by line, splitting header and data records.
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)

    # Load the BED file; column 0 is the chromosome, column 2 the position.
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []

    for i in range(len(position_list)):
        # Keep single-base (SNV) records at this BED position.
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i])
                          and len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            # The frequency is the 5th value of the sample column; it may end with "%".
            p_value_str = line.split()[9].split(":")[4]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
                p_value_list.append(p_value)

    select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
    with open(select_vcf, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)

    # Count frequencies inside and outside the expected heterozygous range.
    count_normal = 0
    count_exception = 0
    for p_value in p_value_list:
        if freq_range[0] <= p_value <= freq_range[1]:
            count_normal += 1
        else:
            count_exception += 1
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
    else:
        z_score = count_exception / count_all

    res = dict(
        barcode=name,
        count_normal=count_normal,
        count_exception=count_exception,
        z_score=z_score
    )
    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'), sep="\t", index=False)


# Filter the VCF using the small BED.
def select_position(vcf, bed, matched_file, unmatched_file):
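    """Split a VCF by the BED positions.

    Matching single-base variant records (plus the VCF header) go to
    matched_file; each BED position without a match gets a placeholder row
    in unmatched_file.
    """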
    vcf_header = []  # stores the VCF header lines
    vcf_data = []  # stores the VCF data records

    # Read the VCF line by line, splitting header and data records.
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)

    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    unmatched_data = []  # stores BED positions with no matching VCF record

    for i in range(len(position_list)):
        # Keep single-base (SNV) records at this BED position.
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i])
                          and len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        if not filtered_lines:  # no match at this position: record a placeholder row
            unmatched_data.append(f"{chr_list[i]}\t{position_list[i]}\t.\t.\t.\t.\t.\n")
        result_data.extend(filtered_lines)

    with open(matched_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)

    with open(unmatched_file, 'w') as unmatched_output:
        unmatched_output.writelines(unmatched_data)

    return matched_file, unmatched_file


# Process the somatic and germline VCFs to extract the target fields.
def process_judge_vcf(input_vcf, output_vcf):
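    """Reduce each VCF record to a compact genotype row.

    Emits CHROM, POS, REF, ALT, the 5th sample value, a genotype code
    derived from the frequency (7th sample value), and the 3rd sample
    value. The indices assume a fixed, VarScan-like FORMAT layout
    (e.g. GT:GQ:SDP:DP:RD:AD:FREQ).
    """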
    with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
        for line in input_file:
            if not line.startswith("#"):
                fields = line.strip().split('\t')
                info = fields[9].split(":")
                # The frequency is the 7th value of the sample column; like the
                # same field in select_cnvkit_vcf, it may end with "%".
                freq_str = info[6]
                percentage = float(freq_str[:-1]) / 100 if freq_str[-1] == "%" else float(freq_str)

                # Collapse the frequency into a genotype code:
                # 0.5 = heterozygous band, 0 = reference-like, 1 = homozygous-like.
                if 0.1 <= percentage <= 0.9:
                    b = 0.5
                elif percentage < 0.1:
                    b = 0
                else:  # percentage > 0.9
                    b = 1

                # Build the new output row.
                new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4], info[4], str(b), info[2]])
                output_file.write(new_line + '\n')
    return output_vcf


def merge_and_sort_files(matched_file, unmatched_file, output_file):
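    """Concatenate the matched and unmatched tables and sort them.

    Sorts by chromosome and position (columns 0 and 1). If unmatched_file
    is empty, the sorted matched_file alone is written to output_file.
    """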
    # If unmatched_file is empty, just sort matched_file into output_file.
    if os.stat(unmatched_file).st_size == 0:
        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
        sorted_df = matched_df.sort_values(by=[0, 1])
        sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
        return output_file

    # unmatched_file is not empty: merge the two tables and sort.
    if os.stat(matched_file).st_size == 0:
        matched_df = pd.DataFrame()
    else:
        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
    unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)

    # Concatenate the data frames.
    combined_df = pd.concat([matched_df, unmatched_df])

    # Sort by the first and second columns (chromosome, position).
    sorted_df = combined_df.sort_values(by=[0, 1])

    # Write the sorted data to the output file.
    sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
    return output_file


# Merge the somatic and germline tables.
def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
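    """Join the somatic and germline tables and flag genotype agreement.

    The tables are zipped line by line into output_merged_file; rows whose
    somatic and germline genotype codes (column 6 of each 7-column half)
    match are flagged "yes" in output_final_file, otherwise "no".
    """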
    # Merge the two files line by line.
    with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
        merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}"
                        for somatic_line, germline_line in zip(somatic, germline)]

    # Write the merged rows to the intermediate file.
    with open(output_merged_file, 'w') as output_file:
        output_file.write('\n'.join(merged_lines))

    # Compare the two genotype-code columns and append the result.
    with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
        for line in merged:
            fields = line.strip().split('\t')
            if fields[5] == fields[12]:
                comparison_result = "yes"
            else:
                comparison_result = "no"
            final_output.write(f"{line.strip()}\t{comparison_result}\n")
    return output_merged_file, output_final_file


# Filter the VCF by the large BED for the cnvkit plot.
def select_cnvkit_vcf(vcf, bed, output_file):
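    """Filter the VCF against the large BED for the cnvkit plot.

    Keeps single-base variants at BED positions whose frequency (7th
    sample value) lies in [0.1, 0.9] and writes them, with the header,
    to output_file.
    """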
    vcf_header = []  # stores the VCF header lines
    vcf_data = []  # stores the VCF data records

    # Read the VCF line by line, splitting header and data records.
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)

    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    for i in range(len(position_list)):
        # Keep single-base (SNV) records at this BED position.
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i])
                          and len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            # The frequency is the 7th value of the sample column; it may end with "%".
            p_value_str = line.split()[9].split(":")[6]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)

    with open(output_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)

    return output_file


def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
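    """Paired somatic/germline contamination check.

    Runs the select -> judge -> merge -> compare chain on both VCFs,
    writes <name>_contaminate_result.txt and <name>_select_cnvkit.vcf,
    and removes the intermediate files.
    """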
    # Filter the somatic VCF against the reference BED.
    select_position_output_file1 = os.path.join(output_dir, f'{name}_somatic_matched.vcf')
    select_position_output_file2 = os.path.join(output_dir, f'{name}_somatic_unmatched.vcf')
    somatic_matched_file, somatic_unmatched_file = select_position(somatic_vcf, ref_bed,
                                                                   select_position_output_file1,
                                                                   select_position_output_file2)
    # Filter the germline VCF against the reference BED.
    select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
    select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
    germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
                                                                     select_position_output_file3,
                                                                     select_position_output_file4)
    # Convert the somatic frequencies to genotype codes.
    process_judge_vcf_file1 = os.path.join(output_dir, f'{name}_somatic_matched_add_judge.vcf')
    somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
    # Convert the germline frequencies to genotype codes.
    process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
    germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
    # Merge the matched and unmatched somatic records.
    merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
    somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
                                               merge_and_sort_files_file1)
    # Merge the matched and unmatched germline records.
    merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline_merged.vcf')
    germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
                                                merge_and_sort_files_file2)
    # Merge the somatic and germline merged files and compare them.
    result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
    result_file = os.path.join(output_dir, f'{name}_contaminate_result.txt')
    merge_and_compare_files(somatic_merged_file, germline_merged_file, result_pro_file, result_file)

    # Filter the VCF used for the cnvkit plot.
    cnvkit_output_file = os.path.join(output_dir, f'{name}_select_cnvkit.vcf')
    select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, cnvkit_output_file)

    # Remove intermediate files.
    for intermediate_file in (select_position_output_file1, select_position_output_file2,
                              select_position_output_file3, select_position_output_file4,
                              process_judge_vcf_file1, process_judge_vcf_file2,
                              merge_and_sort_files_file1, merge_and_sort_files_file2,
                              result_pro_file):
        os.remove(intermediate_file)


if __name__ == '__main__':
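    # Example invocations (script and file names are illustrative):
    #   python pollution_process.py -n S1 -v S1.vcf -b ref.bed -p 160 -o out
    #   python pollution_process.py -n S2 -v S2_somatic.vcf -v2 S2_germline.vcf \
    #       -b ref.bed -c cnvkit.bed -p 682 -o out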
    parser = argparse.ArgumentParser(description="Pollution Process Script")

    parser.add_argument('-n', '--name', help="sample name", required=True)
    parser.add_argument('-b', '--ref_bed', help="reference BED file", required=True)
    parser.add_argument('-v', '--vcf', help="raw VCF for probe 160 or 17; somatic VCF for probe 682 or 624",
                        required=True)
    parser.add_argument('-v2', '--vcf2', help="germline VCF; required for probe 682 or 624", default='',
                        required=False, nargs='?')
    parser.add_argument('-c', '--cnvkit_bed', help="cnvkit BED file; required for probe 682 or 624")
    parser.add_argument('-p', '--probe', help="probe: 682, 624, 160, or 17 for now", required=True)
    parser.add_argument('-o', '--output_dir', help="output directory, default ./", default='')
    args = parser.parse_args()

    bed_path = os.path.realpath(args.ref_bed)
    print(f'Contamination check uses ref_bed: {bed_path}')
    probe = args.probe

    if probe == '160' or probe == '17':
        freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
        single_monitoring(args.name, args.vcf, bed_path, freq_range, args.output_dir)
    elif probe == '682' or probe == '624':
        if not args.vcf2:
            parser.error('--vcf2 is required for probe 682 or 624')
        if not args.cnvkit_bed:
            parser.error('--cnvkit_bed is required for probe 682 or 624')
        cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
        print(f'Contamination check uses cnvkit_bed: {cnvkit_bed_path}')
        paired_monitoring(args.name, args.vcf, args.vcf2, bed_path, cnvkit_bed_path, args.output_dir)
    else:
        parser.error('probe error: 682, 624, 160, or 17 for now')