pipeline/codes/pollution.py

#!/usr/bin/env python3
import argparse
import os

import pandas as pd


def single_monitoring(name, vcf, bed, freq_range, output_dir):
    vcf_header = []    # VCF header lines
    vcf_data = []      # VCF records to filter
    p_value_list = []  # frequencies that pass the filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    # Parse the BED file
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    for i in range(len(position_list)):
        # Keep SNVs (single-base REF and ALT) whose CHROM/POS match this BED entry
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            # The fifth FORMAT subfield holds the allele frequency, possibly with a '%' suffix
            p_value_str = line.split()[9].split(":")[4]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
                p_value_list.append(p_value)
    select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
    with open(select_vcf, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    # Count frequencies inside vs. outside the expected range for this probe
    count_normal = 0
    count_exception = 0
    for p_value in p_value_list:
        if freq_range[0] <= p_value <= freq_range[1]:
            count_normal += 1
        else:
            count_exception += 1
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
    else:
        z_score = count_exception / count_all  # fraction of out-of-range sites
    res = dict(
        barcode=name,
        count_normal=count_normal,
        count_exception=count_exception,
        z_score=z_score
    )
    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'), sep="\t", index=False)
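
# A minimal usage sketch (hypothetical paths, not part of the pipeline):
#   single_monitoring('sample1', 'sample1.vcf', 'panel.bed', [0.2930, 0.6753], 'out')
# would write out/sample1_select.vcf (header plus heterozygous SNVs) and
# out/sample1_pollution.csv with count_normal, count_exception and z_score.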


# Filter a VCF by the small BED, keeping track of unmatched positions
def select_position(vcf, bed, matched_file, unmatched_file):
    vcf_header = []  # VCF header lines
    vcf_data = []    # VCF records to filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    unmatched_data = []  # BED positions with no matching record
    for i in range(len(position_list)):
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        if not filtered_lines:  # no match at this position: record a placeholder
            unmatched_data.append(f"{chr_list[i]}\t{position_list[i]}\t.\t.\t.\t.\t.\n")
        result_data.extend(filtered_lines)
    with open(matched_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    with open(unmatched_file, 'w') as unmatched_output:
        unmatched_output.writelines(unmatched_data)
    return matched_file, unmatched_file
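
# Sketch of the placeholder row written for an unmatched BED position
# (hypothetical coordinates):
#   chr1\t12345\t.\t.\t.\t.\t.
# Seven tab-separated columns, mirroring the seven columns process_judge_vcf()
# emits for matched records, so the two files can later be merged and sorted.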


# Extract the target fields from a somatic/germline VCF and add a judge value
def process_judge_vcf(input_vcf, output_vcf):
    with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
        for line in input_file:
            if not line.startswith("#"):
                fields = line.strip().split('\t')
                info = fields[9].split(":")
                # Normalize the frequency, which may carry a '%' suffix as above
                freq_str = info[4]
                percentage = float(freq_str[:-1]) / 100 if freq_str.endswith("%") else float(freq_str)
                # Map the frequency to a genotype-like judge value b
                if 0.1 <= percentage <= 0.9:
                    b = 0.5  # heterozygous-like
                elif percentage < 0.1:
                    b = 0    # homozygous reference
                else:        # percentage > 0.9
                    b = 1    # homozygous alternate
                # Build the new output row
                new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4], info[4], str(b), info[2]])
                output_file.write(new_line + '\n')
    return output_vcf
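
# Worked example of the judge mapping (hypothetical frequencies):
#   0.45 -> b = 0.5, 0.03 -> b = 0, 0.97 -> b = 1
# A somatic/germline pair then agrees ("yes") only when both judge values match.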


def merge_and_sort_files(matched_file, unmatched_file, output_file):
    # If unmatched_file is empty, just sort matched_file into output_file
    if os.stat(unmatched_file).st_size == 0:
        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
        sorted_df = matched_df.sort_values(by=[0, 1])
        sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
        return output_file
    # Otherwise merge and sort both files
    matched_df = pd.read_csv(matched_file, sep='\t', header=None)
    unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)
    # Concatenate the data frames
    combined_df = pd.concat([matched_df, unmatched_df])
    # Sort by the first (chromosome) and second (position) columns
    sorted_df = combined_df.sort_values(by=[0, 1])
    # Write the sorted data to the output file
    sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
    return output_file
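
# A caveat, not a behavior change: when column 0 holds chromosome names as
# strings, sort_values(by=[0, 1]) orders them lexicographically, so with
# hypothetical rows 'chr10' sorts before 'chr2'; positions in column 1 are
# parsed as integers and still sort numerically within a chromosome.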


# Merge the somatic and germline files and compare their judge values
def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
    # Join the two files line by line
    with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
        merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}" for somatic_line, germline_line in
                        zip(somatic, germline)]
    # Write the merged data to the output file
    with open(output_merged_file, 'w') as output_file:
        output_file.write('\n'.join(merged_lines))
    # Compare the two judge columns and append the comparison result
    with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
        for line in merged:
            fields = line.strip().split('\t')
            if fields[5] == fields[12]:
                comparison_result = "yes"
            else:
                comparison_result = "no"
            final_output.write(f"{line.strip()}\t{comparison_result}\n")
    return output_merged_file, output_final_file
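
# Sketch of one merged row (hypothetical values): columns 0-6 come from the
# somatic file and columns 7-13 from the germline file, so fields[5] and
# fields[12] are the two judge values; 0.5 vs 0.5 -> "yes", 0.5 vs 1 -> "no".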


# Filter the VCF by the large BED for the cnvkit plot
def select_cnvkit_vcf(vcf, bed, output_file):
    vcf_header = []  # VCF header lines
    vcf_data = []    # VCF records to filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    for i in range(len(position_list)):
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            p_value_str = line.split()[9].split(":")[4]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
    with open(output_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    return output_file
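
# A minimal sketch (hypothetical paths):
#   select_cnvkit_vcf('germline.vcf', 'cnvkit_panel.bed', 'out/sample1_select_cnvkit.vcf')
# keeps the header plus the heterozygous SNVs (frequency in [0.1, 0.9]) at the
# large-BED positions, as input for the cnvkit plot.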


def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
    # Filter the somatic VCF by the BED
    select_position_output_file1 = os.path.join(output_dir, f'{name}_somatic_matched.vcf')
    select_position_output_file2 = os.path.join(output_dir, f'{name}_somatic_unmatched.vcf')
    somatic_matched_file, somatic_unmatched_file = select_position(somatic_vcf, ref_bed, select_position_output_file1,
                                                                   select_position_output_file2)
    # Filter the germline VCF by the BED
    select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
    select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
    germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
                                                                     select_position_output_file3,
                                                                     select_position_output_file4)
    # Somatic: convert frequencies to judge values
    process_judge_vcf_file1 = os.path.join(output_dir, f'{name}_somatic_matched_add_judge.vcf')
    somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
    # Germline: convert frequencies to judge values
    process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
    germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
    # Somatic: merge the records matched to the BED with the unmatched placeholders
    merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
    somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
                                               merge_and_sort_files_file1)
    # Germline: merge the records matched to the BED with the unmatched placeholders
    merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline_merged.vcf')
    germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
                                                merge_and_sort_files_file2)
    # Merge the somatic and germline merged files and compare their judge values
    result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
    result_file = os.path.join(output_dir, f'{name}_contaminate_result.txt')
    merge_and_compare_files(somatic_merged_file, germline_merged_file, result_pro_file, result_file)
    # Select the VCF for plotting
    cnvkit_output_file = os.path.join(output_dir, f'{name}_select_cnvkit.vcf')
    select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, cnvkit_output_file)
    # Remove intermediate files
    os.remove(select_position_output_file1)
    os.remove(select_position_output_file2)
    os.remove(select_position_output_file3)
    os.remove(select_position_output_file4)
    os.remove(process_judge_vcf_file1)
    os.remove(process_judge_vcf_file2)
    os.remove(merge_and_sort_files_file1)
    os.remove(merge_and_sort_files_file2)
    os.remove(result_pro_file)
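
# A minimal usage sketch (hypothetical paths):
#   paired_monitoring('sample1', 'somatic.vcf', 'germline.vcf',
#                     'panel.bed', 'cnvkit_panel.bed', 'out')
# leaves out/sample1_contaminate_result.txt and out/sample1_select_cnvkit.vcf;
# every intermediate file is deleted at the end.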


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Pollution Process Script")
    parser.add_argument('-n', '--name', help="Name for sample", required=True)
    parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
    parser.add_argument('-v', '--vcf', help="raw vcf for probe 160 or 17; somatic vcf for probe 682 or 624",
                        required=True)
    parser.add_argument('-v2', '--vcf2', help="germline vcf; required for probe 682 or 624", default='',
                        required=False, nargs='?')
    parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required for probe 682 or 624")
    parser.add_argument('-p', '--probe', help="probe: 682, 624, 160 or 17 for now", required=True)
    parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
    args = parser.parse_args()
    bed_path = os.path.realpath(args.ref_bed)
    print(f'Contamination check using ref_bed: {bed_path}')
    probe = args.probe
    if probe == '160' or probe == '17':
        # Expected heterozygous-frequency range per probe panel
        freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
        single_monitoring(args.name, args.vcf, bed_path, freq_range, args.output_dir)
    elif probe == '682' or probe == '624':
        if not args.vcf2:
            parser.error('--vcf2 is required for probe 682 or 624')
        if not args.cnvkit_bed:
            parser.error('--cnvkit_bed is required for probe 682 or 624')
        cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
        print(f'Contamination check using cnvkit_bed: {cnvkit_bed_path}')
        paired_monitoring(args.name, args.vcf, args.vcf2, bed_path, cnvkit_bed_path, args.output_dir)
    else:
        parser.error('probe error: 682, 624, 160 or 17 for now')
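
# Example invocations (hypothetical file names):
#   python pollution.py -n sample1 -b panel.bed -v sample1.vcf -p 160 -o out
#   python pollution.py -n sample2 -b panel.bed -v somatic.vcf -v2 germline.vcf \
#       -c cnvkit_panel.bed -p 682 -o out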