pipeline/codes/pollution.py

#!/usr/bin/env python3
import argparse
import os

import pandas as pd


def single_monitoring(name, vcf, bed, freq_range, output_dir):
    vcf_header = []    # VCF header lines
    vcf_data = []      # VCF records to filter
    p_value_list = []  # frequencies that pass the filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    # Parse the BED file
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    for i in range(len(position_list)):
        # Keep SNVs (single-base REF and ALT) whose CHROM/POS match this BED entry
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            # The fifth FORMAT subfield holds the allele frequency, possibly with a '%' suffix
            p_value_str = line.split()[9].split(":")[4]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
                p_value_list.append(p_value)
    select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
    with open(select_vcf, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    # Count frequencies inside vs. outside the expected range for this probe
    count_normal = 0
    count_exception = 0
    for p_value in p_value_list:
        if freq_range[0] <= p_value <= freq_range[1]:
            count_normal += 1
        else:
            count_exception += 1
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
    else:
        z_score = count_exception / count_all  # fraction of out-of-range sites
    res = dict(
        barcode=name,
        count_normal=count_normal,
        count_exception=count_exception,
        z_score=z_score
    )
    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'), sep="\t", index=False)
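
# A minimal usage sketch (hypothetical paths, not part of the pipeline):
#   single_monitoring('sample1', 'sample1.vcf', 'panel.bed', [0.2930, 0.6753], 'out')
# would write out/sample1_select.vcf (header plus heterozygous SNVs) and
# out/sample1_pollution.csv with count_normal, count_exception and z_score.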


# Filter a VCF by the small BED, keeping track of unmatched positions
def select_position(vcf, bed, matched_file, unmatched_file):
    vcf_header = []  # VCF header lines
    vcf_data = []    # VCF records to filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    unmatched_data = []  # BED positions with no matching record
    for i in range(len(position_list)):
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        if not filtered_lines:  # no match at this position: record a placeholder
            unmatched_data.append(f"{chr_list[i]}\t{position_list[i]}\t.\t.\t.\t.\t.\n")
        result_data.extend(filtered_lines)
    with open(matched_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    with open(unmatched_file, 'w') as unmatched_output:
        unmatched_output.writelines(unmatched_data)
    return matched_file, unmatched_file
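
# Sketch of the placeholder row written for an unmatched BED position
# (hypothetical coordinates):
#   chr1\t12345\t.\t.\t.\t.\t.
# Seven tab-separated columns, mirroring the seven columns process_judge_vcf()
# emits for matched records, so the two files can later be merged and sorted.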


# Extract the target fields from a somatic/germline VCF and add a judge value
def process_judge_vcf(input_vcf, output_vcf):
    with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
        for line in input_file:
            if not line.startswith("#"):
                fields = line.strip().split('\t')
                info = fields[9].split(":")
                # Normalize the frequency, which may carry a '%' suffix as above
                freq_str = info[4]
                percentage = float(freq_str[:-1]) / 100 if freq_str.endswith("%") else float(freq_str)
                # Map the frequency to a genotype-like judge value b
                if 0.1 <= percentage <= 0.9:
                    b = 0.5  # heterozygous-like
                elif percentage < 0.1:
                    b = 0    # homozygous reference
                else:        # percentage > 0.9
                    b = 1    # homozygous alternate
                # Build the new output row
                new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4], info[4], str(b), info[2]])
                output_file.write(new_line + '\n')
    return output_vcf
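
# Worked example of the judge mapping (hypothetical frequencies):
#   0.45 -> b = 0.5, 0.03 -> b = 0, 0.97 -> b = 1
# A somatic/germline pair then agrees ("yes") only when both judge values match.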


def merge_and_sort_files(matched_file, unmatched_file, output_file):
    # If unmatched_file is empty, just sort matched_file into output_file
    if os.stat(unmatched_file).st_size == 0:
        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
        sorted_df = matched_df.sort_values(by=[0, 1])
        sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
        return output_file
    # Otherwise merge and sort both files
    matched_df = pd.read_csv(matched_file, sep='\t', header=None)
    unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)
    # Concatenate the data frames
    combined_df = pd.concat([matched_df, unmatched_df])
    # Sort by the first (chromosome) and second (position) columns
    sorted_df = combined_df.sort_values(by=[0, 1])
    # Write the sorted data to the output file
    sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
    return output_file
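
# A caveat, not a behavior change: when column 0 holds chromosome names as
# strings, sort_values(by=[0, 1]) orders them lexicographically, so with
# hypothetical rows 'chr10' sorts before 'chr2'; positions in column 1 are
# parsed as integers and still sort numerically within a chromosome.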


# Merge the somatic and germline files and compare their judge values
def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
    # Join the two files line by line
    with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
        merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}" for somatic_line, germline_line in
                        zip(somatic, germline)]
    # Write the merged data to the output file
    with open(output_merged_file, 'w') as output_file:
        output_file.write('\n'.join(merged_lines))
    # Compare the two judge columns and append the comparison result
    with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
        for line in merged:
            fields = line.strip().split('\t')
            if fields[5] == fields[12]:
                comparison_result = "yes"
            else:
                comparison_result = "no"
            final_output.write(f"{line.strip()}\t{comparison_result}\n")
    return output_merged_file, output_final_file
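
# Sketch of one merged row (hypothetical values): columns 0-6 come from the
# somatic file and columns 7-13 from the germline file, so fields[5] and
# fields[12] are the two judge values; 0.5 vs 0.5 -> "yes", 0.5 vs 1 -> "no".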


# Filter the VCF by the large BED for the cnvkit plot
def select_cnvkit_vcf(vcf, bed, output_file):
    vcf_header = []  # VCF header lines
    vcf_data = []    # VCF records to filter
    # Read the VCF line by line, separating header from records
    with open(vcf, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line)
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    position_list = list(df_position[2])
    chr_list = list(df_position[0])
    result_data = []
    for i in range(len(position_list)):
        filtered_lines = [line for line in vcf_data if
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and
                          len(line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
            p_value_str = line.split()[9].split(":")[4]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
    with open(output_file, 'w') as output:
        for header_line in vcf_header:
            output.write(header_line)
        for data_line in result_data:
            output.write(data_line)
    return output_file
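
# A minimal sketch (hypothetical paths):
#   select_cnvkit_vcf('germline.vcf', 'cnvkit_panel.bed', 'out/sample1_select_cnvkit.vcf')
# keeps the header plus the heterozygous SNVs (frequency in [0.1, 0.9]) at the
# large-BED positions, as input for the cnvkit plot.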


def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
    # Filter the somatic VCF by the BED
    select_position_output_file1 = os.path.join(output_dir, f'{name}_somatic_matched.vcf')
    select_position_output_file2 = os.path.join(output_dir, f'{name}_somatic_unmatched.vcf')
    somatic_matched_file, somatic_unmatched_file = select_position(somatic_vcf, ref_bed, select_position_output_file1,
                                                                   select_position_output_file2)
    # Filter the germline VCF by the BED
    select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
    select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
    germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
                                                                     select_position_output_file3,
                                                                     select_position_output_file4)
    # Somatic: convert frequencies to judge values
    process_judge_vcf_file1 = os.path.join(output_dir, f'{name}_somatic_matched_add_judge.vcf')
    somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
    # Germline: convert frequencies to judge values
    process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
    germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
    # Somatic: merge the records matched to the BED with the unmatched placeholders
    merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
    somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
                                               merge_and_sort_files_file1)
    # Germline: merge the records matched to the BED with the unmatched placeholders
    merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline_merged.vcf')
    germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
                                                merge_and_sort_files_file2)
    # Merge the somatic and germline merged files and compare their judge values
    result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
    result_file = os.path.join(output_dir, f'{name}_contaminate_result.txt')
    merge_and_compare_files(somatic_merged_file, germline_merged_file, result_pro_file, result_file)
    # Select the VCF for plotting
    cnvkit_output_file = os.path.join(output_dir, f'{name}_select_cnvkit.vcf')
    select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, cnvkit_output_file)
    # Remove intermediate files
    os.remove(select_position_output_file1)
    os.remove(select_position_output_file2)
    os.remove(select_position_output_file3)
    os.remove(select_position_output_file4)
    os.remove(process_judge_vcf_file1)
    os.remove(process_judge_vcf_file2)
    os.remove(merge_and_sort_files_file1)
    os.remove(merge_and_sort_files_file2)
    os.remove(result_pro_file)
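
# A minimal usage sketch (hypothetical paths):
#   paired_monitoring('sample1', 'somatic.vcf', 'germline.vcf',
#                     'panel.bed', 'cnvkit_panel.bed', 'out')
# leaves out/sample1_contaminate_result.txt and out/sample1_select_cnvkit.vcf;
# every intermediate file is deleted at the end.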


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Pollution Process Script")
    parser.add_argument('-n', '--name', help="Name for sample", required=True)
    parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
    parser.add_argument('-v', '--vcf', help="raw vcf for probe 160 or 17; somatic vcf for probe 682 or 624",
                        required=True)
    parser.add_argument('-v2', '--vcf2', help="germline vcf; required for probe 682 or 624", default='',
                        required=False, nargs='?')
    parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required for probe 682 or 624")
    parser.add_argument('-p', '--probe', help="probe: 682, 624, 160 or 17 for now", required=True)
    parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
    args = parser.parse_args()
    bed_path = os.path.realpath(args.ref_bed)
    print(f'Contamination check using ref_bed: {bed_path}')
    probe = args.probe
    if probe == '160' or probe == '17':
        # Expected heterozygous-frequency range per probe panel
        freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
        single_monitoring(args.name, args.vcf, bed_path, freq_range, args.output_dir)
    elif probe == '682' or probe == '624':
        if not args.vcf2:
            parser.error('--vcf2 is required for probe 682 or 624')
        if not args.cnvkit_bed:
            parser.error('--cnvkit_bed is required for probe 682 or 624')
        cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
        print(f'Contamination check using cnvkit_bed: {cnvkit_bed_path}')
        paired_monitoring(args.name, args.vcf, args.vcf2, bed_path, cnvkit_bed_path, args.output_dir)
    else:
        parser.error('probe error: 682, 624, 160 or 17 for now')
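
# Example invocations (hypothetical file names):
#   python pollution.py -n sample1 -b panel.bed -v sample1.vcf -p 160 -o out
#   python pollution.py -n sample2 -b panel.bed -v somatic.vcf -v2 germline.vcf \
#       -c cnvkit_panel.bed -p 682 -o out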