# pipeline/script/qc_stat.py

#!/usr/bin/env python3
#-*-coding:utf-8-*-
import sys
import json
import re
import os
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
from gender_detect import main

USAGE = "usage:python3 qc.py fastp_json_file bamdst_coverage_file bamdst_depth_distribution.plot output_dir sample_name tumor"


def parse_cli(argv):
    """Return the six positional arguments as a tuple, or None if too few.

    ``argv`` is the full argument vector including the program name, so seven
    entries are required: the script reads ``argv[1]`` through ``argv[6]``.
    """
    if len(argv) < 7:
        return None
    return tuple(argv[1:7])


def format_metric(label, value):
    """Return one ``label<TAB>value<NEWLINE>`` line of the QC report."""
    return ''.join([label, "\t", str(value), "\n"])


def percent(fraction):
    """Format ``fraction`` as a percentage string with two decimals."""
    return '%.2f' % (fraction * 100)


def parse_bamdst_coverage(lines):
    """Parse the lines of a bamdst coverage report into a ``{key: value}`` dict.

    Lines starting with ``#`` are ignored.  Every other line is stripped and
    split on tabs; the first field is the key and the second the value (kept as
    a string).  Lines with fewer than two fields (e.g. blank lines) are
    skipped instead of raising ``IndexError``.
    """
    stats = {}
    for line in lines:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue
        stats[fields[0]] = fields[1]
    return stats


def read_plot_field(lines, line_number, column):
    """Return the 1-based tab-separated ``column`` of the 1-based ``line_number``.

    Pure-Python equivalent of ``sed -n '<line_number>p' file | cut -f<column>``.

    Raises:
        ValueError: if ``line_number`` is below 1, the input has fewer lines
            than ``line_number``, or the selected line has too few columns.
    """
    if line_number < 1:
        raise ValueError("line number must be >= 1, got %s" % line_number)
    for current, line in enumerate(lines, start=1):
        if current != line_number:
            continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < column:
            raise ValueError(
                "line %s has %s column(s), expected at least %s"
                % (line_number, len(fields), column))
        return fields[column - 1].strip()
    raise ValueError("input has fewer than %s line(s)" % line_number)


def run_qc(fastp_json, bamdst_txt, bamdst_depth, out_dir, name, tumor):
    """Write ``<out_dir>/qc/<name>_qc.txt`` from fastp and bamdst outputs.

    Args:
        fastp_json: path to the fastp JSON report.
        bamdst_txt: path to the bamdst coverage report (tab-separated key/value).
        bamdst_depth: path to the bamdst depth distribution plot file.
        out_dir: output directory; ``<out_dir>/qc/`` must already exist and
            contain ``<name>_bamdst/depth.tsv.gz``.
        name: sample name used to build file names.
        tumor: forwarded unchanged to ``gender_detect.main``.

    Metrics are written in a fixed order as they are computed, so a failure
    part-way leaves a partially written report (same as before); the file
    handle itself is always closed.
    """
    report_path = ''.join([out_dir, "/qc/", name, "_qc.txt"])
    with open(report_path, 'w') as out:

        def emit(label, value):
            out.write(format_metric(label, value))

        # --- fastp read/base statistics ---
        with open(fastp_json, 'r') as handle:
            fastp = json.load(handle)
        before = fastp["summary"]["before_filtering"]
        after = fastp["summary"]["after_filtering"]
        emit("raw_reads", before["total_reads"])
        emit("raw_bases", before["total_bases"])
        emit("clean_reads", after["total_reads"])
        emit("clean_bases", after["total_bases"])
        emit("clean_reads_rate(%)",
             percent(after["total_reads"] / before["total_reads"]))
        # Q20/Q30 are taken from the pre-filtering summary.
        emit("Q20(%)", percent(before["q20_rate"]))
        emit("Q30(%)", percent(before["q30_rate"]))

        # --- bamdst mapping / capture statistics ---
        with open(bamdst_txt, 'r') as handle:
            bamdst = parse_bamdst_coverage(handle)
        mapped_reads = int(bamdst["[Total] Mapped Reads"])
        emit("mapped_reads", bamdst["[Total] Mapped Reads"])
        emit("mapped_rate(%)", percent(mapped_reads / after["total_reads"]))
        emit("dup_reads", bamdst["[Total] PCR duplicate reads"])
        emit("dup_rate(%)",
             percent(int(bamdst["[Total] PCR duplicate reads"]) / mapped_reads))
        emit("probe_bed_size", bamdst["[Target] Len of region"])
        emit("target_reads", bamdst["[Target] Target Reads"])
        emit("capture_rate(reads)",
             percent(int(bamdst["[Target] Target Reads"]) / mapped_reads))
        emit("mean_depth_raw", bamdst["[Target] Average depth"])
        emit("mean_depth(dedup)", bamdst["[Target] Average depth(rmdup)"])
        emit("coverage(>0x)",
             bamdst["[Target] Coverage (>0x)"].replace('%', ''))
        emit("coverage(>10x)",
             bamdst["[Target] Coverage (>=10x)"].replace('%', ''))

        # --- coverage at 0.2 x mean depth ---
        # The rounded 0.2*mean depth is used as a 1-based line number into the
        # depth distribution file; the fifth column of that line is reported.
        # NOTE(review): presumably column 5 is a cumulative fraction and line N
        # corresponds to a specific depth - confirm against the bamdst docs.
        uniformity_line = round(float(bamdst["[Target] Average depth"]) * 0.2)
        with open(bamdst_depth, 'r') as handle:
            fraction = float(read_plot_field(handle, uniformity_line, 5))
        emit("coverage(>=0.2*meanx)", percent(fraction))

        # --- 20th percentile of the per-position dedup depth ---
        depth_tsv = ''.join((out_dir, '/qc/', name, '_bamdst/depth.tsv.gz'))
        depth_df = pd.read_csv(depth_tsv, compression='gzip', sep="\t")
        depth_p20 = round(np.percentile(np.array(depth_df['Rmdup depth']), 20))
        emit("coverage(>=80%)", depth_p20)

        # `main` is the gender detection entry point imported from gender_detect.
        emit("gender", main(out_dir, tumor))


if __name__ == "__main__":
    cli_args = parse_cli(sys.argv)
    if cli_args is None:
        print(USAGE)
        # Exit status is left as in the original (0) so callers see no change.
        sys.exit()
    run_qc(*cli_args)