#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Collect per-sample QC metrics into ``<output_dir>/qc/<sample_name>_qc.txt``.

Command line::

    python3 qc.py fastp_json_file bamdst_coverage_file \
        bamdst_depth_distribution.plot output_dir sample_name tumor

The report is a two-column, tab-separated text file (``label<TAB>value``)
built from:

* the ``summary`` section of the fastp JSON report,
* the ``key<TAB>value`` lines of the bamdst coverage report,
* one line of the bamdst depth-distribution file,
* the ``Rmdup depth`` column of
  ``<output_dir>/qc/<sample_name>_bamdst/depth.tsv.gz``,
* the value returned by ``gender_detect.main(output_dir, tumor)``.
"""
import sys
import json
import re
import os

import pandas as pd
from pandas import DataFrame,Series
import numpy as np

from gender_detect import main

USAGE = ("usage:python3 qc.py fastp_json_file bamdst_coverage_file "
         "bamdst_depth_distribution.plot output_dir sample_name tumor")


def _percent(numerator, denominator):
    """Return ``numerator / denominator * 100`` formatted with two decimals."""
    return '%.2f' % (numerator / denominator * 100)


def _read_bamdst_report(path):
    """Parse a bamdst coverage report into a ``{key: value}`` dict of strings.

    Lines starting with ``#`` are skipped.  Every other line is stripped and
    split on tabs; the first field becomes the key and the second the value.
    Lines with fewer than two fields (e.g. blank lines) are ignored instead
    of raising ``IndexError``.
    """
    metrics = {}
    with open(path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                continue
            fields = raw_line.strip().split('\t')
            if len(fields) < 2:
                continue
            metrics[fields[0]] = fields[1]
    return metrics


def _value_at_line(path, line_number, column=5):
    """Return, as a float, the tab-separated ``column`` of line ``line_number``.

    Both ``line_number`` and ``column`` are 1-based.  This is a pure-Python
    replacement for the shell pipeline ``sed -n <N>p <file> | cut -f5``: no
    shell is involved, so paths containing spaces or shell metacharacters
    are handled safely.

    Raises:
        ValueError: if the file has no such line (including line 0), the
            line has fewer than ``column`` fields, or the field is not a
            number.
    """
    if line_number >= 1:
        with open(path, 'r') as handle:
            for current, raw_line in enumerate(handle, start=1):
                if current < line_number:
                    continue
                fields = raw_line.rstrip('\n').split('\t')
                if len(fields) >= column:
                    return float(fields[column - 1])
                break
    raise ValueError(
        "cannot read column %d of line %d in %s"
        % (column, line_number, path)
    )


def write_qc_report(fastp_json, bamdst_txt, bamdst_depth, out_dir, name, tumor):
    """Write the QC report for one sample.

    Args:
        fastp_json: path to the fastp JSON report.
        bamdst_txt: path to the bamdst coverage report.
        bamdst_depth: path to the bamdst depth-distribution file.
        out_dir: output directory; ``<out_dir>/qc/`` must already exist.
        name: sample name used to build the report and depth-table paths.
        tumor: passed unchanged to ``gender_detect.main``.
    """
    report_path = ''.join([out_dir, "/qc/", name, "_qc.txt"])
    # The context manager guarantees the report is flushed and closed even
    # when one of the inputs is missing or malformed.
    with open(report_path, 'w') as out:

        def emit(label, value):
            out.write("%s\t%s\n" % (label, value))

        # --- fastp read/base statistics ---------------------------------
        with open(fastp_json, 'r') as handle:
            fastp = json.load(handle)
        before = fastp["summary"]["before_filtering"]
        after = fastp["summary"]["after_filtering"]

        emit("raw_reads", before["total_reads"])
        emit("raw_bases", before["total_bases"])
        emit("clean_reads", after["total_reads"])
        emit("clean_bases", after["total_bases"])
        emit("clean_reads_rate(%)",
             _percent(after["total_reads"], before["total_reads"]))
        # Q20/Q30 are taken from the pre-filtering statistics.
        emit("Q20(%)", '%.2f' % (before["q20_rate"] * 100))
        emit("Q30(%)", '%.2f' % (before["q30_rate"] * 100))

        # --- bamdst mapping / capture statistics ------------------------
        bamdst = _read_bamdst_report(bamdst_txt)

        emit("mapped_reads", bamdst["[Total] Mapped Reads"])
        mapped_reads = int(bamdst["[Total] Mapped Reads"])
        emit("mapped_rate(%)", _percent(mapped_reads, after["total_reads"]))
        emit("dup_reads", bamdst["[Total] PCR duplicate reads"])
        emit("dup_rate(%)",
             _percent(int(bamdst["[Total] PCR duplicate reads"]), mapped_reads))
        emit("probe_bed_size", bamdst["[Target] Len of region"])
        emit("target_reads", bamdst["[Target] Target Reads"])
        emit("capture_rate(reads)",
             _percent(int(bamdst["[Target] Target Reads"]), mapped_reads))
        emit("mean_depth_raw", bamdst["[Target] Average depth"])
        emit("mean_depth(dedup)", bamdst["[Target] Average depth(rmdup)"])
        emit("coverage(>0x)",
             bamdst["[Target] Coverage (>0x)"].replace('%', ''))
        # NOTE(review): the label says ">10x" but the value read is ">=10x";
        # the label is left untouched so downstream parsers keep working.
        emit("coverage(>10x)",
             bamdst["[Target] Coverage (>=10x)"].replace('%', ''))

        # --- uniformity: value at 0.2 * mean depth ----------------------
        # The rounded 0.2 * mean depth is used as a 1-based LINE NUMBER in
        # the depth-distribution file; column 5 of that line is reported as
        # a percentage.  Presumably that column is the cumulative fraction
        # of bases at or above the depth -- confirm against the bamdst docs.
        line_number = round(float(bamdst["[Target] Average depth"]) * 0.2)
        fraction = _value_at_line(bamdst_depth, line_number)
        emit("coverage(>=0.2*meanx)", '%.2f' % (fraction * 100))

        # --- 20th percentile of the per-position dedup depth ------------
        depth_tsv = ''.join((out_dir, '/qc/', name, '_bamdst/depth.tsv.gz'))
        depth_tsv_df = pd.read_csv(depth_tsv, compression='gzip', sep="\t")
        coverage_gtoe_80 = round(
            np.percentile(np.array(depth_tsv_df['Rmdup depth']), 20)
        )
        emit("coverage(>=80%)", coverage_gtoe_80)

        # --- gender call ------------------------------------------------
        emit("gender", main(out_dir, tumor))


def _cli(argv):
    """Validate the command line and run the report for ``argv``."""
    # Six positional arguments are read (argv[1] .. argv[6]), so argv needs
    # at least seven entries; the former ``< 6`` check let a five-argument
    # call through and crashed with IndexError on argv[6].
    if len(argv) < 7:
        print(USAGE)
        # Non-zero status so calling pipelines notice the usage error.
        sys.exit(1)
    write_qc_report(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6])


if __name__ == "__main__":
    _cli(sys.argv)