version 1.0

# PacBio HiFi human whole genome sequencing pipeline — singleton entry point.
# Wiring (as visible below): process_trgt_catalog prepares the TRGT catalog and
# fail-read bait files; upstream produces aligned reads plus small-variant and
# SV calls; downstream produces phased VCFs, methylation, TRGT, and PGx
# outputs; tertiary_analysis runs only when tertiary_map_file is provided.

import "humanwgs_structs.wdl"
import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration
import "process_trgt_catalog/process_trgt_catalog.wdl" as ProcessTrgtCatalog
import "upstream/upstream.wdl" as Upstream
import "downstream/downstream.wdl" as Downstream
import "tertiary/tertiary.wdl" as TertiaryAnalysis
import "wdl-common/wdl/tasks/utilities.wdl" as Utilities

workflow humanwgs_singleton {
  meta {
    description: "PacBio HiFi human whole genome sequencing pipeline for individual samples."
  }

  parameter_meta {
    sample_id: { name: "Unique identifier for the sample" }
    sex: { name: "Sample sex", choices: ["MALE", "FEMALE"] }
    hifi_reads: { name: "Array of paths to HiFi reads in unaligned BAM format." }
    fail_reads: { name: "Array of paths to failed reads in unaligned BAM format; optional" }
    phenotypes: { name: "Comma-delimited list of HPO codes for phenotypes" }
    ref_map_file: { name: "TSV containing reference genome file paths; must match backend" }
    tertiary_map_file: { name: "TSV containing tertiary analysis file paths and thresholds; must match backend" }
    max_reads_per_alignment_chunk: { name: "Maximum reads per alignment chunk" }
    pharmcat_min_coverage: { name: "Minimum coverage for PharmCAT" }
    gpu: { name: "Use GPU when possible" }
    backend: { name: "Backend where the workflow will be executed", choices: ["GCP", "Azure", "AWS-HealthOmics", "HPC"] }
    zones: { name: "Zones where compute will take place; required if backend is set to 'GCP'" }
    cpuPlatform: { help: "Optional minimum CPU platform to use for tasks on GCP" }
    gpuType: { name: "GPU type to use; required if gpu is set to `true` for cloud backends; must match backend" }
    container_registry: { name: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used. Must be set if backend is set to 'AWS-HealthOmics'", default: "quay.io/pacbio" }
    preemptible: { name: "Where possible, run tasks preemptibly" }
    debug_version: { name: "Debug version for testing purposes" }
  }

  input {
    String sample_id
    String? sex
    Array[File] hifi_reads
    Array[File]? fail_reads

    String phenotypes = "HP:0000001"

    File ref_map_file
    File? tertiary_map_file

    Int max_reads_per_alignment_chunk = 500000
    Int pharmcat_min_coverage = 10

    Boolean gpu = false

    # Backend configuration
    String backend
    String? zones
    String? cpuPlatform
    String? gpuType
    String? container_registry

    Boolean preemptible = true

    String? debug_version
  }

  # Resolve backend-specific runtime settings (zones, GPU type, registry).
  call BackendConfiguration.backend_configuration {
    input:
      backend = backend,
      zones = zones,
      cpuPlatform = cpuPlatform,
      gpuType = gpuType,
      container_registry = container_registry
  }

  # Choose spot (preemptible) vs. on-demand runtime attributes for all tasks.
  RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes

  # Two-column TSV of reference resources keyed by name (e.g. "fasta").
  Map[String, String] ref_map = read_map(ref_map_file)

  # Prepare the TRGT repeat catalog and the fail-read bait reference.
  call ProcessTrgtCatalog.process_trgt_catalog {
    input:
      trgt_catalog = ref_map["trgt_tandem_repeat_bed"], # !FileCoercion
      ref_fasta = ref_map["fasta"], # !FileCoercion
      ref_index = ref_map["fasta_index"], # !FileCoercion
      default_runtime_attributes = default_runtime_attributes
  }

  call Upstream.upstream {
    input:
      sample_id = sample_id,
      sex = sex,
      hifi_reads = hifi_reads,
      fail_reads = fail_reads,
      fail_reads_bed = process_trgt_catalog.include_fail_reads_bed,
      fail_reads_bait_fasta = process_trgt_catalog.fail_reads_bait_fasta,
      fail_reads_bait_index = process_trgt_catalog.fail_reads_bait_index,
      ref_map_file = ref_map_file,
      max_reads_per_alignment_chunk = max_reads_per_alignment_chunk,
      single_sample = true,
      gpu = gpu,
      default_runtime_attributes = default_runtime_attributes
  }

  call Downstream.downstream {
    input:
      sample_id = sample_id,
      # Use the sex inferred by upstream, not the user-supplied input.
      sex = upstream.inferred_sex,
      aligned_hifi_reads = upstream.aligned_hifi_reads,
      aligned_hifi_reads_index = upstream.aligned_hifi_reads_index,
      aligned_fail_reads = upstream.aligned_fail_reads,
      aligned_fail_reads_index = upstream.aligned_fail_reads_index,
      trgt_catalog = process_trgt_catalog.full_catalog,
      small_variant_vcf = upstream.small_variant_vcf,
      small_variant_vcf_index = upstream.small_variant_vcf_index,
      # upstream.sv_vcf is optional; select_first requires it to be defined here.
      sv_vcf = select_first([upstream.sv_vcf]),
      sv_vcf_index = select_first([upstream.sv_vcf_index]),
      pharmcat_min_coverage = pharmcat_min_coverage,
      ref_map_file = ref_map_file,
      default_runtime_attributes = default_runtime_attributes
  }

  # Pedigree sex codes: MALE -> "1", FEMALE -> "2", "" (unknown) -> ".".
  # NOTE(review): lookup succeeds only if upstream.inferred_sex is one of
  # these three keys — confirm upstream guarantees that.
  Map[String, String] pedigree_sex = {
    "MALE": "1",
    "FEMALE": "2",
    "": "."
  }

  # write sample metadata similar to pedigree format
  # family_id, sample_id, father_id, mother_id, sex, affected
  Array[String] sample_metadata = [
    sample_id,
    sample_id,
    ".",
    ".",
    pedigree_sex[upstream.inferred_sex],
    "2" # affected column; presumably "2" = affected — TODO confirm coding
  ]

  # Tertiary analysis is opt-in via tertiary_map_file.
  if (defined(tertiary_map_file)) {
    call TertiaryAnalysis.tertiary_analysis {
      input:
        # Single-sample cohort: one metadata row, not a trio/duo kid.
        sample_metadata = [sample_metadata],
        phenotypes = phenotypes,
        is_trio_kid = [false],
        is_duo_kid = [false],
        small_variant_vcf = downstream.phased_small_variant_vcf,
        small_variant_vcf_index = downstream.phased_small_variant_vcf_index,
        sv_vcf = downstream.phased_sv_vcf,
        sv_vcf_index = downstream.phased_sv_vcf_index,
        ref_map_file = ref_map_file,
        tertiary_map_file = select_first([tertiary_map_file]),
        default_runtime_attributes = default_runtime_attributes
    }
  }

  # Key/value pairs consolidated into the per-sample stats TSV.
  Array[Array[String]] stats = [
    ['sample_id', sample_id],
    ['read_count', downstream.stat_read_count],
    ['read_length_mean', downstream.stat_read_length_mean],
    ['read_length_median', downstream.stat_read_length_median],
    ['read_length_n50', downstream.stat_read_length_n50],
    ['read_quality_mean', downstream.stat_read_quality_mean],
    ['read_quality_median', downstream.stat_read_quality_median],
    ['mapped_read_count', downstream.stat_mapped_read_count],
    ['mapped_read_percent', downstream.stat_mapped_read_percent],
    ['gap_compressed_identity_mean', downstream.stat_gap_compressed_identity_mean],
    ['gap_compressed_identity_median', downstream.stat_gap_compressed_identity_median],
    ['depth_mean', upstream.stat_depth_mean],
    ['inferred_sex', upstream.inferred_sex],
    ['stat_phased_basepairs', downstream.stat_phased_basepairs],
    ['phase_block_ng50', downstream.stat_phase_block_ng50],
    ['cpg_combined_count', downstream.stat_combined_cpg_count],
    ['cpg_hap1_count', downstream.stat_hap1_cpg_count],
    ['cpg_hap2_count', downstream.stat_hap2_cpg_count],
    ['methbat_methylated_count', downstream.stat_methbat_methylated_count],
    ['methbat_unmethylated_count', downstream.stat_methbat_unmethylated_count],
    ['methbat_asm_count', downstream.stat_methbat_asm_count],
    ['SNV_count', downstream.stat_SNV_count],
    ['TSTV_ratio', downstream.stat_TSTV_ratio],
    ['HETHOM_ratio', downstream.stat_HETHOM_ratio],
    ['INDEL_count', downstream.stat_INDEL_count],
    ['sv_DUP_count', downstream.stat_sv_DUP_count],
    ['sv_DEL_count', downstream.stat_sv_DEL_count],
    ['sv_INS_count', downstream.stat_sv_INS_count],
    ['sv_INV_count', downstream.stat_sv_INV_count],
    ['sv_SWAP_count', downstream.stat_sv_SWAP_count],
    ['sv_BND_count', downstream.stat_sv_BND_count],
    ['trgt_genotyped_count', downstream.stat_trgt_genotyped_count],
    ['trgt_uncalled_count', downstream.stat_trgt_uncalled_count]
  ]

  call Utilities.consolidate_stats {
    input:
      id = sample_id,
      stats = stats,
      # NOTE(review): downstream.msg is included in the `msg` output below but
      # not in this msg_array — confirm the omission is intentional.
      msg_array = flatten([process_trgt_catalog.msg, upstream.msg]),
      runtime_attributes = default_runtime_attributes
  }

  output {
    # consolidated stats
    File stats_file = consolidate_stats.output_tsv
    File msg_file = consolidate_stats.messages

    # bam stats
    File bam_statistics = downstream.bam_statistics
    File read_length_plot = downstream.read_length_plot
    File? read_quality_plot = downstream.read_quality_plot
    File mapq_distribution_plot = downstream.mapq_distribution_plot
    File mg_distribution_plot = downstream.mg_distribution_plot
    String stat_read_count = downstream.stat_read_count
    String stat_read_length_mean = downstream.stat_read_length_mean
    String stat_read_length_median = downstream.stat_read_length_median
    String stat_read_length_n50 = downstream.stat_read_length_n50
    String stat_read_quality_mean = downstream.stat_read_quality_mean
    String stat_read_quality_median = downstream.stat_read_quality_median
    String stat_mapped_read_count = downstream.stat_mapped_read_count
    String stat_mapped_read_percent = downstream.stat_mapped_read_percent
    String stat_gap_compressed_identity_mean = downstream.stat_gap_compressed_identity_mean
    String stat_gap_compressed_identity_median = downstream.stat_gap_compressed_identity_median

    # merged, haplotagged alignments
    File merged_haplotagged_bam = downstream.merged_haplotagged_bam
    File merged_haplotagged_bam_index = downstream.merged_haplotagged_bam_index

    # mosdepth outputs
    File mosdepth_summary = upstream.mosdepth_summary
    File mosdepth_region_bed = upstream.mosdepth_region_bed
    File mosdepth_region_bed_index = upstream.mosdepth_region_bed_index
    File mosdepth_depth_distribution_plot = upstream.mosdepth_depth_distribution_plot
    String stat_depth_mean = upstream.stat_depth_mean
    String inferred_sex = upstream.inferred_sex

    # phasing stats
    File phase_stats = downstream.phase_stats
    File phase_blocks = downstream.phase_blocks
    File phase_haplotags = downstream.phase_haplotags
    String stat_phased_basepairs = downstream.stat_phased_basepairs
    String stat_phase_block_ng50 = downstream.stat_phase_block_ng50

    # methylation outputs and profile
    File? cpg_combined_bed = downstream.cpg_combined_bed
    File? cpg_combined_bed_index = downstream.cpg_combined_bed_index
    File? cpg_hap1_bed = downstream.cpg_hap1_bed
    File? cpg_hap1_bed_index = downstream.cpg_hap1_bed_index
    File? cpg_hap2_bed = downstream.cpg_hap2_bed
    File? cpg_hap2_bed_index = downstream.cpg_hap2_bed_index
    File? cpg_combined_bw = downstream.cpg_combined_bw
    File? cpg_hap1_bw = downstream.cpg_hap1_bw
    File? cpg_hap2_bw = downstream.cpg_hap2_bw
    String stat_cpg_hap1_count = downstream.stat_hap1_cpg_count
    String stat_cpg_hap2_count = downstream.stat_hap2_cpg_count
    String stat_cpg_combined_count = downstream.stat_combined_cpg_count
    File? methbat_profile = downstream.methbat_profile
    String stat_methbat_methylated_count = downstream.stat_methbat_methylated_count
    String stat_methbat_unmethylated_count = downstream.stat_methbat_unmethylated_count
    String stat_methbat_asm_count = downstream.stat_methbat_asm_count

    # sv outputs
    # (the select_first calls require these optional upstream outputs to exist)
    File phased_sv_vcf = downstream.phased_sv_vcf
    File phased_sv_vcf_index = downstream.phased_sv_vcf_index
    File sv_supporting_reads = select_first([upstream.sv_supporting_reads])
    File sv_copynum_bedgraph = select_first([upstream.sv_copynum_bedgraph])
    File sv_depth_bw = select_first([upstream.sv_depth_bw])
    File sv_gc_bias_corrected_depth_bw = select_first([upstream.sv_gc_bias_corrected_depth_bw])
    File sv_maf_bw = select_first([upstream.sv_maf_bw])
    File sv_copynum_summary = select_first([upstream.sv_copynum_summary])

    # sv stats
    String stat_sv_DUP_count = downstream.stat_sv_DUP_count
    String stat_sv_DEL_count = downstream.stat_sv_DEL_count
    String stat_sv_INS_count = downstream.stat_sv_INS_count
    String stat_sv_INV_count = downstream.stat_sv_INV_count
    String stat_sv_SWAP_count = downstream.stat_sv_SWAP_count
    String stat_sv_BND_count = downstream.stat_sv_BND_count

    # small variant outputs
    File phased_small_variant_vcf = downstream.phased_small_variant_vcf
    File phased_small_variant_vcf_index = downstream.phased_small_variant_vcf_index
    File small_variant_gvcf = upstream.small_variant_gvcf
    File small_variant_gvcf_index = upstream.small_variant_gvcf_index

    # small variant stats
    File small_variant_stats = downstream.small_variant_stats
    File bcftools_roh_out = downstream.bcftools_roh_out
    File bcftools_roh_bed = downstream.bcftools_roh_bed
    String stat_small_variant_SNV_count = downstream.stat_SNV_count
    String stat_small_variant_INDEL_count = downstream.stat_INDEL_count
    String stat_small_variant_TSTV_ratio = downstream.stat_TSTV_ratio
    String stat_small_variant_HETHOM_ratio = downstream.stat_HETHOM_ratio
    File snv_distribution_plot = downstream.snv_distribution_plot
    File indel_distribution_plot = downstream.indel_distribution_plot

    # trgt outputs
    File phased_trgt_vcf = downstream.trgt_vcf
    File phased_trgt_vcf_index = downstream.trgt_vcf_index
    File trgt_spanning_reads = downstream.trgt_spanning_reads
    File trgt_spanning_reads_index = downstream.trgt_spanning_reads_index
    File trgt_coverage_dropouts = downstream.trgt_coverage_dropouts
    String stat_trgt_genotyped_count = downstream.stat_trgt_genotyped_count
    String stat_trgt_uncalled_count = downstream.stat_trgt_uncalled_count

    # paraphase outputs
    File? paraphase_output_json = upstream.paraphase_output_json
    File? paraphase_realigned_bam = upstream.paraphase_realigned_bam
    File? paraphase_realigned_bam_index = upstream.paraphase_realigned_bam_index
    File? paraphase_vcfs = upstream.paraphase_vcfs

    # per sample mitorsaw outputs
    File mitorsaw_vcf = upstream.mitorsaw_vcf
    File mitorsaw_vcf_index = upstream.mitorsaw_vcf_index
    File mitorsaw_hap_stats = upstream.mitorsaw_hap_stats

    # PGx outputs
    File pbstarphase_json = downstream.pbstarphase_json
    File? pharmcat_match_json = downstream.pharmcat_match_json
    File? pharmcat_phenotype_json = downstream.pharmcat_phenotype_json
    File? pharmcat_report_html = downstream.pharmcat_report_html
    File? pharmcat_report_json = downstream.pharmcat_report_json

    # tertiary analysis outputs (defined only when tertiary_map_file was set)
    File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf
    File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index
    File? tertiary_small_variant_filtered_tsv = tertiary_analysis.small_variant_filtered_tsv
    File? tertiary_small_variant_compound_het_vcf = tertiary_analysis.small_variant_compound_het_vcf
    File? tertiary_small_variant_compound_het_vcf_index = tertiary_analysis.small_variant_compound_het_vcf_index
    File? tertiary_small_variant_compound_het_tsv = tertiary_analysis.small_variant_compound_het_tsv
    File? tertiary_sv_filtered_vcf = tertiary_analysis.sv_filtered_vcf
    File? tertiary_sv_filtered_vcf_index = tertiary_analysis.sv_filtered_vcf_index
    File? tertiary_sv_filtered_tsv = tertiary_analysis.sv_filtered_tsv

    # qc messages
    Array[String] msg = flatten(
      [
        process_trgt_catalog.msg,
        upstream.msg,
        downstream.msg
      ]
    )

    # workflow metadata
    String workflow_name = "humanwgs_singleton"
    # Appends "-<debug_version>" to the version string when debug_version is set.
    String workflow_version = "v3.2.1" + if defined(debug_version) then "~{"-" + debug_version}" else ""
  }
}