version 1.0 import "humanwgs_structs.wdl" import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration import "process_trgt_catalog/process_trgt_catalog.wdl" as ProcessTrgtCatalog import "upstream/upstream.wdl" as Upstream import "joint/joint.wdl" as Joint import "downstream/downstream.wdl" as Downstream import "wdl-common/wdl/tasks/bcftools.wdl" as Bcftools import "wdl-common/wdl/tasks/trgt.wdl" as Trgt import "tertiary/tertiary.wdl" as TertiaryAnalysis import "wdl-common/wdl/tasks/utilities.wdl" as Utilities workflow humanwgs_family { meta { description: "PacBio HiFi human whole genome sequencing pipeline, with joint calling for related samples." } parameter_meta { family: { name: "Family struct describing samples, relationships, and unaligned BAM paths" } phenotypes: { name: "Comma-delimited list of HPO codes for phenotypes" } ref_map_file: { name: "TSV containing reference genome file paths; must match backend" } tertiary_map_file: { name: "TSV containing tertiary analysis file paths and thresholds; must match backend" } max_reads_per_alignment_chunk: { name: "Maximum reads per alignment chunk" } pharmcat_min_coverage: { name: "Minimum coverage for PharmCAT" } glnexus_mem_gb: { name: "Override GLnexus memory request (GB)" } gpu: { name: "Use GPU when possible" } backend: { name: "Backend where the workflow will be executed", choices: ["GCP", "Azure", "AWS-HealthOmics", "HPC"] } zones: { name: "Zones where compute will take place; required if backend is set to 'GCP'" } cpuPlatform: { help: "Optional minimum CPU platform to use for tasks on GCP" } gpuType: { name: "GPU type to use; required if gpu is set to `true` for cloud backends; must match backend" } container_registry: { name: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used. Must be set if backend is set to 'AWS-HealthOmics'" } preemptible: { name: "Where possible, run tasks preemptibly" } debug_version: { name: "Debug version for testing purposes" } } input { Family family String phenotypes = "HP:0000001" File ref_map_file File? tertiary_map_file Int max_reads_per_alignment_chunk = 500000 Int pharmcat_min_coverage = 10 Int? glnexus_mem_gb Boolean gpu = false # Backend configuration String backend String? zones String? cpuPlatform String? gpuType String? container_registry Boolean preemptible = true String? debug_version } call BackendConfiguration.backend_configuration { input: backend = backend, zones = zones, cpuPlatform = cpuPlatform, gpuType = gpuType, container_registry = container_registry } RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes Map [String, String] ref_map = read_map(ref_map_file) call ProcessTrgtCatalog.process_trgt_catalog { input: trgt_catalog = ref_map["trgt_tandem_repeat_bed"], # !FileCoercion ref_fasta = ref_map["fasta"], # !FileCoercion ref_index = ref_map["fasta_index"], # !FileCoercion default_runtime_attributes = default_runtime_attributes } Boolean single_sample = length(family.samples) == 1 Map[String, String] pedigree_sex = { "MALE": "1", "FEMALE": "2", "": "." } scatter (sample in family.samples) { String sample_id = sample.sample_id Boolean is_trio_kid = defined(sample.father_id) && defined(sample.mother_id) # !UnusedDeclaration Boolean is_duo_kid = defined(sample.father_id) != defined(sample.mother_id) # !UnusedDeclaration call Upstream.upstream { input: sample_id = sample.sample_id, sex = sample.sex, hifi_reads = sample.hifi_reads, fail_reads = sample.fail_reads, fail_reads_bed = process_trgt_catalog.include_fail_reads_bed, fail_reads_bait_fasta = process_trgt_catalog.fail_reads_bait_fasta, fail_reads_bait_index = process_trgt_catalog.fail_reads_bait_index, ref_map_file = ref_map_file, max_reads_per_alignment_chunk = max_reads_per_alignment_chunk, single_sample = single_sample, gpu = gpu, default_runtime_attributes = default_runtime_attributes } # write sample metadata similar to pedigree format # family_id, sample_id, father_id, mother_id, sex, affected Array[String] sample_metadata = [ family.family_id, sample.sample_id, select_first([sample.father_id, "."]), select_first([sample.mother_id, "."]), pedigree_sex[upstream.inferred_sex], if sample.affected then "2" else "1" ] } if (!single_sample) { call Joint.joint { input: family_id = family.family_id, sample_ids = sample_id, gvcfs = upstream.small_variant_gvcf, gvcf_indices = upstream.small_variant_gvcf_index, discover_tars = upstream.discover_tar, aligned_bams = upstream.aligned_hifi_reads, aligned_bam_indices = upstream.aligned_hifi_reads_index, ref_map_file = ref_map_file, glnexus_mem_gb = glnexus_mem_gb, default_runtime_attributes = default_runtime_attributes } } scatter (sample_index in range(length(family.samples))) { call Downstream.downstream { input: sample_id = sample_id[sample_index], sex = upstream.inferred_sex[sample_index], aligned_hifi_reads = upstream.aligned_hifi_reads[sample_index], aligned_hifi_reads_index = upstream.aligned_hifi_reads_index[sample_index], aligned_fail_reads = upstream.aligned_fail_reads[sample_index], aligned_fail_reads_index = upstream.aligned_fail_reads_index[sample_index], trgt_catalog = process_trgt_catalog.full_catalog, small_variant_vcf = select_first([joint.split_joint_small_variant_vcfs, upstream.small_variant_vcf])[sample_index], small_variant_vcf_index = select_first([joint.split_joint_small_variant_vcf_indices, upstream.small_variant_vcf_index])[sample_index], sv_vcf = select_first([joint.split_joint_structural_variant_vcfs, select_all(upstream.sv_vcf)])[sample_index], sv_vcf_index = select_first([joint.split_joint_structural_variant_vcf_indices, select_all(upstream.sv_vcf_index)])[sample_index], pharmcat_min_coverage = pharmcat_min_coverage, ref_map_file = ref_map_file, default_runtime_attributes = default_runtime_attributes } } if (!single_sample) { call Bcftools.bcftools_merge as merge_small_variant_vcfs { input: vcfs = downstream.phased_small_variant_vcf, vcf_indices = downstream.phased_small_variant_vcf_index, out_prefix = "~{family.family_id}.joint.~{ref_map['name']}.small_variants.phased", runtime_attributes = default_runtime_attributes } call Bcftools.bcftools_merge as merge_sv_vcfs { input: vcfs = downstream.phased_sv_vcf, vcf_indices = downstream.phased_sv_vcf_index, out_prefix = "~{family.family_id}.joint.~{ref_map['name']}.structural_variants.phased", runtime_attributes = default_runtime_attributes } call Trgt.trgt_merge { input: vcfs = downstream.trgt_vcf, vcf_indices = downstream.trgt_vcf_index, ref_fasta = ref_map["fasta"], # !FileCoercion ref_index = ref_map["fasta_index"], # !FileCoercion out_prefix = "~{family.family_id}.merged.~{ref_map['name']}.trgt", runtime_attributes = default_runtime_attributes } } if (defined(tertiary_map_file)) { call TertiaryAnalysis.tertiary_analysis { input: sample_metadata = sample_metadata, phenotypes = phenotypes, is_trio_kid = is_trio_kid, is_duo_kid = is_duo_kid, small_variant_vcf = select_first([merge_small_variant_vcfs.merged_vcf, downstream.phased_small_variant_vcf[0]]), small_variant_vcf_index = select_first([merge_small_variant_vcfs.merged_vcf_index, downstream.phased_small_variant_vcf_index[0]]), sv_vcf = select_first([merge_sv_vcfs.merged_vcf, downstream.phased_sv_vcf[0]]), sv_vcf_index = select_first([merge_sv_vcfs.merged_vcf_index, downstream.phased_sv_vcf_index[0]]), ref_map_file = ref_map_file, tertiary_map_file = select_first([tertiary_map_file]), default_runtime_attributes = default_runtime_attributes } } Array[Array[String]] stats = [ flatten([['sample_id'], sample_id]), flatten([['read_count'], downstream.stat_read_count]), flatten([['read_length_mean'], downstream.stat_read_length_mean]), flatten([['read_length_median'], downstream.stat_read_length_median]), flatten([['read_length_n50'], downstream.stat_read_length_n50]), flatten([['read_quality_mean'], downstream.stat_read_quality_mean]), flatten([['read_quality_median'], downstream.stat_read_quality_median]), flatten([['mapped_read_count'], downstream.stat_mapped_read_count]), flatten([['mapped_read_percent'], downstream.stat_mapped_read_percent]), flatten([['gap_compressed_identity_mean'], downstream.stat_gap_compressed_identity_mean]), flatten([['gap_compressed_identity_median'], downstream.stat_gap_compressed_identity_median]), flatten([['depth_mean'], upstream.stat_depth_mean]), flatten([['inferred_sex'], upstream.inferred_sex]), flatten([['stat_phased_basepairs'], downstream.stat_phased_basepairs]), flatten([['phase_block_ng50'], downstream.stat_phase_block_ng50]), flatten([['cpg_combined_count'], downstream.stat_combined_cpg_count]), flatten([['cpg_hap1_count'], downstream.stat_hap1_cpg_count]), flatten([['cpg_hap2_count'], downstream.stat_hap2_cpg_count]), flatten([['methbat_methylated_count'], downstream.stat_methbat_methylated_count]), flatten([['methbat_unmethylated_count'], downstream.stat_methbat_unmethylated_count]), flatten([['methbat_asm_count'], downstream.stat_methbat_asm_count]), flatten([['SNV_count'], downstream.stat_SNV_count]), flatten([['TSTV_ratio'], downstream.stat_TSTV_ratio]), flatten([['HETHOM_ratio'], downstream.stat_HETHOM_ratio]), flatten([['INDEL_count'], downstream.stat_INDEL_count]), flatten([['sv_DUP_count'], downstream.stat_sv_DUP_count]), flatten([['sv_DEL_count'], downstream.stat_sv_DEL_count]), flatten([['sv_INS_count'], downstream.stat_sv_INS_count]), flatten([['sv_INV_count'], downstream.stat_sv_INV_count]), flatten([['sv_SWAP_count'], downstream.stat_sv_SWAP_count]), flatten([['sv_BND_count'], downstream.stat_sv_BND_count]), flatten([['trgt_genotyped_count'], downstream.stat_trgt_genotyped_count]), flatten([['trgt_uncalled_count'], downstream.stat_trgt_uncalled_count]) ] call Utilities.consolidate_stats { input: id = family.family_id, stats = stats, msg_array = flatten([process_trgt_catalog.msg, flatten(upstream.msg)]), runtime_attributes = default_runtime_attributes } output { # to maintain order of samples Array[String] sample_ids = sample_id File stats_file = consolidate_stats.output_tsv File msg_file = consolidate_stats.messages # bam stats Array[File] bam_statistics = downstream.bam_statistics Array[File] read_length_plot = downstream.read_length_plot Array[File?] read_quality_plot = downstream.read_quality_plot Array[File] mapq_distribution_plot = downstream.mapq_distribution_plot Array[File] mg_distribution_plot = downstream.mg_distribution_plot Array[String] stat_read_count = downstream.stat_read_count Array[String] stat_read_length_mean = downstream.stat_read_length_mean Array[String] stat_read_length_median = downstream.stat_read_length_median Array[String] stat_read_length_n50 = downstream.stat_read_length_n50 Array[String] stat_read_quality_mean = downstream.stat_read_quality_mean Array[String] stat_read_quality_median = downstream.stat_read_quality_median Array[String] stat_mapped_read_count = downstream.stat_mapped_read_count Array[String] stat_mapped_read_percent = downstream.stat_mapped_read_percent Array[String] stat_gap_compressed_identity_mean = downstream.stat_gap_compressed_identity_mean Array[String] stat_gap_compressed_identity_median = downstream.stat_gap_compressed_identity_median # merged, haplotagged alignments Array[File] merged_haplotagged_bam = downstream.merged_haplotagged_bam Array[File] merged_haplotagged_bam_index = downstream.merged_haplotagged_bam_index # mosdepth outputs Array[File] mosdepth_summary = upstream.mosdepth_summary Array[File] mosdepth_region_bed = upstream.mosdepth_region_bed Array[File] mosdepth_region_bed_index = upstream.mosdepth_region_bed_index Array[File] mosdepth_depth_distribution_plot = upstream.mosdepth_depth_distribution_plot Array[String] stat_depth_mean = upstream.stat_depth_mean Array[String] inferred_sex = upstream.inferred_sex # phasing stats Array[File] phase_stats = downstream.phase_stats Array[File] phase_blocks = downstream.phase_blocks Array[File] phase_haplotags = downstream.phase_haplotags Array[String] stat_phased_basepairs = downstream.stat_phased_basepairs Array[String] stat_phase_block_ng50 = downstream.stat_phase_block_ng50 # methylation outputs and profile Array[File?] cpg_combined_bed = downstream.cpg_combined_bed Array[File?] cpg_combined_bed_index = downstream.cpg_combined_bed_index Array[File?] cpg_hap1_bed = downstream.cpg_hap1_bed Array[File?] cpg_hap1_bed_index = downstream.cpg_hap1_bed_index Array[File?] cpg_hap2_bed = downstream.cpg_hap2_bed Array[File?] cpg_hap2_bed_index = downstream.cpg_hap2_bed_index Array[File?] cpg_combined_bw = downstream.cpg_combined_bw Array[File?] cpg_hap1_bw = downstream.cpg_hap1_bw Array[File?] cpg_hap2_bw = downstream.cpg_hap2_bw Array[String] stat_cpg_hap1_count = downstream.stat_hap1_cpg_count Array[String] stat_cpg_hap2_count = downstream.stat_hap2_cpg_count Array[String] stat_cpg_combined_count = downstream.stat_combined_cpg_count Array[File?] methbat_profile = downstream.methbat_profile Array[String] stat_methbat_methylated_count = downstream.stat_methbat_methylated_count Array[String] stat_methbat_unmethylated_count = downstream.stat_methbat_unmethylated_count Array[String] stat_methbat_asm_count = downstream.stat_methbat_asm_count # sv outputs Array[File] phased_sv_vcf = downstream.phased_sv_vcf Array[File] phased_sv_vcf_index = downstream.phased_sv_vcf_index File sv_supporting_reads = select_first([joint.sv_supporting_reads, upstream.sv_supporting_reads[0]]) Array[File] sv_copynum_bedgraph = select_first([joint.sv_copynum_bedgraph, select_all(upstream.sv_copynum_bedgraph)]) Array[File] sv_depth_bw = select_first([joint.sv_depth_bw, select_all(upstream.sv_depth_bw)]) Array[File] sv_gc_bias_corrected_depth_bw = select_first([joint.sv_gc_bias_corrected_depth_bw, select_all(upstream.sv_gc_bias_corrected_depth_bw)]) Array[File] sv_maf_bw = select_first([joint.sv_maf_bw, select_all(upstream.sv_maf_bw)]) Array[File] sv_copynum_summary = select_first([joint.sv_copynum_summary, select_all(upstream.sv_copynum_summary)]) # sv stats Array[String] stat_sv_DUP_count = downstream.stat_sv_DUP_count Array[String] stat_sv_DEL_count = downstream.stat_sv_DEL_count Array[String] stat_sv_INS_count = downstream.stat_sv_INS_count Array[String] stat_sv_INV_count = downstream.stat_sv_INV_count Array[String] stat_sv_SWAP_count = downstream.stat_sv_SWAP_count Array[String] stat_sv_BND_count = downstream.stat_sv_BND_count # small variant outputs Array[File] phased_small_variant_vcf = downstream.phased_small_variant_vcf Array[File] phased_small_variant_vcf_index = downstream.phased_small_variant_vcf_index Array[File] small_variant_gvcf = upstream.small_variant_gvcf Array[File] small_variant_gvcf_index = upstream.small_variant_gvcf_index # small variant stats Array[File] small_variant_stats = downstream.small_variant_stats Array[File] bcftools_roh_out = downstream.bcftools_roh_out Array[File] bcftools_roh_bed = downstream.bcftools_roh_bed Array[String] stat_small_variant_SNV_count = downstream.stat_SNV_count Array[String] stat_small_variant_INDEL_count = downstream.stat_INDEL_count Array[String] stat_small_variant_TSTV_ratio = downstream.stat_TSTV_ratio Array[String] stat_small_variant_HETHOM_ratio = downstream.stat_HETHOM_ratio Array[File] snv_distribution_plot = downstream.snv_distribution_plot Array[File] indel_distribution_plot = downstream.indel_distribution_plot # trgt outputs Array[File] phased_trgt_vcf = downstream.trgt_vcf Array[File] phased_trgt_vcf_index = downstream.trgt_vcf_index Array[File] trgt_spanning_reads = downstream.trgt_spanning_reads Array[File] trgt_spanning_reads_index = downstream.trgt_spanning_reads_index Array[File] trgt_coverage_dropouts = downstream.trgt_coverage_dropouts Array[String] stat_trgt_genotyped_count = downstream.stat_trgt_genotyped_count Array[String] stat_trgt_uncalled_count = downstream.stat_trgt_uncalled_count # paraphase outputs Array[File?] paraphase_output_json = upstream.paraphase_output_json Array[File?] paraphase_realigned_bam = upstream.paraphase_realigned_bam Array[File?] paraphase_realigned_bam_index = upstream.paraphase_realigned_bam_index Array[File?] paraphase_vcfs = upstream.paraphase_vcfs # per sample mitorsaw outputs Array[File] mitorsaw_vcf = upstream.mitorsaw_vcf Array[File] mitorsaw_vcf_index = upstream.mitorsaw_vcf_index Array[File] mitorsaw_hap_stats = upstream.mitorsaw_hap_stats # PGx outputs Array[File] pbstarphase_json = downstream.pbstarphase_json Array[File?] pharmcat_match_json = downstream.pharmcat_match_json Array[File?] pharmcat_phenotype_json = downstream.pharmcat_phenotype_json Array[File?] pharmcat_report_html = downstream.pharmcat_report_html Array[File?] pharmcat_report_json = downstream.pharmcat_report_json # joint call outputs File? joint_small_variants_vcf = merge_small_variant_vcfs.merged_vcf File? joint_small_variants_vcf_index = merge_small_variant_vcfs.merged_vcf_index File? joint_sv_vcf = merge_sv_vcfs.merged_vcf File? joint_sv_vcf_index = merge_sv_vcfs.merged_vcf_index File? joint_trgt_vcf = trgt_merge.merged_vcf File? joint_trgt_vcf_index = trgt_merge.merged_vcf_index # tertiary analysis outputs File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index File? tertiary_small_variant_filtered_tsv = tertiary_analysis.small_variant_filtered_tsv File? tertiary_small_variant_compound_het_vcf = tertiary_analysis.small_variant_compound_het_vcf File? tertiary_small_variant_compound_het_vcf_index = tertiary_analysis.small_variant_compound_het_vcf_index File? tertiary_small_variant_compound_het_tsv = tertiary_analysis.small_variant_compound_het_tsv File? tertiary_sv_filtered_vcf = tertiary_analysis.sv_filtered_vcf File? tertiary_sv_filtered_vcf_index = tertiary_analysis.sv_filtered_vcf_index File? tertiary_sv_filtered_tsv = tertiary_analysis.sv_filtered_tsv # qc messages Array[String] msg = flatten( [ process_trgt_catalog.msg, flatten(upstream.msg), flatten(downstream.msg) ] ) # workflow metadata String workflow_name = "humanwgs_family" String workflow_version = "v3.2.0" + if defined(debug_version) then "~{"-" + debug_version}" else "" } }