version 1.0

import "downstream/downstream.wdl" as Downstream
import "process_trgt_catalog/process_trgt_catalog.wdl" as ProcessTrgtCatalog
import "tertiary/tertiary.wdl" as TertiaryAnalysis
import "upstream/upstream.wdl" as Upstream
import "wdl-common/wdl/tasks/utilities.wdl" as Utilities
import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration

# Single-sample entry point.
#
# Call order, as wired below:
#   1. backend_configuration  -> runtime attributes (spot or on-demand)
#   2. process_trgt_catalog   -> TRGT catalog and fail-reads BED / bait index
#   3. upstream               -> consumes hifi_reads / fail_reads and the reference map
#   4. downstream             -> consumes upstream alignments and variant calls
#   5. tertiary_analysis      -> only when tertiary_map_file is provided
#   6. consolidate_stats      -> writes the stats table and the messages file
workflow humanwgs_singleton {
  meta {
    description: "PacBio HiFi human whole genome sequencing pipeline for individual samples."
    outputs: {
      stats_file: { description: "Table of summary statistics" },
      msg_file: { description: "File containing messages from the workflow" },
      bam_statistics: { description: "BAM statistics" },
      read_length_plot: { description: "Distribution of read lengths" },
      read_quality_plot: { description: "Distribution of read qualities" },
      mapq_distribution_plot: { description: "Distribution of mapping quality per alignment" },
      mg_distribution_plot: { description: "Distribution of gap-compressed identity per alignment" },
      stat_read_count: { description: "Number of reads" },
      stat_read_length_mean: { description: "Mean read length" },
      stat_read_length_median: { description: "Median read length" },
      stat_read_length_n50: { description: "Read length N50" },
      stat_read_quality_mean: { description: "Mean read quality" },
      stat_read_quality_median: { description: "Median read quality" },
      stat_mapped_read_count: { description: "Number of reads mapped to reference" },
      stat_mapped_read_percent: { description: "Percent of reads mapped to reference" },
      stat_gap_compressed_identity_mean: { description: "Mean gap-compressed identity" },
      stat_gap_compressed_identity_median: { description: "Median gap-compressed identity" },
      merged_haplotagged_bam: { description: "Merged, haplotagged alignments" },
      merged_haplotagged_bam_index: { description: "Index for merged, haplotagged alignments" },
      mosdepth_summary: { description: "Summary of aligned read depth" },
      mosdepth_region_bed: { description: "Median aligned read depth by 500bp windows" },
      mosdepth_region_bed_index: { description: "Index for median aligned read depth by 500bp windows" },
      mosdepth_depth_distribution_plot: { description: "Distribution of aligned read depth" },
      stat_depth_mean: { description: "Mean depth" },
      inferred_sex: { description: "Inferred sex" },
      phase_stats: { description: "Phasing statistics" },
      phase_blocks: { description: "Phase blocks" },
      phase_haplotags: { description: "Per-read phase assignment" },
      stat_phased_basepairs: { description: "Number of basepairs within phase blocks" },
      stat_phase_block_ng50: { description: "Phase block NG50" },
      cpg_combined_bed: { description: "5mCpG combined BED" },
      cpg_combined_bed_index: { description: "Index for 5mCpG combined BED" },
      cpg_hap1_bed: { description: "5mCpG haplotype 1 BED" },
      cpg_hap1_bed_index: { description: "Index for 5mCpG haplotype 1 BED" },
      cpg_hap2_bed: { description: "5mCpG haplotype 2 BED" },
      cpg_hap2_bed_index: { description: "Index for 5mCpG haplotype 2 BED" },
      cpg_combined_bw: { description: "5mCpG combined BigWig" },
      cpg_hap1_bw: { description: "5mCpG haplotype 1 BigWig" },
      cpg_hap2_bw: { description: "5mCpG haplotype 2 BigWig" },
      stat_cpg_hap1_count: { description: "Number of scored reference 5mCpGs in haplotype 1" },
      stat_cpg_hap2_count: { description: "Number of scored reference 5mCpGs in haplotype 2" },
      stat_cpg_combined_count: { description: "Number of scored reference 5mCpGs combined" },
      methbat_profile: { description: "MethBat 5mCpG profile" },
      stat_methbat_methylated_count: { description: "Number of profiled regions labeled as methylated" },
      stat_methbat_unmethylated_count: { description: "Number of profiled regions labeled as unmethylated" },
      stat_methbat_asm_count: { description: "Number of profiled regions labeled as having allele-specific methylation" },
      phased_sv_vcf: { description: "Phased structural variant VCF" },
      phased_sv_vcf_index: { description: "Index for phased structural variant VCF" },
      sv_supporting_reads: { description: "Supporting reads for structural variants" },
      sv_copynum_bedgraph: { description: "CNV copy number BEDGraph" },
      sv_depth_bw: { description: "CNV depth BigWig" },
      sv_gc_bias_corrected_depth_bw: { description: "CNV GC-bias corrected depth BigWig" },
      sv_maf_bw: { description: "CNV MAF BigWig" },
      sv_copynum_summary: { description: "CNV copy number summary JSON" },
      stat_sv_DUP_count: { description: "Number of DUP structural variants" },
      stat_sv_DEL_count: { description: "Number of DEL structural variants" },
      stat_sv_INS_count: { description: "Number of INS structural variants" },
      stat_sv_INV_count: { description: "Number of INV structural variants" },
      stat_sv_SWAP_count: { description: "Number of structural variant sequence swap events" },
      stat_sv_BND_count: { description: "Number of BND structural variants" },
      phased_small_variant_vcf: { description: "Phased small variant VCF" },
      phased_small_variant_vcf_index: { description: "Index for phased small variant VCF" },
      small_variant_gvcf: { description: "Small variant GVCF" },
      small_variant_gvcf_index: { description: "Index for small variant GVCF" },
      small_variant_stats: { description: "Small variant statistics" },
      bcftools_roh_out: { description: "Regions of homozygosity" },
      bcftools_roh_bed: { description: "Regions of homozygosity BED" },
      stat_small_variant_SNV_count: { description: "Number of SNVs" },
      stat_small_variant_INDEL_count: { description: "Number of INDELs" },
      stat_small_variant_TSTV_ratio: { description: "Ts/Tv ratio" },
      stat_small_variant_HETHOM_ratio: { description: "Het/Hom ratio for SNVs" },
      snv_distribution_plot: { description: "Distribution of SNVs by REF, ALT" },
      indel_distribution_plot: { description: "Distribution of indels by size" },
      phased_trgt_vcf: { description: "Phased TRGT VCF" },
      phased_trgt_vcf_index: { description: "Index for phased TRGT VCF" },
      trgt_spanning_reads: { description: "Aligned TRGT spanning reads" },
      trgt_spanning_reads_index: { description: "Index for aligned TRGT spanning reads" },
      trgt_coverage_dropouts: { description: "TRGT regions with coverage dropouts" },
      stat_trgt_genotyped_count: { description: "Number of sites genotyped by TRGT" },
      stat_trgt_uncalled_count: { description: "Number of sites ungenotyped by TRGT" },
      paraphase_summary: { description: "Paraphase summary" },
      paraphase_realigned_bam: { description: "BAM file of reads realigned by Paraphase" },
      paraphase_realigned_bam_index: { description: "Index for BAM file of reads realigned by Paraphase" },
      paraphase_vcfs: { description: "Paraphase VCFs" },
      mitorsaw_vcf: { description: "Mitochondrial variant VCF" },
      mitorsaw_vcf_index: { description: "Index for mitochondrial variant VCF" },
      mitorsaw_hap_stats: { description: "Mitochondrial haplotype statistics" },
      pbstarphase_summary: { description: "StarPhase summary" },
      pharmcat_match_json: { description: "PharmCAT match JSON" },
      pharmcat_phenotype_json: { description: "PharmCAT phenotype JSON" },
      pharmcat_report_html: { description: "PharmCAT report HTML" },
      pharmcat_report_json: { description: "PharmCAT report JSON" },
      tertiary_small_variant_filtered_vcf: { description: "Filtered, annotated small variant VCF" },
      tertiary_small_variant_filtered_vcf_index: { description: "Index for filtered, annotated small variant VCF" },
      tertiary_small_variant_filtered_tsv: { description: "Filtered, annotated small variant TSV" },
      tertiary_small_variant_compound_het_vcf: { description: "Filtered, annotated compound heterozygous small variant VCF" },
      tertiary_small_variant_compound_het_vcf_index: { description: "Index for filtered, annotated compound heterozygous small variant VCF" },
      tertiary_small_variant_compound_het_tsv: { description: "Filtered, annotated compound heterozygous small variant TSV" },
      tertiary_sv_filtered_vcf: { description: "Filtered, annotated structural variant VCF" },
      tertiary_sv_filtered_vcf_index: { description: "Index for filtered, annotated structural variant VCF" },
      tertiary_sv_filtered_tsv: { description: "Filtered, annotated structural variant TSV" },
      msg: { description: "Messages from the workflow" },
      workflow_name: { description: "Workflow name" },
      workflow_version: { description: "Workflow version" }
    }
  }

  parameter_meta {
    sample_id: { description: "Unique identifier for the sample" }
    sex: { description: "Sample sex", choices: [ "MALE", "FEMALE" ] }
    hifi_reads: { description: "Array of paths to hifi_reads in unaligned BAM format" }
    fail_reads: { description: "Array of paths to fail_reads in unaligned BAM format" }
    phenotypes: { description: "Comma-delimited list of HPO terms for phenotypes", external_help: "https://hpo.jax.org" }
    ref_map_file: { description: "TSV containing reference genome file paths; must match backend" }
    tertiary_map_file: { description: "TSV containing tertiary analysis file paths and thresholds; must match backend" }
    max_reads_per_alignment_chunk: { description: "Maximum reads per alignment chunk" }
    pharmcat_min_coverage: { description: "Minimum coverage for PharmCAT" }
    use_gpu: { description: "Use GPU when possible" }
    use_parabricks_deepvariant: { description: "Use Parabricks DeepVariant for small variant calling when GPU is enabled" }
    backend: { description: "Backend where the workflow will be executed", choices: [ "GCP", "Azure", "AWS-HealthOmics", "HPC" ] }
    zones: { description: "Zones where compute will take place; required if backend is set to 'GCP'" }
    cpuPlatform: { description: "Optional minimum CPU platform to use for tasks on GCP" }
    gpuType: { description: "GPU type to use; required if use_gpu is set to `true` for cloud backends; must match backend" }
    container_registry: { description: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used. Must be set if backend is set to 'AWS-HealthOmics'", default: "quay.io/pacbio" }
    preemptible: { description: "Where possible, run tasks preemptibly" }
    debug_version: { description: "Debug version for testing purposes" }
  }

  input {
    String sample_id
    String? sex
    Array[File] hifi_reads
    Array[File]? fail_reads

    String phenotypes = "HP:0000001"

    File ref_map_file
    File? tertiary_map_file

    Int max_reads_per_alignment_chunk = 500000
    Int pharmcat_min_coverage = 10

    Boolean use_gpu = false
    Boolean use_parabricks_deepvariant = false

    # Backend configuration
    String backend
    String? zones
    String? cpuPlatform
    String? gpuType
    String? container_registry

    Boolean preemptible = true

    String? debug_version
  }

  call BackendConfiguration.backend_configuration {
    input:
      backend = backend,
      zones = zones,
      cpuPlatform = cpuPlatform,
      gpuType = gpuType,
      container_registry = container_registry
  }

  # One set of runtime attributes is shared by every call below; `preemptible`
  # selects between the spot and on-demand variants.
  RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes

  #@ except: DeclarationName
  Map[String, String] ref_map = read_map(ref_map_file)

  # ref_map values are Strings that are coerced to File at the call boundary;
  # the `!FileCoercion` markers must stay on the lines they annotate.
  call ProcessTrgtCatalog.process_trgt_catalog {
    input:
      trgt_catalog = ref_map["trgt_tandem_repeat_bed"], # !FileCoercion
      ref_fasta = ref_map["fasta"], # !FileCoercion
      ref_index = ref_map["fasta_index"], # !FileCoercion
      default_runtime_attributes = default_runtime_attributes
  }

  call Upstream.upstream {
    input:
      sample_id = sample_id,
      sex = sex,
      hifi_reads = hifi_reads,
      fail_reads = fail_reads,
      fail_reads_bed = process_trgt_catalog.include_fail_reads_bed,
      fail_reads_bait_index = process_trgt_catalog.fail_reads_bait_index,
      ref_map_file = ref_map_file,
      max_reads_per_alignment_chunk = max_reads_per_alignment_chunk,
      single_sample = true,
      use_gpu = use_gpu,
      use_parabricks_deepvariant = use_parabricks_deepvariant,
      default_runtime_attributes = default_runtime_attributes
  }

  call Downstream.downstream {
    input:
      sample_id = sample_id,
      sex = upstream.inferred_sex,
      aligned_hifi_reads = upstream.aligned_hifi_reads,
      aligned_hifi_reads_index = upstream.aligned_hifi_reads_index,
      aligned_fail_reads = upstream.aligned_fail_reads,
      aligned_fail_reads_index = upstream.aligned_fail_reads_index,
      trgt_catalog = process_trgt_catalog.full_catalog,
      small_variant_vcf = upstream.small_variant_vcf,
      small_variant_vcf_index = upstream.small_variant_vcf_index,
      # select_first() fails at runtime if the value is undefined.
      # NOTE(review): presumably always defined because single_sample = true
      # is passed to upstream - confirm against upstream.wdl.
      sv_vcf = select_first([ upstream.sv_vcf ]),
      sv_vcf_index = select_first([ upstream.sv_vcf_index ]),
      pharmcat_min_coverage = pharmcat_min_coverage,
      ref_map_file = ref_map_file,
      default_runtime_attributes = default_runtime_attributes
  }

  # Translates the sex string into the code written to the sample metadata row.
  # NOTE(review): the lookup below has no fallback; an inferred_sex value other
  # than "MALE", "FEMALE" or "" would have no matching key - verify the values
  # upstream can emit.
  Map[String, String] pedigree_sex = {
    "MALE": "1",
    "FEMALE": "2",
    "": "."
  }

  # write sample metadata similar to pedigree format
  # family_id, sample_id, father_id, mother_id, sex, affected
  Array[String] sample_metadata = [
    sample_id,
    sample_id,
    ".",
    ".",
    pedigree_sex[upstream.inferred_sex],
    "2"
  ]

  # Tertiary analysis is optional and runs only when a tertiary map is supplied,
  # which is why every tertiary_* workflow output is declared File?.
  if (defined(tertiary_map_file)) {
    call TertiaryAnalysis.tertiary_analysis {
      input:
        sample_metadata = [ sample_metadata ],
        phenotypes = phenotypes,
        is_trio_kid = [ false ],
        is_duo_kid = [ false ],
        small_variant_vcf = downstream.phased_small_variant_vcf,
        small_variant_vcf_index = downstream.phased_small_variant_vcf_index,
        sv_vcf = downstream.phased_sv_vcf,
        sv_vcf_index = downstream.phased_sv_vcf_index,
        ref_map_file = ref_map_file,
        tertiary_map_file = select_first([ tertiary_map_file ]),
        default_runtime_attributes = default_runtime_attributes
    }
  }

  # [ key, value ] rows handed to consolidate_stats.
  # NOTE(review): "stat_phased_basepairs" is the only key that keeps the
  # "stat_" prefix; it is left unchanged because consumers of the stats file
  # may depend on the exact key.
  Array[Array[String]] stats = [
    [ "sample_id", sample_id ],
    [ "read_count", downstream.stat_read_count ],
    [ "read_length_mean", downstream.stat_read_length_mean ],
    [ "read_length_median", downstream.stat_read_length_median ],
    [ "read_length_n50", downstream.stat_read_length_n50 ],
    [ "read_quality_mean", downstream.stat_read_quality_mean ],
    [ "read_quality_median", downstream.stat_read_quality_median ],
    [ "mapped_read_count", downstream.stat_mapped_read_count ],
    [ "mapped_read_percent", downstream.stat_mapped_read_percent ],
    [ "gap_compressed_identity_mean", downstream.stat_gap_compressed_identity_mean ],
    [ "gap_compressed_identity_median", downstream.stat_gap_compressed_identity_median ],
    [ "depth_mean", upstream.stat_depth_mean ],
    [ "inferred_sex", upstream.inferred_sex ],
    [ "stat_phased_basepairs", downstream.stat_phased_basepairs ],
    [ "phase_block_ng50", downstream.stat_phase_block_ng50 ],
    [ "cpg_combined_count", downstream.stat_combined_cpg_count ],
    [ "cpg_hap1_count", downstream.stat_hap1_cpg_count ],
    [ "cpg_hap2_count", downstream.stat_hap2_cpg_count ],
    [ "methbat_methylated_count", downstream.stat_methbat_methylated_count ],
    [ "methbat_unmethylated_count", downstream.stat_methbat_unmethylated_count ],
    [ "methbat_asm_count", downstream.stat_methbat_asm_count ],
    [ "SNV_count", downstream.stat_SNV_count ],
    [ "TSTV_ratio", downstream.stat_TSTV_ratio ],
    [ "HETHOM_ratio", downstream.stat_HETHOM_ratio ],
    [ "INDEL_count", downstream.stat_INDEL_count ],
    [ "sv_DUP_count", downstream.stat_sv_DUP_count ],
    [ "sv_DEL_count", downstream.stat_sv_DEL_count ],
    [ "sv_INS_count", downstream.stat_sv_INS_count ],
    [ "sv_INV_count", downstream.stat_sv_INV_count ],
    [ "sv_SWAP_count", downstream.stat_sv_SWAP_count ],
    [ "sv_BND_count", downstream.stat_sv_BND_count ],
    [ "trgt_genotyped_count", downstream.stat_trgt_genotyped_count ],
    [ "trgt_uncalled_count", downstream.stat_trgt_uncalled_count ]
  ]

  call Utilities.consolidate_stats {
    input:
      out_prefix = sample_id,
      stats = stats,
      # Same three message sources as the `msg` workflow output, so that the
      # messages file and the `msg` array agree.
      msg_array = flatten([ process_trgt_catalog.msg, upstream.msg, downstream.msg ]),
      runtime_attributes = default_runtime_attributes
  }

  output {
    # consolidated stats
    File stats_file = consolidate_stats.stats_tsv
    File msg_file = consolidate_stats.messages

    # bam stats
    File bam_statistics = downstream.bam_statistics
    File read_length_plot = downstream.read_length_plot
    File? read_quality_plot = downstream.read_quality_plot
    File mapq_distribution_plot = downstream.mapq_distribution_plot
    File mg_distribution_plot = downstream.mg_distribution_plot
    String stat_read_count = downstream.stat_read_count
    String stat_read_length_mean = downstream.stat_read_length_mean
    String stat_read_length_median = downstream.stat_read_length_median
    String stat_read_length_n50 = downstream.stat_read_length_n50
    String stat_read_quality_mean = downstream.stat_read_quality_mean
    String stat_read_quality_median = downstream.stat_read_quality_median
    String stat_mapped_read_count = downstream.stat_mapped_read_count
    String stat_mapped_read_percent = downstream.stat_mapped_read_percent
    String stat_gap_compressed_identity_mean = downstream.stat_gap_compressed_identity_mean
    String stat_gap_compressed_identity_median = downstream.stat_gap_compressed_identity_median

    # merged, haplotagged alignments
    File merged_haplotagged_bam = downstream.merged_haplotagged_bam
    File merged_haplotagged_bam_index = downstream.merged_haplotagged_bam_index

    # mosdepth outputs
    File mosdepth_summary = upstream.mosdepth_summary
    File mosdepth_region_bed = upstream.mosdepth_region_bed
    File mosdepth_region_bed_index = upstream.mosdepth_region_bed_index
    File mosdepth_depth_distribution_plot = upstream.mosdepth_depth_distribution_plot
    String stat_depth_mean = upstream.stat_depth_mean
    String inferred_sex = upstream.inferred_sex

    # phasing stats
    File phase_stats = downstream.phase_stats
    File phase_blocks = downstream.phase_blocks
    File phase_haplotags = downstream.phase_haplotags
    String stat_phased_basepairs = downstream.stat_phased_basepairs
    String stat_phase_block_ng50 = downstream.stat_phase_block_ng50

    # methylation outputs and profile
    File? cpg_combined_bed = downstream.cpg_combined_bed
    File? cpg_combined_bed_index = downstream.cpg_combined_bed_index
    File? cpg_hap1_bed = downstream.cpg_hap1_bed
    File? cpg_hap1_bed_index = downstream.cpg_hap1_bed_index
    File? cpg_hap2_bed = downstream.cpg_hap2_bed
    File? cpg_hap2_bed_index = downstream.cpg_hap2_bed_index
    File? cpg_combined_bw = downstream.cpg_combined_bw
    File? cpg_hap1_bw = downstream.cpg_hap1_bw
    File? cpg_hap2_bw = downstream.cpg_hap2_bw
    String stat_cpg_hap1_count = downstream.stat_hap1_cpg_count
    String stat_cpg_hap2_count = downstream.stat_hap2_cpg_count
    String stat_cpg_combined_count = downstream.stat_combined_cpg_count
    File? methbat_profile = downstream.methbat_profile
    String stat_methbat_methylated_count = downstream.stat_methbat_methylated_count
    String stat_methbat_unmethylated_count = downstream.stat_methbat_unmethylated_count
    String stat_methbat_asm_count = downstream.stat_methbat_asm_count

    # sv outputs
    # select_first() turns the optional upstream outputs into required ones;
    # it fails at runtime if any of them is undefined.
    File phased_sv_vcf = downstream.phased_sv_vcf
    File phased_sv_vcf_index = downstream.phased_sv_vcf_index
    File sv_supporting_reads = select_first([ upstream.sv_supporting_reads ])
    File sv_copynum_bedgraph = select_first([ upstream.sv_copynum_bedgraph ])
    File sv_depth_bw = select_first([ upstream.sv_depth_bw ])
    File sv_gc_bias_corrected_depth_bw = select_first([ upstream.sv_gc_bias_corrected_depth_bw ])
    File sv_maf_bw = select_first([ upstream.sv_maf_bw ])
    File sv_copynum_summary = select_first([ upstream.sv_copynum_summary ])

    # sv stats
    String stat_sv_DUP_count = downstream.stat_sv_DUP_count
    String stat_sv_DEL_count = downstream.stat_sv_DEL_count
    String stat_sv_INS_count = downstream.stat_sv_INS_count
    String stat_sv_INV_count = downstream.stat_sv_INV_count
    String stat_sv_SWAP_count = downstream.stat_sv_SWAP_count
    String stat_sv_BND_count = downstream.stat_sv_BND_count

    # small variant outputs
    File phased_small_variant_vcf = downstream.phased_small_variant_vcf
    File phased_small_variant_vcf_index = downstream.phased_small_variant_vcf_index
    File? small_variant_gvcf = upstream.small_variant_gvcf
    File? small_variant_gvcf_index = upstream.small_variant_gvcf_index

    # small variant stats
    File small_variant_stats = downstream.small_variant_stats
    File bcftools_roh_out = downstream.bcftools_roh_out
    File bcftools_roh_bed = downstream.bcftools_roh_bed
    String stat_small_variant_SNV_count = downstream.stat_SNV_count
    String stat_small_variant_INDEL_count = downstream.stat_INDEL_count
    String stat_small_variant_TSTV_ratio = downstream.stat_TSTV_ratio
    String stat_small_variant_HETHOM_ratio = downstream.stat_HETHOM_ratio
    File snv_distribution_plot = downstream.snv_distribution_plot
    File indel_distribution_plot = downstream.indel_distribution_plot

    # trgt outputs
    File phased_trgt_vcf = downstream.trgt_vcf
    File phased_trgt_vcf_index = downstream.trgt_vcf_index
    File trgt_spanning_reads = downstream.trgt_spanning_reads
    File trgt_spanning_reads_index = downstream.trgt_spanning_reads_index
    File trgt_coverage_dropouts = downstream.trgt_coverage_dropouts
    String stat_trgt_genotyped_count = downstream.stat_trgt_genotyped_count
    String stat_trgt_uncalled_count = downstream.stat_trgt_uncalled_count

    # paraphase outputs
    File? paraphase_summary = upstream.paraphase_output_json
    File? paraphase_realigned_bam = upstream.paraphase_realigned_bam
    File? paraphase_realigned_bam_index = upstream.paraphase_realigned_bam_index
    File? paraphase_vcfs = upstream.paraphase_vcfs

    # per sample mitorsaw outputs
    File mitorsaw_vcf = upstream.mitorsaw_vcf
    File mitorsaw_vcf_index = upstream.mitorsaw_vcf_index
    File mitorsaw_hap_stats = upstream.mitorsaw_hap_stats

    # PGx outputs
    File pbstarphase_summary = downstream.pbstarphase_json
    File? pharmcat_match_json = downstream.pharmcat_match_json
    File? pharmcat_phenotype_json = downstream.pharmcat_phenotype_json
    File? pharmcat_report_html = downstream.pharmcat_report_html
    File? pharmcat_report_json = downstream.pharmcat_report_json

    # tertiary analysis outputs
    File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf
    File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index
    File? tertiary_small_variant_filtered_tsv = tertiary_analysis.small_variant_filtered_tsv
    File? tertiary_small_variant_compound_het_vcf = tertiary_analysis.small_variant_compound_het_vcf
    File? tertiary_small_variant_compound_het_vcf_index = tertiary_analysis.small_variant_compound_het_vcf_index
    File? tertiary_small_variant_compound_het_tsv = tertiary_analysis.small_variant_compound_het_tsv
    File? tertiary_sv_filtered_vcf = tertiary_analysis.sv_filtered_vcf
    File? tertiary_sv_filtered_vcf_index = tertiary_analysis.sv_filtered_vcf_index
    File? tertiary_sv_filtered_tsv = tertiary_analysis.sv_filtered_tsv

    # qc messages
    Array[String] msg = flatten([ process_trgt_catalog.msg, upstream.msg, downstream.msg ])

    # workflow metadata
    String workflow_name = "humanwgs_singleton"
    # The "-<debug_version>" suffix is built inside a string interpolation,
    # where the optional debug_version is concatenated with "-".
    String workflow_version = "v3.3.1" + if defined(debug_version) then "~{"-" + debug_version}" else ""
  }
}