version 1.0

import "Structs.wdl"
import "BatchEvidenceMerging.wdl" as bem
import "CNMOPS.wdl" as cnmops
import "CollectCoverage.wdl" as cov
import "DepthPreprocessing.wdl" as dpn
import "MakeBincovMatrix.wdl" as mbm
import "MatrixQC.wdl" as mqc
import "MedianCov.wdl" as mc
import "GatherBatchEvidenceMetrics.wdl" as metrics
import "PESRPreprocessing.wdl" as pp
import "GermlineCNVCase.wdl" as gcnv
import "PloidyEstimation.wdl" as pe
import "TinyResolve.wdl" as tiny
import "Utils.wdl" as util

# Batch-level workflow:
#   - Merge sample evidence data into a single batch
#   - Run cnMOPS
#   - Run gCNV
#   - Run MedianCoverage

workflow GatherBatchEvidence {
  input {
    # Batch info
    String batch
    Array[String] samples
    Array[String]? ref_panel_samples

    # Optional QC tasks
    Boolean run_matrix_qc

    # Global files
    File ped_file
    File genome_file
    File primary_contigs_fai      # .fai file of included contigs
    File ref_dict

    # PE/SR/BAF/bincov files
    # If neither SD_files nor ref_panel_SD_files is present, BAF_files must be supplied
    # If BAF_files is absent, SD_files and/or ref_panel_SD_files and sd_locs_vcf must be supplied
    Array[File] counts
    File? ref_panel_bincov_matrix
    File? bincov_matrix
    File? bincov_matrix_index
    Boolean subset_primary_contigs = false   # PE/SR/BAF files will be subset to primary contigs only (for legacy files with bad sorting)
    Boolean rename_samples = false           # Rename samples in PE/SR/BAF to IDs in the "samples" array (always done for RD)
    Array[File?]? BAF_files                  # Required for MatrixQC
    Array[File] PE_files
    Array[File]? ref_panel_PE_files
    Array[File] SR_files
    Array[File]? ref_panel_SR_files
    Array[File]? SD_files                    # Required unless BAF_files or ref_panel_SD_files is supplied
    Array[File]? ref_panel_SD_files          # Required unless BAF_files or SD_files is supplied
    File? sd_locs_vcf                        # Must be the same sd_locs_vcf that was provided to GatherSampleEvidence

    # Condense read counts
    Int? min_interval_size
    Int? max_interval_size

    # gCNV inputs
    File contig_ploidy_model_tar
    Array[File] gcnv_model_tars

    File? gatk4_jar_override
    Float? gcnv_p_alt
    Float? gcnv_cnv_coherence_length
    Int? gcnv_max_copy_number

    Float? gcnv_mapping_error_rate
    Float? gcnv_sample_psi_scale
    Float? gcnv_depth_correction_tau
    String? gcnv_copy_number_posterior_expectation_mode
    Int? gcnv_active_class_padding_hybrid_mode

    Float? gcnv_learning_rate
    Float? gcnv_adamax_beta_1
    Float? gcnv_adamax_beta_2
    Int? gcnv_log_emission_samples_per_round
    Float? gcnv_log_emission_sampling_median_rel_error
    Int? gcnv_log_emission_sampling_rounds
    Int? gcnv_max_advi_iter_first_epoch
    Int? gcnv_max_advi_iter_subsequent_epochs
    Int? gcnv_min_training_epochs
    Int? gcnv_max_training_epochs
    Float? gcnv_initial_temperature
    Int? gcnv_num_thermal_advi_iters
    Int? gcnv_convergence_snr_averaging_window
    Float? gcnv_convergence_snr_trigger_threshold
    Int? gcnv_convergence_snr_countdown_window
    Int? gcnv_max_calling_iters
    Float? gcnv_caller_update_convergence_threshold
    Float? gcnv_caller_internal_admixing_rate
    Float? gcnv_caller_external_admixing_rate
    Boolean? gcnv_disable_annealing

    Float? ploidy_sample_psi_scale
    Int ref_copy_number_autosomal_contigs
    Array[String]? allosomal_contigs

    Boolean run_ploidy = false
    # Option to add the first sample to the ped file (for single-sample mode); run_ploidy must be true
    Boolean append_first_sample_to_ped = false

    Int gcnv_qs_cutoff              # QS filtering cutoff
    Float? defragment_max_dist

    # SV tool calls
    Array[File]? manta_vcfs         # Manta VCFs
    Array[File]? melt_vcfs          # MELT VCFs
    Array[File]? scramble_vcfs      # Scramble VCFs
    Array[File]? wham_vcfs          # Wham VCFs
    Int min_svsize                  # Minimum SV length to include

    # CNMops files
    File cnmops_chrom_file
    File cnmops_exclude_list
    File cnmops_allo_file
    Int? cnmops_large_min_size      # Minimum call size to be detected by cnMOPS running in large mode

    # Resolve files
    File cytoband
    File mei_bed

    # QC files
    Int matrix_qc_distance

    # Module metrics parameters
    # Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense
    Boolean? run_module_metrics
    File? primary_contigs_list      # Required if run_module_metrics = true
    # Baseline files are optional for the metrics workflow; run ClusterBatch for VCF metrics
    File? baseline_merged_dels
    File? baseline_merged_dups
    File? baseline_median_cov

    # Runtime parameters
    String sv_base_mini_docker
    String sv_base_docker
    String sv_pipeline_docker
    String sv_pipeline_qc_docker
    String linux_docker
    String condense_counts_docker
    String gatk_docker
    String? gcnv_gatk_docker
    String cnmops_docker

    RuntimeAttr? median_cov_runtime_attr    # Memory ignored, use median_cov_mem_gb_per_sample
    Float? median_cov_mem_gb_per_sample

    RuntimeAttr? evidence_merging_bincov_runtime_attr
    RuntimeAttr? runtime_attr_bem
    RuntimeAttr? cnmops_sample10_runtime_attr
    RuntimeAttr? cnmops_sample3_runtime_attr
    RuntimeAttr? ploidy_score_runtime_attr
    RuntimeAttr? ploidy_build_runtime_attr
    RuntimeAttr? runtime_attr_subset_ped
    RuntimeAttr? runtime_attr_validate_ped
    RuntimeAttr? add_sample_to_ped_runtime_attr
    RuntimeAttr? condense_counts_runtime_attr
    RuntimeAttr? preprocess_calls_runtime_attr
    RuntimeAttr? depth_merge_set_runtime_attr
    RuntimeAttr? depth_merge_sample_runtime_attr
    RuntimeAttr? cnmops_ped_runtime_attr
    RuntimeAttr? cnmops_clean_runtime_attr
    RuntimeAttr? matrix_qc_pesrbaf_runtime_attr
    RuntimeAttr? matrix_qc_rd_runtime_attr
    RuntimeAttr? runtime_attr_tiny_untar
    RuntimeAttr? runtime_attr_tiny_resolve
    RuntimeAttr? runtime_attr_ploidy
    RuntimeAttr? runtime_attr_case
    RuntimeAttr? runtime_attr_postprocess
    RuntimeAttr? runtime_attr_explode
  }

  Array[String] all_samples = flatten(select_all([samples, ref_panel_samples]))
  Array[File] all_PE_files = flatten(select_all([PE_files, ref_panel_PE_files]))
  Array[File] all_SR_files = flatten(select_all([SR_files, ref_panel_SR_files]))
  Array[File] all_SD_files = flatten(select_all([SD_files, ref_panel_SD_files]))
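  # Build the batch bincov matrix unless a precomputed matrix and its index were both
  # supplied and there is no reference-panel matrix to merge in; otherwise the provided
  # bincov_matrix/bincov_matrix_index are passed through unchanged via select_first below.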
  if (defined(ref_panel_bincov_matrix) || !(defined(bincov_matrix) && defined(bincov_matrix_index))) {
    call mbm.MakeBincovMatrix as MakeBincovMatrix {
      input:
        samples = samples,
        count_files = counts,
        bincov_matrix = ref_panel_bincov_matrix,
        bincov_matrix_samples = ref_panel_samples,
        batch = batch,
        sv_base_mini_docker = sv_base_mini_docker,
        sv_base_docker = sv_base_docker,
        runtime_attr_override = evidence_merging_bincov_runtime_attr
    }
  }
  File merged_bincov_ = select_first([MakeBincovMatrix.merged_bincov, bincov_matrix])
  File merged_bincov_idx_ = select_first([MakeBincovMatrix.merged_bincov_idx, bincov_matrix_index])

  if (run_ploidy) {
    call pe.Ploidy as Ploidy {
      input:
        bincov_matrix = merged_bincov_,
        batch = batch,
        sv_base_mini_docker = sv_base_mini_docker,
        sv_pipeline_qc_docker = sv_pipeline_qc_docker,
        runtime_attr_score = ploidy_score_runtime_attr,
        runtime_attr_build = ploidy_build_runtime_attr
    }
  }

  Array[String] samples_batch = select_first([ref_panel_samples, samples])

  call util.ValidatePedFile {
    input:
      ped_file = ped_file,
      sample_list = write_lines(samples_batch),
      sv_pipeline_docker = sv_pipeline_docker,
      runtime_attr_override = runtime_attr_validate_ped
  }

  call util.SubsetPedFile {
    input:
      ped_file = ValidatePedFile.output_ped,
      sample_list = write_lines(samples_batch),
      subset_name = batch,
      sv_base_mini_docker = sv_base_mini_docker,
      runtime_attr_override = runtime_attr_subset_ped
  }

  if (append_first_sample_to_ped) {
    call AddCaseSampleToPed {
      input:
        ref_ped_file = SubsetPedFile.ped_subset_file,
        ploidy_plots = select_first([Ploidy.ploidy_plots]),
        sample_id = samples[0],
        sv_base_mini_docker = sv_base_mini_docker,
        runtime_attr_override = add_sample_to_ped_runtime_attr
    }
  }

  call bem.BatchEvidenceMerging as BatchEvidenceMerging {
    input:
      samples = all_samples,
      BAF_files = BAF_files,
      PE_files = all_PE_files,
      SR_files = all_SR_files,
      SD_files = all_SD_files,
      sd_locs_vcf = sd_locs_vcf,
      reference_dict = ref_dict,
      primary_contigs_fai = primary_contigs_fai,
      subset_primary_contigs = subset_primary_contigs,
      rename_samples = rename_samples,
      batch = batch,
      gatk_docker = gatk_docker,
      runtime_attr_override = runtime_attr_bem
  }
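  # cnMOPS runs twice on the same bincov matrix: standard mode (r1 = "3", r2 = "10",
  # prefix "header") and large-event mode (r1 = "1000", r2 = "100", prefix "large"),
  # which additionally stitches and cleans large events and can enforce a minimum
  # call size via cnmops_large_min_size.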
  call cnmops.CNMOPS as CNMOPS {
    input:
      r1 = "3",
      r2 = "10",
      batch = batch,
      samples = all_samples,
      bincov_matrix = merged_bincov_,
      bincov_matrix_index = merged_bincov_idx_,
      chrom_file = cnmops_chrom_file,
      ped_file = select_first([AddCaseSampleToPed.combined_ped_file, SubsetPedFile.ped_subset_file]),
      exclude_list = cnmops_exclude_list,
      allo_file = cnmops_allo_file,
      ref_dict = ref_dict,
      prefix = "header",
      stitch_and_clean_large_events = false,
      linux_docker = linux_docker,
      sv_pipeline_docker = sv_pipeline_docker,
      cnmops_docker = cnmops_docker,
      runtime_attr_sample10 = cnmops_sample10_runtime_attr,
      runtime_attr_sample3 = cnmops_sample3_runtime_attr,
      runtime_attr_ped = cnmops_ped_runtime_attr,
      runtime_attr_clean = cnmops_clean_runtime_attr
  }

  call cnmops.CNMOPS as CNMOPSLarge {
    input:
      r1 = "1000",
      r2 = "100",
      batch = batch,
      samples = all_samples,
      bincov_matrix = merged_bincov_,
      bincov_matrix_index = merged_bincov_idx_,
      chrom_file = cnmops_chrom_file,
      ped_file = select_first([AddCaseSampleToPed.combined_ped_file, SubsetPedFile.ped_subset_file]),
      exclude_list = cnmops_exclude_list,
      allo_file = cnmops_allo_file,
      ref_dict = ref_dict,
      prefix = "large",
      min_size = cnmops_large_min_size,
      stitch_and_clean_large_events = true,
      linux_docker = linux_docker,
      sv_pipeline_docker = sv_pipeline_docker,
      cnmops_docker = cnmops_docker,
      runtime_attr_sample10 = cnmops_sample10_runtime_attr,
      runtime_attr_sample3 = cnmops_sample3_runtime_attr,
      runtime_attr_ped = cnmops_ped_runtime_attr,
      runtime_attr_clean = cnmops_clean_runtime_attr
  }
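  # Condense per-sample read counts before gCNV case calling. min_interval_size and
  # max_interval_size presumably bound the size of the condensed count intervals
  # (inferred from the parameter names; see CollectCoverage.wdl for the actual semantics).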
  scatter (i in range(length(samples))) {
    call cov.CondenseReadCounts as CondenseReadCounts {
      input:
        counts = counts[i],
        sample = samples[i],
        min_interval_size = min_interval_size,
        max_interval_size = max_interval_size,
        condense_counts_docker = condense_counts_docker,
        runtime_attr_override = condense_counts_runtime_attr
    }
  }

  call gcnv.CNVGermlineCaseWorkflow as gCNVCase {
    input:
      counts = CondenseReadCounts.out,
      count_entity_ids = samples,
      contig_ploidy_model_tar = contig_ploidy_model_tar,
      gcnv_model_tars = gcnv_model_tars,
      gatk_docker = select_first([gcnv_gatk_docker, gatk_docker]),
      linux_docker = linux_docker,
      sv_base_mini_docker = sv_base_mini_docker,
      gatk4_jar_override = gatk4_jar_override,
      gcnv_p_alt = gcnv_p_alt,
      gcnv_cnv_coherence_length = gcnv_cnv_coherence_length,
      gcnv_max_copy_number = gcnv_max_copy_number,
      gcnv_mapping_error_rate = gcnv_mapping_error_rate,
      gcnv_sample_psi_scale = gcnv_sample_psi_scale,
      gcnv_depth_correction_tau = gcnv_depth_correction_tau,
      gcnv_copy_number_posterior_expectation_mode = gcnv_copy_number_posterior_expectation_mode,
      gcnv_active_class_padding_hybrid_mode = gcnv_active_class_padding_hybrid_mode,
      gcnv_learning_rate = gcnv_learning_rate,
      gcnv_adamax_beta_1 = gcnv_adamax_beta_1,
      gcnv_adamax_beta_2 = gcnv_adamax_beta_2,
      gcnv_log_emission_samples_per_round = gcnv_log_emission_samples_per_round,
      gcnv_log_emission_sampling_median_rel_error = gcnv_log_emission_sampling_median_rel_error,
      gcnv_log_emission_sampling_rounds = gcnv_log_emission_sampling_rounds,
      gcnv_max_advi_iter_first_epoch = gcnv_max_advi_iter_first_epoch,
      gcnv_max_advi_iter_subsequent_epochs = gcnv_max_advi_iter_subsequent_epochs,
      gcnv_min_training_epochs = gcnv_min_training_epochs,
      gcnv_max_training_epochs = gcnv_max_training_epochs,
      gcnv_initial_temperature = gcnv_initial_temperature,
      gcnv_num_thermal_advi_iters = gcnv_num_thermal_advi_iters,
      gcnv_convergence_snr_averaging_window = gcnv_convergence_snr_averaging_window,
      gcnv_convergence_snr_trigger_threshold = gcnv_convergence_snr_trigger_threshold,
      gcnv_convergence_snr_countdown_window = gcnv_convergence_snr_countdown_window,
      gcnv_max_calling_iters = gcnv_max_calling_iters,
      gcnv_caller_update_convergence_threshold = gcnv_caller_update_convergence_threshold,
      gcnv_caller_internal_admixing_rate = gcnv_caller_internal_admixing_rate,
      gcnv_caller_external_admixing_rate = gcnv_caller_external_admixing_rate,
      gcnv_disable_annealing = gcnv_disable_annealing,
      ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
      allosomal_contigs = allosomal_contigs,
      runtime_attr_ploidy = runtime_attr_ploidy,
      runtime_attr_case = runtime_attr_case,
      runtime_attr_postprocess = runtime_attr_postprocess,
      runtime_attr_explode = runtime_attr_explode
  }

  call dpn.MergeDepth as MergeDepth {
    input:
      samples = samples,
      genotyped_segments_vcfs = gCNVCase.genotyped_segments_vcf,
      contig_ploidy_calls = gCNVCase.sample_contig_ploidy_calls_tars,
      gcnv_qs_cutoff = gcnv_qs_cutoff,
      defragment_max_dist = defragment_max_dist,
      std_cnmops_del = CNMOPS.Del,
      std_cnmops_dup = CNMOPS.Dup,
      large_cnmops_del = CNMOPSLarge.Del,
      large_cnmops_dup = CNMOPSLarge.Dup,
      batch = batch,
      sv_pipeline_docker = sv_pipeline_docker,
      sv_base_mini_docker = sv_base_mini_docker,
      runtime_attr_merge_sample = depth_merge_sample_runtime_attr,
      runtime_attr_merge_set = depth_merge_set_runtime_attr
  }
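  # MedianCov memory scales linearly with batch size: per-sample GB (default 0.5)
  # times the number of samples, plus a 7.5 GB base. For a hypothetical 160-sample
  # batch at the default rate: 0.5 * 160 + 7.5 = 87.5 GB.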
  Float median_cov_mem_gb = select_first([median_cov_mem_gb_per_sample, 0.5]) * length(all_samples) + 7.5
  call mc.MedianCov as MedianCov {
    input:
      bincov_matrix = merged_bincov_,
      cohort_id = batch,
      sv_pipeline_qc_docker = sv_pipeline_qc_docker,
      runtime_attr = median_cov_runtime_attr,
      mem_gb_override = median_cov_mem_gb
  }

  call pp.PreprocessPESR as PreprocessPESR {
    input:
      samples = samples,
      manta_vcfs = manta_vcfs,
      melt_vcfs = melt_vcfs,
      scramble_vcfs = scramble_vcfs,
      wham_vcfs = wham_vcfs,
      contigs = primary_contigs_fai,
      min_svsize = min_svsize,
      batch = batch,
      sv_pipeline_docker = sv_pipeline_docker,
      runtime_attr = preprocess_calls_runtime_attr
  }

  if (defined(manta_vcfs)) {
    call tiny.TinyResolve as TinyResolve {
      input:
        samples = samples,
        manta_vcf_tar = select_first([PreprocessPESR.std_manta_vcf_tar]),
        cytoband = cytoband,
        discfile = PE_files,
        mei_bed = mei_bed,
        sv_pipeline_docker = sv_pipeline_docker,
        linux_docker = linux_docker,
        runtime_attr_resolve = runtime_attr_tiny_resolve,
        runtime_attr_untar = runtime_attr_tiny_untar
    }
  }

  if (run_matrix_qc) {
    call mqc.MatrixQC as MatrixQC {
      input:
        distance = matrix_qc_distance,
        genome_file = genome_file,
        batch = batch,
        PE_file = BatchEvidenceMerging.merged_PE,
        PE_idx = BatchEvidenceMerging.merged_PE_index,
        BAF_file = BatchEvidenceMerging.merged_BAF,
        BAF_idx = BatchEvidenceMerging.merged_BAF_index,
        RD_file = merged_bincov_,
        RD_idx = merged_bincov_idx_,
        SR_file = BatchEvidenceMerging.merged_SR,
        SR_idx = BatchEvidenceMerging.merged_SR_index,
        ref_dict = ref_dict,
        sv_pipeline_docker = sv_pipeline_docker,
        runtime_attr_pesrbaf = matrix_qc_pesrbaf_runtime_attr,
        runtime_attr_rd = matrix_qc_rd_runtime_attr
    }
  }
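  # Coerce the optional run_module_metrics to a concrete Boolean, defaulting to false:
  # metrics collection is opt-in for this module because of its runtime and expense
  # (see the input comment above).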
  Boolean run_module_metrics_ = if defined(run_module_metrics) then select_first([run_module_metrics]) else false
  if (run_module_metrics_) {
    call metrics.GatherBatchEvidenceMetrics {
      input:
        name = batch,
        samples = samples,
        merged_BAF = BatchEvidenceMerging.merged_BAF,
        merged_SR = BatchEvidenceMerging.merged_SR,
        merged_PE = BatchEvidenceMerging.merged_PE,
        merged_bincov = merged_bincov_,
        merged_dels = MergeDepth.del,
        merged_dups = MergeDepth.dup,
        median_cov = MedianCov.medianCov,
        baseline_merged_dels = baseline_merged_dels,
        baseline_merged_dups = baseline_merged_dups,
        baseline_median_cov = baseline_median_cov,
        contig_list = select_first([primary_contigs_list]),
        sv_pipeline_docker = sv_pipeline_docker,
        linux_docker = linux_docker
    }
  }

  output {
    File merged_BAF = BatchEvidenceMerging.merged_BAF
    File merged_BAF_index = BatchEvidenceMerging.merged_BAF_index
    File merged_SR = BatchEvidenceMerging.merged_SR
    File merged_SR_index = BatchEvidenceMerging.merged_SR_index
    File merged_PE = BatchEvidenceMerging.merged_PE
    File merged_PE_index = BatchEvidenceMerging.merged_PE_index
    File merged_bincov = merged_bincov_
    File merged_bincov_index = merged_bincov_idx_

    File? batch_ploidy_matrix = Ploidy.ploidy_matrix
    File? batch_ploidy_plots = Ploidy.ploidy_plots

    File? combined_ped_file = AddCaseSampleToPed.combined_ped_file

    File merged_dels = MergeDepth.del
    File merged_dups = MergeDepth.dup

    File cnmops_del = CNMOPS.Del
    File cnmops_del_index = CNMOPS.Del_idx
    File cnmops_dup = CNMOPS.Dup
    File cnmops_dup_index = CNMOPS.Dup_idx

    File cnmops_large_del = CNMOPSLarge.Del
    File cnmops_large_del_index = CNMOPSLarge.Del_idx
    File cnmops_large_dup = CNMOPSLarge.Dup
    File cnmops_large_dup_index = CNMOPSLarge.Dup_idx

    File median_cov = MedianCov.medianCov

    File? std_manta_vcf_tar = PreprocessPESR.std_manta_vcf_tar
    File? std_melt_vcf_tar = PreprocessPESR.std_melt_vcf_tar
    File? std_scramble_vcf_tar = PreprocessPESR.std_scramble_vcf_tar
    File? std_wham_vcf_tar = PreprocessPESR.std_wham_vcf_tar

    File? PE_stats = MatrixQC.PE_stats
    File? RD_stats = MatrixQC.RD_stats
    File? SR_stats = MatrixQC.SR_stats
    File? BAF_stats = MatrixQC.BAF_stats
    File? Matrix_QC_plot = MatrixQC.QC_plot

    Array[File]? manta_tloc = TinyResolve.tloc_manta_vcf

    File? metrics_file_batchevidence = GatherBatchEvidenceMetrics.metrics_file
  }
}

task AddCaseSampleToPed {
  input {
    File ref_ped_file
    File ploidy_plots
    String sample_id
    String sv_base_mini_docker
    RuntimeAttr? runtime_attr_override
  }

  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 2,
    disk_gb: 10,
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
  }
  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

  output {
    File combined_ped_file = "combined_ped_file.ped"
  }

  command <<<
    set -euo pipefail

    # Extract the ploidy-estimation tarball and pull the sex assignment for this sample
    tar xzf ~{ploidy_plots} -C .
    RECORD=$(gunzip -c ploidy_est/sample_sex_assignments.txt.gz | { grep -w "^~{sample_id}" || true; })
    if [ -z "$RECORD" ]; then
      >&2 echo "Error: Sample ~{sample_id} not found in ploidy calls"
      exit 1
    fi
    SEX=$(echo "$RECORD" | cut -f2)

    # Refuse to add a sample that is already present in the reference ped file
    awk -v sample=~{sample_id} '$2 == sample { print "ERROR: A sample with the name "sample" is already present in the ped file." > "/dev/stderr"; exit 1; }' < ~{ref_ped_file}

    # Append the case sample as an unrelated individual with the inferred sex
    awk -v sample=~{sample_id} -v sex=$SEX '{print} END {OFS="\t"; print "case_sample",sample,"0","0",sex,"1" }' < ~{ref_ped_file} > combined_ped_file.ped
  >>>

  runtime {
    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
    docker: sv_base_mini_docker
    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
  }
}
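# The record appended by AddCaseSampleToPed follows standard 6-column PED format
# (family ID, sample ID, paternal ID, maternal ID, sex, phenotype). For a hypothetical
# case sample "SAMPLE01", assuming the sex-assignment file encodes sex with PED codes
# (1 = male, 2 = female) and the assignment is 2, the appended tab-delimited line is:
#
#   case_sample  SAMPLE01  0  0  2  1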