version 1.0

import "../../../tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl" as SRJOINT
import "../../../tasks/Utility/VariantUtils.wdl" as VARUTIL
import "../../../tasks/Utility/Utils.wdl" as UTILS
import "../../../tasks/TertiaryAnalysis/FunctionalAnnotation.wdl" as FUNK
import "../../../tasks/Utility/SGKit.wdl" as SGKit
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow SRJointCallGVCFsWithGenomicsDBPopulationScale {

    meta {
        author: "Jonn Smith"
        description: "A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB.  This Workflow relies on previously constructed genomicsDB instances to provide population-scale context for joint calling.  NOTE: Currently assumes the interval list consists of only whole contigs."
    }

    # One entry per workflow input.  Keep this in sync with the `input` section below.
    parameter_meta {
        gvcfs:  "Array of GVCF files to use as inputs for joint calling."
        gvcf_indices:   "Array of gvcf index files for `gvcfs`.  Order should correspond to that in `gvcfs`."
        ref_map_file:  "Reference map file indicating reference sequence and auxiliary file locations"
        genomicsdb_tar_contig_map_file: "File containing a map of contigs to GenomicsDB tar files.  This file is used to determine which GenomicsDB tar file to use for each contig."

        heterozygosity: "Joint Genotyping Parameter - Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept"
        heterozygosity_stdev: "Joint Genotyping Parameter - Standard deviation of heterozygosity for SNP and indel calling."
        indel_heterozygosity: "Joint Genotyping Parameter - Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept"

        snp_calibration_sensitivity:    "VETS (ScoreVariantAnnotations) parameter - score below which SNP variants will be filtered."
        snp_max_unlabeled_variants: "VETS (ExtractVariantAnnotations) parameter - maximum number of unlabeled SNP variants/alleles to randomly sample with reservoir sampling.  If nonzero, annotations will also be extracted from unlabeled sites."
        snp_recalibration_annotation_values:    "VETS (ScoreSnpVariantAnnotations/ScoreVariantAnnotations) parameter - Array of annotation names to use to create the SNP variant scoring model and over which to score SNP variants."

        snp_known_reference_variants: "Array of VCF files to use as input reference variants for SNPs.  Each can be designated as either calibration or training using `snp_is_training` and `snp_is_calibration`."
        snp_known_reference_variants_index: "Array of VCF index files for `snp_known_reference_variants`.  Order should correspond to that in `snp_known_reference_variants`."
        snp_known_reference_variants_identifier: "Array of names to give to the VCF files given in `snp_known_reference_variants`.  Order should correspond to that in `snp_known_reference_variants`."
        snp_is_training: "Array of booleans indicating which files in `snp_known_reference_variants` should be used as training sets.  True -> training set.  False -> NOT a training set."
        snp_is_calibration: "Array of booleans indicating which files in `snp_known_reference_variants` should be used as calibration sets.  True -> calibration set.  False -> NOT a calibration set."

        indel_calibration_sensitivity:    "VETS (ScoreVariantAnnotations) parameter - score below which INDEL variants will be filtered."
        indel_max_unlabeled_variants: "VETS (ExtractVariantAnnotations) parameter - maximum number of unlabeled INDEL variants/alleles to randomly sample with reservoir sampling.  If nonzero, annotations will also be extracted from unlabeled sites."
        indel_recalibration_annotation_values:    "VETS (ScoreSnpVariantAnnotations/ScoreVariantAnnotations) parameter - Array of annotation names to use to create the INDEL variant scoring model and over which to score INDEL variants."

        indel_known_reference_variants: "Array of VCF files to use as input reference variants for INDELs.  Each can be designated as either calibration or training using `indel_is_training` and `indel_is_calibration`."
        indel_known_reference_variants_index: "Array of VCF index files for `indel_known_reference_variants`.  Order should correspond to that in `indel_known_reference_variants`."
        indel_known_reference_variants_identifier: "Array of names to give to the VCF files given in `indel_known_reference_variants`.  Order should correspond to that in `indel_known_reference_variants`."
        indel_is_training: "Array of booleans indicating which files in `indel_known_reference_variants` should be used as training sets.  True -> training set.  False -> NOT a training set."
        indel_is_calibration: "Array of booleans indicating which files in `indel_known_reference_variants` should be used as calibration sets.  True -> calibration set.  False -> NOT a calibration set."

        annotation_bed_files:   "Array of bed files to use to FILTER/annotate variants in the output file.  Annotations will be placed in the FILTER column, effectively filtering variants that overlap these regions."
        annotation_bed_file_indexes:    "Array of bed indexes for `annotation_bed_files`.  Order should correspond to `annotation_bed_files`."
        annotation_bed_file_annotation_names:   "Array of names/FILTER column entries to use for each given file in `annotation_bed_files`.  Order should correspond to `annotation_bed_files`."

        snpeff_db: "Optional SnpEff database file.  If given, each scored VCF shard is functionally annotated before the shards are gathered.  If omitted, functional annotation is skipped."
        snpeff_db_identifier: "Identifier of the database given in `snpeff_db`.  Must be given whenever `snpeff_db` is given."

        interval_list: "Optional interval list defining the shards over which to process.  If omitted, an interval list is created from the reference sequence dictionary using `shard_max_interval_size_bp`."

        do_zarr_conversion: "If true, also convert the rescored joint-called variants to Zarr.  Default is false."

        shard_max_interval_size_bp: "Maximum size of the interval on each shard.  This along with the given sequence dictionary determines how many shards there will be.  To shard by contig, set to a very high number.  Default is 999999999."

        prefix: "Prefix to use for output files."

        background_sample_gvcfs: "Array of GVCFs to use as background samples for joint calling."
        background_sample_gvcf_indices: "Array of GVCF index files for `background_sample_gvcfs`.  Order should correspond to that in `background_sample_gvcfs`."

        gcs_out_root_dir:    "GCS Bucket into which to finalize outputs.  If no bucket is given, outputs will not be finalized and instead will remain in their native execution location."
    }

    # Workflow inputs.  Descriptions for each input live in `parameter_meta`.
    input {
        # Per-sample gVCFs to joint call.
        # NOTE(review): `gvcf_indices` is not referenced by any call in this workflow body.
        Array[File] gvcfs
        Array[File] gvcf_indices

        # Two-column map file, read with `read_map`, used for lookups such as 'fasta', 'fai' and 'dict'.
        File ref_map_file

        # Two-column map file, read with `read_map`: contig name -> existing GenomicsDB tar file.
        File genomicsdb_tar_contig_map_file

        # Joint genotyping priors:
        Float heterozygosity = 0.001
        Float heterozygosity_stdev = 0.01
        Float indel_heterozygosity = 0.000125

        # VETS settings for SNPs:
        Float snp_calibration_sensitivity = 0.99
        Int snp_max_unlabeled_variants = 0
        # TODO: Fix the annotations here to include the missing ones.  Must debug.
        # Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
        Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]

        Array[File] snp_known_reference_variants
        Array[File] snp_known_reference_variants_index
        # Identifiers are names, not files, so they are typed as String.
        Array[String] snp_known_reference_variants_identifier
        Array[Boolean] snp_is_training
        Array[Boolean] snp_is_calibration

        # VETS settings for INDELs:
        Float indel_calibration_sensitivity = 0.99
        Int indel_max_unlabeled_variants = 0
        # TODO: Fix the annotations here to include the missing ones.  Must debug.
        # Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
        Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]

        Array[File] indel_known_reference_variants
        Array[File] indel_known_reference_variants_index
        # Identifiers are names, not files, so they are typed as String.
        Array[String] indel_known_reference_variants_identifier
        Array[Boolean] indel_is_training
        Array[Boolean] indel_is_calibration

        # Optional region annotation.  The three arrays are consumed together, so when
        # `annotation_bed_files` is given the other two must be given as well.
        Array[File]?   annotation_bed_files
        Array[File]?   annotation_bed_file_indexes
        Array[String]? annotation_bed_file_annotation_names

        # Optional functional annotation.  `snpeff_db_identifier` is required whenever
        # `snpeff_db` is given.
        File? snpeff_db
        String? snpeff_db_identifier

        # Optional shard definition.  When absent, one is built from the reference dictionary.
        File? interval_list

        Boolean do_zarr_conversion = false

        Int shard_max_interval_size_bp = 999999999

        String prefix

        # NOTE(review): `background_sample_gvcf_indices` is not referenced by any call in
        # this workflow body.
        Array[Array[File]]? background_sample_gvcfs
        Array[Array[File]]? background_sample_gvcf_indices

        # When absent, nothing is finalized and outputs point at the execution locations.
        String? gcs_out_root_dir
    }

    # Load the two lookup tables used throughout the workflow:
    #   ref_map                   - reference resource name -> path (queried below for 'fasta', 'fai', 'dict', ...)
    #   genomicsdb_tar_contig_map - contig name -> existing GenomicsDB tar file for that contig
    Map[String, String] ref_map = read_map(ref_map_file)
    Map[String, File] genomicsdb_tar_contig_map = read_map(genomicsdb_tar_contig_map_file)

    # Resolve which ref_map key holds the dbSNP / known-sites VCF.  The candidate keys are
    # given in priority order; the task returns the key to use.
    # NOTE(review): the first candidate, "test_bad_key_should_not_be_found", looks like a leftover
    # test entry.  Presumably it is skipped because it is absent from the map - confirm against
    # UTILS.ResolveMapKeysInPriorityOrder before removing it.
    call UTILS.ResolveMapKeysInPriorityOrder as ResolveMapKeysInPriorityOrder {
        input:
            map = ref_map,
            keys = ["test_bad_key_should_not_be_found", "dbsnp_vcf", "known_sites_vcf"]
    }
    # The known-sites VCF later handed to the joint genotyper as `dbsnp_vcf`.
    File db_snp_vcf = ref_map[ResolveMapKeysInPriorityOrder.key]

    # Background gVCFs arrive as an optional array-of-arrays.  Collapse them into a single
    # flat list, or an empty list when no background samples were supplied.
    Array[File] all_background_gvcfs = if defined(background_sample_gvcfs) then flatten(select_first([background_sample_gvcfs])) else []

    # Build the sample-name map covering both the input and the background gVCFs:
    call SRJOINT.CreateSampleNameMap {
        input:
            prefix = prefix,
            gvcfs = gvcfs,
            background_sample_gvcfs = all_background_gvcfs
    }

    # Determine the interval list that defines our shards.
    # Only when the caller did not supply one do we derive it from the sequence dictionary:
    if (!defined(interval_list)) {
        call UTILS.MakeIntervalListFromSequenceDictionary {
            input:
                max_interval_size = shard_max_interval_size_bp,
                ref_dict = ref_map["dict"]
        }
    }
    # A caller-supplied list takes precedence over the generated one.
    File actual_interval_list = select_first([interval_list, MakeIntervalListFromSequenceDictionary.interval_list])

    # Pull out the per-interval info (indexed below as [contig, start, end]) that names and
    # defines each shard:
    call UTILS.ExtractIntervalNamesFromIntervalOrBamFile {
        input:
            interval_file = actual_interval_list
    }

    # Per-interval processing: import into GenomicsDB, joint call, and make a sites-only VCF.
    scatter (idx_1 in range(length(ExtractIntervalNamesFromIntervalOrBamFile.interval_info))) {

        # Each interval_info row is used as: [0] -> contig, [1] -> start, [2] -> end.
        String contig = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][0]
        String interval_name = "~{contig}_~{ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][1]}_~{ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][2]}"

        # The pre-built GenomicsDB is looked up by contig name alone.
        File existing_genomicsdb_tar = genomicsdb_tar_contig_map[contig]

        # Build the interval list file from the same info that produced `interval_name`, so the
        # shard name and the shard's interval file are guaranteed to correspond:
        call UTILS.CreateIntervalListFileFromIntervalInfo {
            input:
                contig = contig,
                start  = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][1],
                end    = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][2]
        }

        # Add our samples to the existing GenomicsDB for this contig (preemption disabled):
        call SRJOINT.ImportGVCFs as ImportGVCFsIntoGenomicsDB {
            input:
                prefix                  = "~{prefix}.~{interval_name}",
                sample_name_map         = CreateSampleNameMap.sample_name_map,
                total_gvcf_size_gb      = CreateSampleNameMap.total_gvcf_size_gb,
                existing_genomicsdb_tar = existing_genomicsdb_tar,
                interval_list           = CreateIntervalListFileFromIntervalInfo.interval_list,
                ref_fasta               = ref_map["fasta"],
                ref_fasta_fai           = ref_map["fai"],
                ref_dict                = ref_map["dict"],
                batch_size              = 50,
                runtime_attr_override   = object {preemptible_tries: 0}
        }

        # Joint call from the updated GenomicsDB.
        # (A `runtime_attr_override = object {preemptible_tries: 0}` was used while prototyping
        #  and is intentionally not set here.)
        call SRJOINT.GnarlyGenotypeGVCFs as GnarlyJointCallGVCFs {
            input:
                prefix               = "~{prefix}.~{interval_name}.gnarly_genotyper.raw",
                input_gvcf_data      = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
                interval_list        = CreateIntervalListFileFromIntervalInfo.interval_list,
                dbsnp_vcf            = db_snp_vcf,
                ref_fasta            = ref_map["fasta"],
                ref_fasta_fai        = ref_map["fai"],
                ref_dict             = ref_map["dict"],
                heterozygosity       = heterozygosity,
                heterozygosity_stdev = heterozygosity_stdev,
                indel_heterozygosity = indel_heterozygosity
        }

        # Aliases for the raw joint-called VCF + index.  Outside this scatter they are
        # gathered into arrays and indexed per shard by the scoring scatter.
        File joint_vcf = GnarlyJointCallGVCFs.output_vcf
        File joint_vcf_index = GnarlyJointCallGVCFs.output_vcf_index

        # Strip genotypes to get a smaller sites-only VCF for the recalibration steps:
        call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyVCF {
            input:
                prefix    = "~{prefix}.~{interval_name}.sites_only",
                vcf       = joint_vcf,
                vcf_index = joint_vcf_index
        }
    }

    # Gather the per-shard sites-only VCFs into one file for model extraction / training:
    call VARUTIL.GatherVcfs as MergeSitesOnlyVCFs {
        input:
            prefix = "~{prefix}.sites_only",
            input_vcfs = MakeSitesOnlyVCF.sites_only_vcf,
            input_vcf_indices = MakeSitesOnlyVCF.sites_only_vcf_index
    }

    ########################################################################
    # VETS / VQSR-lite, part 1: extract annotations and train one model per variant type.
    # The SNP and INDEL branches are independent of each other and both read the merged
    # sites-only VCF.

    # --- SNP ---
    call VARUTIL.ExtractVariantAnnotations as ExtractSnpVariantAnnotations {
        input:
            mode = "SNP",
            prefix = prefix,

            vcf = MergeSitesOnlyVCFs.output_vcf,
            vcf_index = MergeSitesOnlyVCFs.output_vcf_index,

            recalibration_annotation_values = snp_recalibration_annotation_values,
            max_unlabeled_variants = snp_max_unlabeled_variants,

            known_reference_variants = snp_known_reference_variants,
            known_reference_variants_index = snp_known_reference_variants_index,
            known_reference_variants_identifier = snp_known_reference_variants_identifier,
            is_training = snp_is_training,
            is_calibration = snp_is_calibration
    }

    call VARUTIL.TrainVariantAnnotationsModel as TrainSnpVariantAnnotationsModel {
        input:
            mode = "SNP",
            prefix = prefix,
            annotation_hdf5 = ExtractSnpVariantAnnotations.annotation_hdf5
    }

    # --- INDEL ---
    call VARUTIL.ExtractVariantAnnotations as ExtractIndelVariantAnnotations {
        input:
            mode = "INDEL",
            prefix = prefix,

            vcf = MergeSitesOnlyVCFs.output_vcf,
            vcf_index = MergeSitesOnlyVCFs.output_vcf_index,

            recalibration_annotation_values = indel_recalibration_annotation_values,
            max_unlabeled_variants = indel_max_unlabeled_variants,

            known_reference_variants = indel_known_reference_variants,
            known_reference_variants_index = indel_known_reference_variants_index,
            known_reference_variants_identifier = indel_known_reference_variants_identifier,
            is_training = indel_is_training,
            is_calibration = indel_is_calibration
    }

    call VARUTIL.TrainVariantAnnotationsModel as TrainIndelVariantAnnotationsModel {
        input:
            mode = "INDEL",
            prefix = prefix,
            annotation_hdf5 = ExtractIndelVariantAnnotations.annotation_hdf5
    }

    # VETS part 2, again per interval: score SNPs, then INDELs, then optionally annotate.
    # This scatter iterates over the same interval_info as the joint-calling scatter, so
    # index idx_2 selects the matching raw joint-called VCF.
    scatter (idx_2 in range(length(ExtractIntervalNamesFromIntervalOrBamFile.interval_info))) {

        String interval_name2 = "~{ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_2][0]}_~{ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_2][1]}_~{ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_2][2]}"

        # Raw joint-called VCF + index for this shard:
        File joint_called_vcf = joint_vcf[idx_2]
        File joint_called_vcf_index = joint_vcf_index[idx_2]

        # Pass 1: score SNPs on the raw joint-called VCF.
        # The two always-present model files come first, followed by whichever of the
        # optional model files were produced.
        call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations {
            input:
                mode = "SNP",
                prefix = "~{prefix}_SNP_~{interval_name2}",

                vcf = joint_called_vcf,
                vcf_index = joint_called_vcf_index,

                sites_only_extracted_vcf = ExtractSnpVariantAnnotations.sites_only_vcf,
                sites_only_extracted_vcf_index = ExtractSnpVariantAnnotations.sites_only_vcf_index,

                model_prefix = "~{prefix}_train_SNP",
                model_files = flatten([
                    [
                        TrainSnpVariantAnnotationsModel.training_scores,
                        TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle
                    ],
                    select_all([
                        TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores,
                        TrainSnpVariantAnnotationsModel.calibration_set_scores,
                        TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle
                    ])
                ]),

                calibration_sensitivity_threshold = snp_calibration_sensitivity,
                recalibration_annotation_values = snp_recalibration_annotation_values,

                known_reference_variants = snp_known_reference_variants,
                known_reference_variants_index = snp_known_reference_variants_index,
                known_reference_variants_identifier = snp_known_reference_variants_identifier,
                is_training = snp_is_training,
                is_calibration = snp_is_calibration
        }

        # Pass 2: score INDELs on top of the SNP-scored VCF, so the result carries both
        # (hence the "_ALL_" in the prefix).
        call VARUTIL.ScoreVariantAnnotations as ScoreIndelVariantAnnotations {
            input:
                mode = "INDEL",
                prefix = "~{prefix}_ALL_~{interval_name2}",

                vcf = ScoreSnpVariantAnnotations.scored_vcf,
                vcf_index = ScoreSnpVariantAnnotations.scored_vcf_index,

                sites_only_extracted_vcf = ExtractIndelVariantAnnotations.sites_only_vcf,
                sites_only_extracted_vcf_index = ExtractIndelVariantAnnotations.sites_only_vcf_index,

                model_prefix = "~{prefix}_train_INDEL",
                model_files = flatten([
                    [
                        TrainIndelVariantAnnotationsModel.training_scores,
                        TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle
                    ],
                    select_all([
                        TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores,
                        TrainIndelVariantAnnotationsModel.calibration_set_scores,
                        TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle
                    ])
                ]),

                calibration_sensitivity_threshold = indel_calibration_sensitivity,
                recalibration_annotation_values = indel_recalibration_annotation_values,

                known_reference_variants = indel_known_reference_variants,
                known_reference_variants_index = indel_known_reference_variants_index,
                known_reference_variants_identifier = indel_known_reference_variants_identifier,
                is_training = indel_is_training,
                is_calibration = indel_is_calibration
        }

        # Optional: annotate variants by region using the supplied bed files.
        if (defined(annotation_bed_files)) {
            call VARUTIL.AnnotateVcfWithBedRegions as AnnotateVcfRegions {
                input:
                    # Strip either a ".vcf.gz" or a ".vcf" extension before adding the new suffix.
                    prefix = basename(basename(ScoreIndelVariantAnnotations.scored_vcf, ".vcf.gz"), ".vcf") + ".region_annotated",
                    vcf = ScoreIndelVariantAnnotations.scored_vcf,
                    vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index,
                    bed_files = select_first([annotation_bed_files]),
                    bed_file_indexes = select_first([annotation_bed_file_indexes]),
                    bed_file_annotation_names = select_first([annotation_bed_file_annotation_names])
            }
        }

        # Use the region-annotated VCF when that step ran, otherwise the scored VCF:
        File recalibrated_vcf = select_first([AnnotateVcfRegions.annotated_vcf, ScoreIndelVariantAnnotations.scored_vcf])
        File recalibrated_vcf_index = select_first([AnnotateVcfRegions.annotated_vcf_index, ScoreIndelVariantAnnotations.scored_vcf_index])

        # Optional: functional annotation of this shard.
        if (defined(snpeff_db)) {
            call FUNK.FunctionallyAnnotateVariants as FunctionallyAnnotate {
                input:
                    vcf = recalibrated_vcf,
                    snpeff_db = select_first([snpeff_db]),
                    snpeff_db_identifier = select_first([snpeff_db_identifier])
            }
        }

        # Use the functionally annotated VCF when that step ran, otherwise fall back again:
        File vcf_for_merging = select_first([FunctionallyAnnotate.annotated_vcf, recalibrated_vcf])
        File vcf_index_for_merging = select_first([FunctionallyAnnotate.annotated_vcf_index, recalibrated_vcf_index])
    }

    # Consolidate the per-shard rescored (and optionally annotated) VCFs into one file:
    call VARUTIL.GatherVcfs as GatherRescoredVcfs {
        input:
            input_vcfs = vcf_for_merging,
            input_vcf_indices = vcf_index_for_merging,
            prefix = prefix + ".rescored.combined"
    }

    ################################
    # Split the VCF by contig:
    ############
    # Get the contigs from the reference dictionary (no contigs filtered out):
    call UTILS.MakeChrIntervalList as GetContigsFromRefDict {
        input:
            ref_dict = ref_map['dict'],
            filter = []
    }

    scatter (indx_3 in range(length(GetContigsFromRefDict.contig_interval_list_files))) {
        String contig_name = GetContigsFromRefDict.chrs[indx_3][0]

        # We expect this to run a long time, so we disable preemption:
        call VARUTIL.SubsetVCF as SubsetVcfPerContig {
            input:
                vcf_gz = GatherRescoredVcfs.output_vcf,
                vcf_tbi = GatherRescoredVcfs.output_vcf_index,
                locus = contig_name,
                prefix = prefix + ".rescored." + contig_name,
                runtime_attr_override = object {preemptible_tries: 0}
        }

        if (do_zarr_conversion) {
            # Convert this contig's VCF to Zarr.
            # This call runs once per contig, so it must read the per-contig subset and use a
            # per-contig prefix.  Converting the full gathered VCF with the bare `prefix` here
            # would repeat the same whole-genome conversion for every contig and give every
            # shard the same output prefix.
            call SGKit.ConvertToZarrStore as ConvertToZarr {
                input:
                    vcf = SubsetVcfPerContig.subset_vcf,
                    tbi = SubsetVcfPerContig.subset_tbi,
                    prefix = prefix + ".rescored." + contig_name,
                    runtime_attr_override = object {preemptible_tries: 0}
            }
        }
    }

    ################################
    # Finalize the regular output files:
    ############
    # Everything in this conditional only runs when an output bucket was given.

    if (defined(gcs_out_root_dir)) {

        String concrete_gcs_out_root_dir = select_first([gcs_out_root_dir])
        # Drop a single trailing slash (if any) so we do not produce "//" in the path:
        String outdir = sub(concrete_gcs_out_root_dir, "/$", "") + "/SRJointCallGVCFsWithGenomicsDBPopulationScale/~{prefix}"

        String recalibration_model_dir = outdir + "/recalibration_files/model"
        String recalibration_results_dir = outdir + "/recalibration_files/results"
        String snpeff_results_dir = outdir + "/snpEff_results"

        # Every finalize call takes this keyfile.  It is an output of the final gather step,
        # so finalization cannot start until that step has completed.
        File keyfile = GatherRescoredVcfs.output_vcf_index

        call FF.FinalizeToDir as FinalizeGenomicsDB { input: outdir = outdir + "/GenomicsDB", keyfile = keyfile, files = ImportGVCFsIntoGenomicsDB.output_genomicsdb }

        call FF.FinalizeToFile as FinalizeVETSVCF { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf }
        call FF.FinalizeToFile as FinalizeVETSTBI { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf_index }

        if (defined(snpeff_db)) {
            call FF.FinalizeToDir as FinalizeSnpEffSummary { input: outdir = snpeff_results_dir, keyfile = keyfile, files = select_all(FunctionallyAnnotate.snpEff_summary) }
            call FF.FinalizeToDir as FinalizeSnpEffGenes { input: outdir = snpeff_results_dir, keyfile = keyfile, files = select_all(FunctionallyAnnotate.snpEff_genes) }
        }

        ################################
        # Finalize the VETS files:
        ############

        # ExtractVariantAnnotations:
        call FF.FinalizeToFile as FinalizeSnpExtractedAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.annotation_hdf5 }
        call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcf { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf }
        call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcfIndex { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf_index }
        if (defined(ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5)) {
            call FF.FinalizeToFile as FinalizeSnpExtractedUnlabeledAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5]) }
        }
        call FF.FinalizeToFile as FinalizeIndelExtractedAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.annotation_hdf5 }
        call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcf { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.sites_only_vcf }
        call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcfIndex { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.sites_only_vcf_index }
        if (defined(ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5)) {
            call FF.FinalizeToFile as FinalizeIndelExtractedUnlabeledAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5]) }
        }

        # TrainVariantAnnotationsModel
        call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.training_scores }
        call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsPositiveModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle }
        if (defined(TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores)) {
            call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores]) }
        }
        if (defined(TrainSnpVariantAnnotationsModel.calibration_set_scores)) {
            call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.calibration_set_scores]) }
        }
        if (defined(TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle)) {
            call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle]) }
        }

        call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.training_scores }
        call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsPositiveModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle }
        if (defined(TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores)) {
            call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores]) }
        }
        if (defined(TrainIndelVariantAnnotationsModel.calibration_set_scores)) {
            call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.calibration_set_scores]) }
        }
        if (defined(TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle)) {
            call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle]) }
        }

        # ScoreVariantAnnotations
        # This was done per-contig, so we need to finalize per-contig:
        scatter (idx_3 in range(length(ExtractIntervalNamesFromIntervalOrBamFile.interval_info))) {

            String interval_name3 = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_3][0] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_3][1] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_3][2]

            call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcf { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf[idx_3] }
            call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf_index[idx_3] }

            # NOTE: the scoring calls live inside a scatter, so from here each of their outputs
            # is an array with one element per shard.  The array itself is always defined, so
            # `defined()` must test this shard's element to guard the `select_first` below.
            if (defined(ScoreSnpVariantAnnotations.annotations_hdf5[idx_3])) {
                call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsAnnotationsHdf5 { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.annotations_hdf5[idx_3]]) }
            }
            if (defined(ScoreSnpVariantAnnotations.scores_hdf5[idx_3])) {
                call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoresHdf5 { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.scores_hdf5[idx_3]]) }
            }

            call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoredVcf { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf[idx_3] }
            call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf_index[idx_3] }
            if (defined(ScoreIndelVariantAnnotations.annotations_hdf5[idx_3])) {
                call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsAnnotationsHdf5 { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.annotations_hdf5[idx_3]]) }
            }
            if (defined(ScoreIndelVariantAnnotations.scores_hdf5[idx_3])) {
                call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoresHdf5 { input: outdir = recalibration_results_dir + "/" + interval_name3, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.scores_hdf5[idx_3]]) }
            }
        }

        if (do_zarr_conversion) {
            # ConvertToZarr ran inside both a scatter and a conditional, so each element is optional.
            # We are inside the same `do_zarr_conversion` condition here, so unwrapping is safe.
            scatter (final_zarr in ConvertToZarr.zarr) {
                call FF.FinalizeToFile as FinalizeZarrs { input: outdir = outdir, keyfile = keyfile, file = select_first([final_zarr]) }
            }
        }

        # Set up variable for outputs:
        Array[String] final_genomicsdb_location = [FinalizeGenomicsDB.gcs_dir]
    }

    # Make an alias for the functionally annotated data:
    # Each value points at the finalized (bucket) location when `gcs_out_root_dir` was given,
    # and at the execution-directory file(s) otherwise.  Because these declarations sit inside a
    # conditional, they are optional when referenced from outside it.
    if (defined(snpeff_db)) {
        # NOTE(review): `annotated_vcf` and `annotated_vcf_tbi` are not referenced by the
        # `output` section of this workflow.
        File annotated_vcf = if defined(gcs_out_root_dir) then select_first([FinalizeVETSVCF.gcs_path]) else GatherRescoredVcfs.output_vcf
        File annotated_vcf_tbi = if defined(gcs_out_root_dir) then select_first([FinalizeVETSTBI.gcs_path]) else GatherRescoredVcfs.output_vcf_index

        # Finalized: a one-element array holding the destination directory.
        # Not finalized: the per-shard files from the shards where annotation ran.
        Array[String] final_snpeff_summary = if defined(gcs_out_root_dir) then [select_first([FinalizeSnpEffSummary.gcs_dir])] else select_all(FunctionallyAnnotate.snpEff_summary)
        Array[String] final_snpEff_genes = if defined(gcs_out_root_dir) then [select_first([FinalizeSnpEffGenes.gcs_dir])] else select_all(FunctionallyAnnotate.snpEff_genes)
    }

    # Resolve final joint Zarr files if we have created them:
    # Exactly one of the two inner conditionals runs, so exactly one of the two intermediate
    # arrays is defined and `select_first` picks it.
    if (do_zarr_conversion) {
        if (defined(gcs_out_root_dir)) {
            Array[String] final_joint_zarrs_finalized = select_first([FinalizeZarrs.gcs_path])
        }
        if (! defined(gcs_out_root_dir)) {
            Array[File] final_joint_zarrs_not_finalized = select_all(ConvertToZarr.zarr)
        }
        Array[File] final_joint_zarrs = select_first([final_joint_zarrs_finalized, final_joint_zarrs_not_finalized])
    }

    output {
        # Per-contig subsets of the gathered, rescored VCF.  These are taken directly from the
        # SubsetVcfPerContig call outputs.
        Array[File] vcfs_per_contig = select_all(SubsetVcfPerContig.subset_vcf)
        Array[File] vcf_indices_per_contig = select_all(SubsetVcfPerContig.subset_tbi)

        # Finalized GenomicsDB directory (as a one-element array) when finalization ran,
        # otherwise the per-shard GenomicsDB outputs of the import step.
        Array[String] genomicsDB = select_first([final_genomicsdb_location, ImportGVCFsIntoGenomicsDB.output_genomicsdb])

        # Only defined when `do_zarr_conversion` is true:
        Array[File]? joint_zarrs = final_joint_zarrs
        # Only defined when `snpeff_db` was given:
        Array[String]? snpEff_summary = final_snpeff_summary
        Array[String]? snpEff_genes = final_snpEff_genes

    }
}