version 1.0

## Low-pass imputation workflow built on GLIMPSE2.
##
## Two entry modes (at least one must be provided):
##   * input_vcf (+ index): a pre-made genotype-likelihood VCF that is imputed directly.
##   * crams (+ indices): low-pass CRAMs that are first genotyped at the reference-panel
##     sites with bcftools (mpileup -> call -> norm, merged across batches), then imputed.
##
## Per contig, the workflow phases against pre-chunked reference panels with
## GLIMPSE2_phase, ligates the chunks, and splits the result into a variant-only VCF
## and a hom-ref sites-only VCF, which are gathered genome-wide and indexed/md5'd.
workflow Glimpse2LowPassImputation {
    String pipeline_version = "0.0.6"
    String quota_consumed_version = "0.0.1"
    String input_qc_version = "1.0.0"

    input {
        Array[String] contigs

        # This is the path to a directory that contains sites vcf, sites table, and
        # reference chunks file. Should end with a "/".
        String reference_panel_prefix

        # Pre-made genotype-likelihood VCF mode.
        File? input_vcf
        File? input_vcf_index

        # CRAM-calling mode.
        Array[File]? crams
        Array[File]? cram_indices
        Array[String] sample_ids

        File fasta
        File fasta_index
        String output_basename
        File ref_dict

        Boolean impute_reference_only_variants = false
        Boolean call_indels = false

        # Batch size used when calling SplitIntoBatches to make variant calls from the crams.
        Int calling_batch_size = 100

        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.0.0"
        String glimpse_docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c"
    }

    # Sample count feeds the per-shard memory estimate in ComputeShardsAndMemoryPerShard.
    if (defined(input_vcf)) {
        call CountSamples {
            input:
                vcf = select_first([input_vcf])
        }
    }
    Int n_samples = select_first([CountSamples.nSamples, length(select_first([crams]))])

    if (defined(crams)) {
        if (length(select_first([crams])) > 1) {
            call SplitIntoBatches {
                input:
                    batch_size = calling_batch_size,
                    crams = select_first([crams]),
                    cram_indices = select_first([cram_indices]),
                    sample_ids = sample_ids
            }
        }
    }

    scatter (contig in contigs) {
        # Per-contig reference-panel resources laid out under reference_panel_prefix.
        File sites_vcf = reference_panel_prefix + "sites." + contig + ".vcf.gz"
        File sites_vcf_index = reference_panel_prefix + "sites." + contig + ".vcf.gz.tbi"
        File sites_table = reference_panel_prefix + "sites_table." + contig + ".gz"
        File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".gz.tbi"
        File reference_chunks = reference_panel_prefix + "reference_chunks." + contig + ".txt"

        if (defined(crams)) {
            # SplitIntoBatches is skipped for a single CRAM; fall back to one batch
            # containing all inputs in that case.
            Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]])
            Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]])
            Array[Array[String]] sample_ids_batches = select_first([SplitIntoBatches.sample_ids_batches, [sample_ids]])

            scatter (i in range(length(crams_batches))) {
                call BcftoolsMpileup {
                    input:
                        crams = crams_batches[i],
                        cram_indices = cram_indices_batches[i],
                        sample_ids = sample_ids_batches[i],
                        fasta = fasta,
                        fasta_index = fasta_index,
                        call_indels = call_indels,
                        sites_vcf = sites_vcf
                }

                call BcftoolsCall {
                    input:
                        mpileup_bcf = BcftoolsMpileup.output_bcf,
                        sites_table = sites_table,
                        sites_table_index = sites_table_index
                }

                call BcftoolsNorm {
                    input:
                        calls_bcf = BcftoolsCall.output_bcf
                }
            }

            if (length(BcftoolsNorm.output_vcf) > 1) {
                call BcftoolsMerge {
                    input:
                        vcfs = BcftoolsNorm.output_vcf,
                        vcf_indices = BcftoolsNorm.output_vcf_index,
                        output_basename = output_basename
                }
            }

            # Calls derived from the CRAMs: the merged VCF when there were multiple
            # batches, otherwise the single batch's normalized VCF.
            File called_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0]])
            File called_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0]])
        }

        # NOTE(review): these declarations were previously scoped inside the
        # `if (defined(crams))` block, which left GlimpsePhase with no input VCF
        # in the input_vcf-only mode even though input_vcf was listed as a
        # select_first fallback. They now resolve outside the conditional.
        File phase_input_vcf = select_first([called_vcf, input_vcf])
        File phase_input_vcf_index = select_first([called_vcf_index, input_vcf_index])

        ## this task is used to grab the reference chunk but does not affect memory usage of glimpsePhase.
        ## still tbd which method makes the most sense cost wise
        call ComputeShardsAndMemoryPerShard {
            input:
                reference_chunks_memory = reference_chunks,
                n_samples = n_samples
        }

        scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) {
            call GlimpsePhase {
                input:
                    reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index],
                    input_vcf = phase_input_vcf,
                    input_vcf_index = phase_input_vcf_index,
                    impute_reference_only_variants = impute_reference_only_variants,
                    call_indels = call_indels,
                    sample_ids = sample_ids,
                    fasta = fasta,
                    fasta_index = fasta_index,
                    docker = glimpse_docker
            }
        }

        call GlimpseLigate {
            input:
                imputed_chunks = GlimpsePhase.imputed_vcf,
                imputed_chunks_indices = GlimpsePhase.imputed_vcf_index,
                output_basename = output_basename,
                ref_dict = ref_dict,
                docker = glimpse_docker
        }

        # Coverage metrics are only emitted by GlimpsePhase for some runs (File? output);
        # drop the missing ones.
        Array[File] contig_coverage_metrics = select_all(GlimpsePhase.coverage_metrics)

        call SelectVariantRecordsOnly {
            input:
                vcf = GlimpseLigate.imputed_vcf,
                vcf_index = GlimpseLigate.imputed_vcf_index,
                basename = output_basename + "." + contig + ".imputed.only_variants"
        }

        call CreateHomRefSitesOnlyVcf {
            input:
                vcf = GlimpseLigate.imputed_vcf,
                vcf_index = GlimpseLigate.imputed_vcf_index,
                basename = output_basename + "." + contig + ".imputed.only_hom_ref.sites_only"
        }
    }

    call GatherVcfsNoIndex {
        input:
            input_vcfs = SelectVariantRecordsOnly.output_vcf,
            output_vcf_basename = output_basename + ".imputed",
            gatk_docker = gatk_docker
    }

    call CreateVcfIndexAndMd5 {
        input:
            vcf_input = GatherVcfsNoIndex.output_vcf,
            gatk_docker = gatk_docker,
            preemptible = 0
    }

    call GatherVcfsNoIndex as GatherVcfsNoIndexHomRefOnly {
        input:
            input_vcfs = CreateHomRefSitesOnlyVcf.output_vcf,
            output_vcf_basename = output_basename + ".imputed.hom_ref_sites_only",
            gatk_docker = gatk_docker
    }

    call CreateVcfIndexAndMd5 as CreateVcfIndexAndMd5HomRefOnly {
        input:
            vcf_input = GatherVcfsNoIndexHomRefOnly.output_vcf,
            gatk_docker = gatk_docker,
            preemptible = 0
    }

    Array[File] genome_coverage_metrics = flatten(contig_coverage_metrics)
    if (length(genome_coverage_metrics) > 0) {
        call CombineCoverageMetrics {
            input:
                cov_metrics = genome_coverage_metrics,
                output_basename = output_basename
        }
    }

    call CollectQCMetrics {
        input:
            imputed_vcf = GatherVcfsNoIndex.output_vcf,
            output_basename = output_basename
    }

    output {
        File imputed_vcf = CreateVcfIndexAndMd5.output_vcf
        File imputed_vcf_index = CreateVcfIndexAndMd5.output_vcf_index
        File imputed_vcf_md5sum = CreateVcfIndexAndMd5.output_vcf_md5sum
        File imputed_hom_ref_sites_only_vcf = CreateVcfIndexAndMd5HomRefOnly.output_vcf
        # TODO(review): "inex" is a typo for "index"; retained so existing
        # configurations that reference this output name keep working.
        File imputed_hom_ref_sites_only_vcf_inex = CreateVcfIndexAndMd5HomRefOnly.output_vcf_index
        File imputed_hom_ref_sites_only_vcf_md5 = CreateVcfIndexAndMd5HomRefOnly.output_vcf_md5sum
        File qc_metrics = CollectQCMetrics.qc_metrics
        File? coverage_metrics = CombineCoverageMetrics.coverage_metrics
    }
}

# Splits the CRAM/index/sample-id triples into batches of batch_size for
# per-batch bcftools calling. Batches are emitted as parallel JSON arrays-of-arrays.
task SplitIntoBatches {
    input {
        Int batch_size
        Array[String] crams
        Array[String] cram_indices
        Array[String] sample_ids
    }

    command <<<
        set -euo pipefail

        # NOTE(review): this was previously `cat < script.py`, which reads a file
        # that does not exist yet and fails; it must be a heredoc writing script.py.
        cat <<'EOF' > script.py
import json

batch_size = ~{batch_size}
crams = ['~{sep="', '" crams}']
cram_indices = ['~{sep="', '" cram_indices}']
sample_ids = ['~{sep="', '" sample_ids}']

crams_batches = [crams[i:i + batch_size] for i in range(0, len(crams), batch_size)]
cram_indices_batches = [cram_indices[i:i + batch_size] for i in range(0, len(cram_indices), batch_size)]
sample_ids_batches = [sample_ids[i:i + batch_size] for i in range(0, len(sample_ids), batch_size)]

with open('crams.json', 'w') as json_file:
    json.dump(crams_batches, json_file)
with open('cram_indices.json', 'w') as json_file:
    json.dump(cram_indices_batches, json_file)
with open('sample_ids.json', 'w') as json_file:
    json.dump(sample_ids_batches, json_file)
EOF
        python3 script.py
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
        cpu: 1
        disks: "local-disk 10 HDD"
        memory: "1 GiB"
        preemptible: 3
        noAddress: true
    }

    output {
        Array[Array[String]] crams_batches = read_json('crams.json')
        Array[Array[String]] cram_indices_batches = read_json('cram_indices.json')
        Array[Array[String]] sample_ids_batches = read_json('sample_ids.json')
    }
}

# Reads the per-contig reference-chunks TSV (columns: contig, reference_shard,
# base_gb, slope_per_sample_gb), emits the shard paths to phase and a linear
# per-shard memory estimate (base + slope * n_samples, ceil'd and capped at 256 GB).
# NOTE(review): mem_gb_per_chunk is computed but not currently consumed by
# GlimpsePhase (see the comment at the call site in the workflow).
task ComputeShardsAndMemoryPerShard {
    input {
        File reference_chunks_memory
        Int n_samples
    }

    command <<<
        python3 << EOF
import pandas as pd
import numpy as np

df = pd.read_csv('~{reference_chunks_memory}', sep='\t', header=None,
                 names=['contig', 'reference_shard', 'base_gb', 'slope_per_sample_gb'])

# write out reference shards to process
df['reference_shard'].to_csv('reference_shard_file_paths.tsv', sep='\t', index=False, header=None)

# calculate memory usage and save to file
df['mem_gb'] = df['base_gb'] + df['slope_per_sample_gb'] * ~{n_samples}
df['mem_gb'] = df['mem_gb'].apply(lambda x: min(256, int(np.ceil(x))))  # cap at 256 GB
df['mem_gb'].to_csv('memory_per_chunk.tsv', sep='\t', index=False, header=None)
EOF
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
        noAddress: true
    }

    output {
        Array[String] reference_chunk_file_paths = read_lines("reference_shard_file_paths.tsv")
        Array[Int] mem_gb_per_chunk = read_lines("memory_per_chunk.tsv")
    }
}

# Runs bcftools mpileup over a batch of CRAMs, restricted to the reference-panel
# sites, emitting an uncompressed BCF of pileup likelihoods.
task BcftoolsMpileup {
    input {
        Array[File] crams
        # Not referenced in the command, but localized alongside the CRAMs.
        Array[File] cram_indices
        File fasta
        File fasta_index
        Boolean call_indels
        Array[String] sample_ids
        File sites_vcf

        Int seed = 12345
        Int mem_gb = 6
        Int cpu = 1
        Int preemptible = 0
        Int max_retries = 3
    }

    Int disk_size_gb = ceil(1.5*size(crams, "GiB") + size(fasta, "GiB") + size(sites_vcf, "GiB")) + 10

    command <<<
        set -xeuo pipefail

        # Build the "* <cram> <sample_id>" mapping passed to mpileup via -G so
        # output samples carry the requested names.
        crams=(~{sep=' ' crams})
        sample_ids=(~{sep=' ' sample_ids})
        for i in "${!crams[@]}"; do
            echo "* ${crams[$i]} ${sample_ids[$i]}" >> sample_name_mapping.txt
        done

        # -I skips indels unless call_indels is set; -T limits to the panel sites.
        bcftools mpileup -f ~{fasta} ~{if !call_indels then "-I" else ""} -G sample_name_mapping.txt --seed ~{seed} -E -a 'FORMAT/DP,FORMAT/AD' -T ~{sites_vcf} -Ou -o mpileup.bcf ~{sep=" " crams}
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/vcfeval_docker:v1.1"
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        noAddress: true
    }

    output {
        File output_bcf = "mpileup.bcf"
    }
}

# Calls genotypes from the mpileup BCF constrained to the panel alleles
# (-C alleles with the sites table as -T).
task BcftoolsCall {
    input {
        File mpileup_bcf
        File sites_table
        File sites_table_index

        Int mem_gb = 12
        Int cpu = 1
        Int preemptible = 3
        Int max_retries = 3
    }

    Int disk_size_gb = ceil(3*size(mpileup_bcf, "GiB") + size(sites_table, "GiB")) + 10

    command <<<
        set -xeuo pipefail

        bcftools call -Aim -C alleles -T ~{sites_table} -Ou ~{mpileup_bcf} -o calls.bcf
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/vcfeval_docker:v1.1"
        disks: "local-disk " + disk_size_gb + " SSD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        noAddress: true
    }

    output {
        File output_bcf = "calls.bcf"
    }
}

# Splits multiallelic records into biallelics (norm -m -both), bgzips and indexes.
task BcftoolsNorm {
    input {
        File calls_bcf

        Int mem_gb = 6
        Int cpu = 1
        Int preemptible = 3
        Int max_retries = 3
    }

    Int disk_size_gb = ceil(3*size(calls_bcf, "GiB")) + 10

    command <<<
        set -xeuo pipefail

        bcftools norm -m -both -Oz -o normalized.vcf.gz ~{calls_bcf}
        bcftools index -t normalized.vcf.gz
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/vcfeval_docker:v1.1"
        disks: "local-disk " + disk_size_gb + " SSD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        noAddress: true
    }

    output {
        File output_vcf = "normalized.vcf.gz"
        File output_vcf_index = "normalized.vcf.gz.tbi"
    }
}

# Merges the per-batch normalized VCFs (disjoint sample sets) into one VCF.
task BcftoolsMerge {
    input {
        Array[File] vcfs
        Array[File] vcf_indices
        String output_basename

        Int mem_gb = 6
        Int cpu = 1
        Int preemptible = 0
        Int max_retries = 3
    }

    Int disk_size_gb = ceil(3*size(vcfs, "GiB")) + 50

    command <<<
        set -euo pipefail

        bcftools merge -O z -o ~{output_basename}.bcftools.merged.vcf.gz ~{sep=" " vcfs}
        bcftools index -t ~{output_basename}.bcftools.merged.vcf.gz
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/bcftools:v1.3"
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        noAddress: true
    }

    output {
        File merged_vcf = "~{output_basename}.bcftools.merged.vcf.gz"
        File merged_vcf_index = "~{output_basename}.bcftools.merged.vcf.gz.tbi"
    }
}

# Runs GLIMPSE2_phase on one reference chunk. Input can be a GL VCF (--input-gl)
# and/or streamed CRAMs (--bam-list); CRAMs and the VCF are read directly from
# GCS (localization_optional) using an OAuth token. The high preemptible count is
# viable because progress is persisted via the checkpoint.bin checkpointFile.
task GlimpsePhase {
    input {
        File? input_vcf
        File? input_vcf_index
        Array[File]? crams
        Array[File]? cram_indices
        Array[String] sample_ids
        File? fasta
        File? fasta_index
        File reference_chunk

        Boolean impute_reference_only_variants
        Boolean call_indels
        Int? n_burnin
        Int? n_main
        Int? effective_population_size

        Int mem_gb = 16
        Int cpu = 4
        Int disk_size_gb = ceil(2.2 * size(input_vcf, "GiB") + size(reference_chunk, "GiB") + 0.003 * length(select_first([crams, []])) + 10)
        Int preemptible = 30
        Int max_retries = 3
        String docker
    }

    parameter_meta {
        crams: {
            localization_optional: true
        }
        cram_indices: {
            localization_optional: true
        }
        input_vcf: {
            localization_optional: true
        }
        input_vcf_index: {
            localization_optional: true
        }
    }

    String bam_file_list_input = if defined(crams) then "--bam-list crams.list" else ""

    command <<<
        set -euo pipefail

        export GCS_OAUTH_TOKEN=$(/root/google-cloud-sdk/bin/gcloud auth application-default print-access-token)

        cram_paths=( ~{sep=" " crams} )
        cram_index_paths=( ~{sep=" " cram_indices} )
        sample_ids=( ~{sep=" " sample_ids} )

        # htslib keys remote files by basename, so duplicate basenames collide.
        duplicate_cram_filenames=$(printf "%s\n" "${cram_paths[@]}" | xargs -I {} basename {} | sort | uniq -d)
        if [ ! -z "$duplicate_cram_filenames" ]; then
            echo "ERROR: The input CRAMs contain multiple files with the same basename, which leads to an error due to the way that htslib is implemented. Duplicate filenames:"
            printf "%s\n" "${duplicate_cram_filenames[@]}"
            exit 1
        fi

        # Build the bam list, pairing each CRAM with its index (##idx## syntax)
        # when indices were supplied.
        if ~{if defined(cram_indices) then "true" else "false"}; then
            for i in "${!cram_paths[@]}" ; do
                echo -e "${cram_paths[$i]}##idx##${cram_index_paths[$i]} ${sample_ids[$i]}" >> crams.list
            done
        else
            for i in "${!cram_paths[@]}"; do
                echo -e "${cram_paths[$i]} ${sample_ids[$i]}" >> crams.list
            done
        fi

        cmd="/bin/GLIMPSE2_phase \
        ~{"--input-gl " + input_vcf} \
        --reference ~{reference_chunk} \
        --output phase_output.bcf \
        --threads ~{cpu} \
        ~{if impute_reference_only_variants then "--impute-reference-only-variants" else ""} ~{if call_indels then "--call-indels" else ""} \
        ~{"--burnin " + n_burnin} ~{"--main " + n_main} \
        ~{"--ne " + effective_population_size} \
        ~{bam_file_list_input} \
        ~{"--fasta " + fasta} \
        --checkpoint-file-out checkpoint.bin"

        # Resume from a restored checkpoint after preemption, if present.
        if [ -s "checkpoint.bin" ]; then
            cmd="$cmd --checkpoint-file-in checkpoint.bin"
        fi

        #check for read error which corresponds exactly to end of cram/bam block.
        #This currently triggers a warning message from htslib, but doesn't return any error.
        #We need to make sure that stderr is maintained since cromwell looks for oom strings
        #in stderr
        eval $cmd 2> >(tee glimpse_stderr.log >&2)

        if grep -q "EOF marker is absent" glimpse_stderr.log; then
            echo "An input file appears to be truncated. This may be either a truly truncated file which needs to be fixed, or a networking error which can just be retried."
            exit 1
        fi
    >>>

    runtime {
        docker: docker
        disks: "local-disk " + disk_size_gb + " SSD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        checkpointFile: "checkpoint.bin"
        noAddress: true
    }

    output {
        File imputed_vcf = "phase_output.bcf"
        File imputed_vcf_index = "phase_output.bcf.csi"
        File? coverage_metrics = "phase_output_stats_coverage.txt.gz"
    }
}

# Ligates the per-chunk phased BCFs into a single per-contig VCF and rewrites the
# header with the canonical sequence dictionary from ref_dict.
task GlimpseLigate {
    input {
        Array[File] imputed_chunks
        Array[File] imputed_chunks_indices
        String output_basename
        File ref_dict

        Int mem_gb = 4
        Int cpu = 2
        Int disk_size_gb = ceil(3 * size(imputed_chunks, "GiB") + 100)
        Int preemptible = 0
        Int max_retries = 3
        String docker
    }

    command <<<
        set -xeuo pipefail

        NPROC=$(nproc)
        echo "nproc reported ${NPROC} CPUs, using that number as the threads argument for GLIMPSE."

        /bin/GLIMPSE2_ligate --input ~{write_lines(imputed_chunks)} --output ligated.vcf.gz --threads ${NPROC}

        # Set correct reference dictionary
        bcftools view -h --no-version ligated.vcf.gz > old_header.vcf
        java -jar /picard.jar UpdateVcfSequenceDictionary -I old_header.vcf --SD ~{ref_dict} -O new_header.vcf
        bcftools reheader -h new_header.vcf -o ~{output_basename}.imputed.vcf.gz ligated.vcf.gz
        tabix ~{output_basename}.imputed.vcf.gz
    >>>

    runtime {
        docker: docker
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
        noAddress: true
    }

    output {
        File imputed_vcf = "~{output_basename}.imputed.vcf.gz"
        File imputed_vcf_index = "~{output_basename}.imputed.vcf.gz.tbi"
    }
}

# Computes per-sample QC metrics over the gathered imputed VCF with Hail's
# sample_qc and writes them as a flat TSV (one row per sample).
task CollectQCMetrics {
    input {
        File imputed_vcf
        String output_basename

        Int preemptible = 0
        String docker = "hailgenetics/hail:0.2.126-py3.11"
        Int cpu = 4
        Int mem_gb = 8
    }

    parameter_meta {
        imputed_vcf: {
            localization_optional: true
        }
    }

    Int disk_size_gb = ceil(2*size(imputed_vcf, "GiB") + 50)

    command <<<
        set -euo pipefail

        cat <<'EOF' > script.py
import hail as hl
import pandas as pd

# Calculate metrics
hl.init(default_reference='GRCh38', idempotent=True)
vcf = hl.import_vcf('~{imputed_vcf}', force_bgz=True)
qc = hl.sample_qc(vcf)
qc_pd = qc.cols().flatten() \
    .rename({'sample_qc.' + col: col for col in list(qc['sample_qc'])}) \
    .rename({'s': 'sample_id'}) \
    .to_pandas()
qc_pd.to_csv('~{output_basename}.qc_metrics.tsv', sep='\t', index=False, float_format='%.4f')
EOF
        python3 script.py
    >>>

    runtime {
        docker: docker
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
        noAddress: true
    }

    output {
        File qc_metrics = "~{output_basename}.qc_metrics.tsv"
    }
}

# Counts the samples in a VCF (bcftools query -l | wc -l).
task CountSamples {
    input {
        File vcf

        String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889"
        Int cpu = 1
        Int memory_mb = 3000
        Int disk_size_gb = 10 + ceil(size(vcf, "GiB"))
    }

    command <<<
        # pipefail so a failing bcftools does not silently report 0 samples
        set -e -o pipefail

        bcftools query -l ~{vcf} | wc -l
    >>>

    runtime {
        docker: bcftools_docker
        disks: "local-disk ${disk_size_gb} HDD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        noAddress: true
    }

    output {
        Int nSamples = read_int(stdout())
    }
}

# Concatenates the per-chunk GLIMPSE coverage metrics into a single
# machine-readable table, prefixing each row with its chunk index.
task CombineCoverageMetrics {
    input {
        Array[File] cov_metrics
        String output_basename
    }

    command <<<
        set -euo pipefail

        cov_files=( ~{sep=" " cov_metrics} )

        for i in "${!cov_files[@]}"; do
            # Keep the header only from the first file (skip 1 line); skip 2 for the rest.
            if [ $i -eq 0 ]; then
                n_skip=1
                echo 'Chunk' > chunk_col.txt
            else
                n_skip=2
            fi

            # glimpse coverage metrics are formatted to be human readable in a command line, not machine readable or consistent. ie, number of tabs
            # are variable between columns depending on length of sample names, odd things like that. We want these to be machine readable tables,
            # so need to fix this.
            zcat ${cov_files[$i]} | tail -n +$((n_skip + 1)) | sed s/%//g | sed s/"No data"/"No data pct"/g | sed s/\\t\\t/\\t/g >> cov_file.txt
            n_lines_cov=$(< cov_file.txt wc -l)
            n_lines_chunk=$(< chunk_col.txt wc -l)
            n_lines_out=$((n_lines_cov-n_lines_chunk))
            echo 'n_lines_out=' ${n_lines_out}
            echo ${cov_files[$i]}
            # Repeat the chunk index once per data row appended above.
            { yes ${i} || :; } | head -n ${n_lines_out} >> chunk_col.txt
        done

        paste chunk_col.txt cov_file.txt > ~{output_basename}.coverage_metrics.txt
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/ubuntu:20.04"
        noAddress: true
    }

    output {
        File coverage_metrics = "~{output_basename}.coverage_metrics.txt"
    }
}

# Concatenates per-contig VCFs with GATK GatherVcfs (no index is produced;
# CreateVcfIndexAndMd5 handles indexing downstream).
task GatherVcfsNoIndex {
    input {
        Array[File] input_vcfs
        String output_vcf_basename

        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
        Int cpu = 2
        Int memory_mb = 10000
        Int disk_size_gb = ceil(3*size(input_vcfs, "GiB")) + 10
    }

    # Leave headroom below the container limit for non-heap JVM overhead.
    Int command_mem = memory_mb - 1500
    Int max_heap = memory_mb - 1000

    command <<<
        set -e -o pipefail

        gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \
            GatherVcfs \
            -I ~{sep=' -I ' input_vcfs} \
            --REORDER_INPUT_BY_FIRST_VARIANT \
            -O ~{output_vcf_basename}.vcf.gz
    >>>

    runtime {
        docker: gatk_docker
        disks: "local-disk ${disk_size_gb} SSD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        maxRetries: 1
        noAddress: true
    }

    output {
        File output_vcf = "~{output_vcf_basename}.vcf.gz"
    }
}

# Produces a tabix index and an md5 checksum for a bgzipped VCF.
task CreateVcfIndexAndMd5 {
    input {
        File vcf_input

        Int disk_size_gb = ceil(1.1*size(vcf_input, "GiB")) + 10
        Int cpu = 1
        Int memory_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
        Int preemptible = 3
    }

    String vcf_basename = basename(vcf_input, ".vcf.gz")

    command <<<
        set -e -o pipefail

        # Symlink rather than copy so disk only needs to hold the index and md5.
        ln -sf ~{vcf_input} ~{vcf_basename}.vcf.gz
        bcftools index -t ~{vcf_basename}.vcf.gz
        md5sum ~{vcf_basename}.vcf.gz | awk '{ print $1 }' > ~{vcf_basename}.md5sum
    >>>

    runtime {
        docker: gatk_docker
        disks: "local-disk ${disk_size_gb} SSD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        preemptible: preemptible
        maxRetries: 1
        noAddress: true
    }

    output {
        File output_vcf = "~{vcf_basename}.vcf.gz"
        File output_vcf_index = "~{vcf_basename}.vcf.gz.tbi"
        File output_vcf_md5sum = "~{vcf_basename}.md5sum"
    }
}

# Keeps only records where at least one sample carries an alt allele
# (i.e. drops sites that are hom-ref across all samples).
task SelectVariantRecordsOnly {
    input {
        File vcf
        File vcf_index
        String basename

        Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10
        Int cpu = 1
        Int memory_mb = 3000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    command {
        set -e -o pipefail

        # keep alt sites (i.e. remove hom ref sites)
        bcftools view -i 'GT[*]="alt"' -Oz -o ~{basename}.vcf.gz ~{vcf}
    }

    runtime {
        docker: gatk_docker
        disks: "local-disk ${disk_size_gb} SSD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        maxRetries: 1
        preemptible: 3
        noAddress: true
    }

    output {
        File output_vcf = "~{basename}.vcf.gz"
    }
}

# Builds a sites-only VCF (first 8 columns, no genotypes) containing only the
# records where no sample carries an alt allele — the complement of
# SelectVariantRecordsOnly.
task CreateHomRefSitesOnlyVcf {
    input {
        File vcf
        File vcf_index
        String basename

        Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10
        Int cpu = 1
        Int memory_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    command {
        set -e -o pipefail

        # create header with only first 8 columns and store that
        bcftools view -h ~{vcf} | grep "^##" > ~{basename}.vcf
        bcftools view -h ~{vcf} | grep -v "^##" | cut -f1-8 >> ~{basename}.vcf

        # append first 8 columns of hom ref sites to previously stored header
        bcftools query -e 'GT[*]="alt"' -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%INFO\n' ~{vcf} >> ~{basename}.vcf
        bgzip ~{basename}.vcf
    }

    runtime {
        docker: gatk_docker
        disks: "local-disk ${disk_size_gb} SSD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        maxRetries: 1
        preemptible: 3
        noAddress: true
    }

    output {
        File output_vcf = "~{basename}.vcf.gz"
    }
}