version 1.0

# This script is under review. It is not actively tested or maintained at this time.
workflow SplitMultiallelics {
    # Splits the multiallelic records of one contig of a VCF.
    #
    # Steps:
    #   1. CalculateChromosomeLength reads the length of `contig` from `ref_dict`.
    #   2. The contig is divided into windows of `num_base_chunk_size` bases; each window is
    #      extracted with GenerateChunk and passed through SeparateMultiallelics.
    #   3. GatherVcfs concatenates the per-window results into a single VCF plus index.
    #
    # Inputs:
    #   input_vcf / input_vcf_index  VCF to process and its index
    #   ref_dict                     sequence dictionary used to look up the contig length
    #   contig                       name of the contig to process
    #   output_basename              prefix of the final VCF file name
    #   num_base_chunk_size          window size in bases (default 10,000,000)
    #
    # Outputs:
    #   multi_allelics_split_vcf / multi_allelics_split_vcf_index
    input {
        File input_vcf
        File input_vcf_index
        File ref_dict
        String contig
        String output_basename
        Int num_base_chunk_size = 10000000
    }

    String ubuntu_docker = "us.gcr.io/broad-dsde-methods/ubuntu:20.04"
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"

    call CalculateChromosomeLength {
        input:
            ref_dict = ref_dict,
            chrom = contig,
            ubuntu_docker = ubuntu_docker
    }

    # Divide by a Float so the quotient keeps its fractional part and ceil() rounds a
    # partial trailing window up to a full chunk.
    Float num_base_chunk_float = num_base_chunk_size
    Int num_base_chunks = ceil(CalculateChromosomeLength.chrom_length / num_base_chunk_float)

    scatter (i in range(num_base_chunks)) {
        # 1-based inclusive window [start, end]; the last window is clamped to the contig length.
        Int start_chunk_first = (i * num_base_chunk_size) + 1
        Int end_chunk_first = if (CalculateChromosomeLength.chrom_length < ((i + 1) * num_base_chunk_size)) then CalculateChromosomeLength.chrom_length else ((i + 1) * num_base_chunk_size)
        String chunk_basename_first = "generate_first_chunk_" + i

        call GenerateChunk {
            input:
                vcf = input_vcf,
                vcf_index = input_vcf_index,
                start = start_chunk_first,
                end = end_chunk_first,
                chrom = contig,
                basename = chunk_basename_first,
                gatk_docker = gatk_docker
        }

        # No trailing comma after the last input: the WDL 1.0 grammar does not allow one.
        call SeparateMultiallelics {
            input:
                vcf_input = GenerateChunk.output_vcf,
                vcf_input_index = GenerateChunk.output_vcf_index
        }
    }

    # The scattered outputs form an array indexed by chunk number, which is the order
    # passed to the gather step.
    call GatherVcfs {
        input:
            input_vcfs = SeparateMultiallelics.output_vcf,
            output_vcf_name = output_basename + ".multiallelic_split." + contig + ".vcf.gz"
    }

    output {
        File multi_allelics_split_vcf = GatherVcfs.output_vcf
        File multi_allelics_split_vcf_index = GatherVcfs.output_vcf_index
    }
}

task SeparateMultiallelics {
    # Runs `bcftools norm -m -` on a VCF and writes the result as a compressed VCF
    # named "<input basename minus .vcf.gz>.multiallelic_split.vcf.gz".
    #
    # Inputs:
    #   vcf_input        VCF to normalize
    #   vcf_input_index  index of vcf_input (declared as an input and counted in the
    #                    disk estimate, but not referenced by the command)
    #   disk_size_gb, bcftools_docker, cpu, memory_mb  runtime settings
    #
    # Outputs:
    #   output_vcf       the normalized VCF
    input {
        File vcf_input
        File vcf_input_index

        Int disk_size_gb =  ceil(3 * (size(vcf_input, "GiB") + size(vcf_input_index, "GiB"))) + 20
        String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889"
        Int cpu = 1
        Int memory_mb = 12000
    }

    # Full name of the file written by bcftools.
    String split_vcf_name = basename(vcf_input, ".vcf.gz") + ".multiallelic_split.vcf.gz"

    command <<<
        set -e -o pipefail

        bcftools norm -m - ~{vcf_input} -Oz -o ~{split_vcf_name}
    >>>

    runtime {
        docker: bcftools_docker
        cpu: cpu
        memory: "~{memory_mb} MiB"
        disks: "local-disk ~{disk_size_gb} HDD"
        preemptible: 0
        noAddress: true
    }

    output {
        File output_vcf = split_vcf_name
    }
}

task CreateVcfIndex {
    # Builds a tabix (.tbi) index for a VCF with `bcftools index -t`.
    #
    # The input is symlinked into the working directory under its own base name so
    # that the index is written next to it in the working directory.
    #
    # Inputs:
    #   vcf_input                                  VCF to index
    #   disk_size_gb, cpu, memory_mb, gatk_docker  runtime settings
    #
    # Outputs:
    #   output_vcf        the (symlinked) VCF
    #   output_vcf_index  "<vcf base name>.tbi"
    input {
        File vcf_input

        Int disk_size_gb = ceil(1.2 * size(vcf_input, "GiB")) + 10
        Int cpu = 1
        Int memory_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    # File name of the input without its directory.
    String local_vcf_name = basename(vcf_input)

    command <<<
        set -e -o pipefail

        ln -sf ~{vcf_input} ~{local_vcf_name}

        bcftools index -t ~{local_vcf_name}
    >>>

    output {
        File output_vcf = local_vcf_name
        File output_vcf_index = local_vcf_name + ".tbi"
    }

    runtime {
        docker: gatk_docker
        cpu: cpu
        memory: "~{memory_mb} MiB"
        disks: "local-disk ~{disk_size_gb} HDD"
    }
}

task CalculateChromosomeLength {
    # Looks up the length of one contig in a sequence dictionary.
    #
    # The dictionary is scanned for the @SQ line whose SN field equals `chrom`, and the
    # value of that line's LN field is printed to stdout.
    #
    # Inputs:
    #   ref_dict  sequence dictionary (tab-separated @SQ header lines)
    #   chrom     contig name to look up
    #   ubuntu_docker, memory_mb, cpu, disk_size_gb  runtime settings
    #
    # Outputs:
    #   chrom_length  value of the LN field of the matching @SQ line
    #
    # Fails (non-zero exit) when the contig is absent or its @SQ line has no LN field.
    input {
        File ref_dict
        String chrom

        String ubuntu_docker = "us.gcr.io/broad-dsde-methods/ubuntu:20.04"
        Int memory_mb = 2000
        Int cpu = 1
        Int disk_size_gb = ceil(2 * size(ref_dict, "GiB")) + 5
    }

    command <<<
        set -e -o pipefail

        # The contig name is compared as a literal string, field by field, rather than
        # being spliced into a regular expression: names such as "HLA-A*01:01:01:01" or
        # "GL000207.1" contain regex metacharacters that would not match themselves.
        # Only the first matching @SQ line is reported so stdout holds a single integer.
        awk -F'\t' -v contig="~{chrom}" '
            $1 == "@SQ" {
                name = ""
                len = ""
                for (i = 2; i <= NF; i++) {
                    tag = substr($i, 1, 3)
                    if (tag == "SN:") name = substr($i, 4)
                    if (tag == "LN:") len = substr($i, 4)
                }
                if (name == contig) {
                    if (len == "") {
                        print "ERROR: no LN field for contig " contig > "/dev/stderr"
                        failed = 1
                        exit 1
                    }
                    found = 1
                    print len
                    exit
                }
            }
            END {
                if (failed) exit 1
                if (!found) {
                    print "ERROR: contig " contig " not found in sequence dictionary" > "/dev/stderr"
                    exit 1
                }
            }
        ' "~{ref_dict}"
    >>>
    runtime {
        docker: ubuntu_docker
        disks: "local-disk ${disk_size_gb} HDD"
        memory: "${memory_mb} MiB"
        cpu: cpu
        preemptible: 3
    }
    output {
        Int chrom_length = read_int(stdout())
    }
}

task GenerateChunk {
    # Extracts one genomic window from a VCF with GATK SelectVariants.
    #
    # Inputs:
    #   start, end   bounds of the window, passed to -L as "<chrom>:<start>-<end>"
    #   chrom        contig name
    #   basename     prefix of the output file ("<basename>.vcf.gz")
    #   vcf          source VCF (localization is optional; see parameter_meta)
    #   vcf_index    index of the source VCF (localization is optional)
    #   disk_size_gb, cpu, memory_mb, gatk_docker  runtime settings
    #
    # Outputs:
    #   output_vcf        "<basename>.vcf.gz"
    #   output_vcf_index  "<basename>.vcf.gz.tbi"
    parameter_meta {
        vcf: {
                 description: "vcf",
                 localization_optional: true
             }
        vcf_index: {
                       description: "vcf index",
                       localization_optional: true
                   }
    }

    input {
        Int start
        Int end
        String chrom
        String basename
        File vcf
        File vcf_index

        Int disk_size_gb = ceil(2 * size(vcf, "GiB")) + 10
        Int cpu = 1
        Int memory_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    # JVM heap bounds, kept below the machine memory to leave room for non-heap usage.
    Int initial_heap_mb = memory_mb - 1500
    Int maximum_heap_mb = memory_mb - 1000

    # Name of the file SelectVariants writes.
    String chunk_vcf_name = "~{basename}.vcf.gz"

    command <<<
        set -euo pipefail

        # Filtered records are dropped (--exclude-filtered true). The POS expression keeps
        # only records positioned at or after the window start -- presumably so a record
        # that merely overlaps the window from upstream is not emitted by two chunks.
        gatk --java-options "-Xms~{initial_heap_mb}m -Xmx~{maximum_heap_mb}m" \
        SelectVariants \
        -V ~{vcf} \
        -L ~{chrom}:~{start}-~{end} \
        -O ~{chunk_vcf_name} \
        --exclude-filtered true \
        -select 'POS >= ~{start}'
    >>>

    output {
        File output_vcf = chunk_vcf_name
        File output_vcf_index = chunk_vcf_name + ".tbi"
    }

    runtime {
        docker: gatk_docker
        cpu: cpu
        memory: "~{memory_mb} MiB"
        disks: "local-disk ~{disk_size_gb} HDD"
    }
}

task GatherVcfs {
    # Concatenates a list of VCFs into one file with GATK GatherVcfsCloud and indexes
    # the result with tabix.
    #
    # Inputs:
    #   input_vcfs       VCFs to gather, in the order given (localization is optional)
    #   output_vcf_name  file name of the gathered VCF
    #   disk_size_gb, machine_mem_mb, gatk_docker  runtime settings
    #
    # Outputs:
    #   output_vcf        the gathered VCF
    #   output_vcf_index  "<output_vcf_name>.tbi"
    parameter_meta {
        input_vcfs: {
                        localization_optional: true
                    }
    }

    input {
        Array[File] input_vcfs
        String output_vcf_name
        Int disk_size_gb = ceil(1.5 * size(input_vcfs, "GiB")) + 10
        Int machine_mem_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    # JVM heap bounds, kept below the machine memory.
    Int initial_heap_mb = machine_mem_mb - 1500
    Int maximum_heap_mb = machine_mem_mb - 1000

    command <<<
        set -euo pipefail

        # --ignore-safety-checks is passed because it makes a big performance difference.
        # It turns off the expensive checks that all file headers list the same genotyped
        # samples and that the files are ordered by the position of their first record.
        gatk --java-options "-Xms~{initial_heap_mb}m -Xmx~{maximum_heap_mb}m" \
        GatherVcfsCloud \
        --ignore-safety-checks \
        --gather-type BLOCK \
        --input ~{sep=" --input " input_vcfs} \
        --output ~{output_vcf_name}

        tabix ~{output_vcf_name}
    >>>

    output {
        File output_vcf = output_vcf_name
        File output_vcf_index = output_vcf_name + ".tbi"
    }

    runtime {
        docker: gatk_docker
        cpu: "1"
        memory: machine_mem_mb + " MiB"
        disks: "local-disk ~{disk_size_gb} HDD"
        bootDiskSizeGb: 15
        preemptible: 0
    }
}