version 1.0

import "../../../../../tasks/wdl/GermlineVariantDiscovery.wdl" as Calling
import "../../../../../tasks/wdl/Qc.wdl" as QC
import "../../../../../tasks/wdl/Utilities.wdl" as Utils
import "../../../../../tasks/wdl/BamProcessing.wdl" as BamProcessing
import "../../../../../tasks/wdl/DragenTasks.wdl" as DragenTasks

workflow VariantCalling {


  String pipeline_version = "2.2.8"


  input {
    Boolean run_dragen_mode_variant_calling = false
    Boolean use_spanning_event_genotyping = true
    File calling_interval_list
    File evaluation_interval_list
    Int haplotype_scatter_count
    Int break_bands_at_multiples_of
    Float? contamination
    File input_bam
    File input_bam_index
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File? ref_str
    File dbsnp_vcf
    File dbsnp_vcf_index
    String base_file_name
    String final_vcf_base_name
    Int agg_preemptible_tries
    Boolean make_gvcf = true
    Boolean make_bamout = false
    Boolean use_gatk3_haplotype_caller = false
    Boolean skip_reblocking = false
    Boolean use_dragen_hard_filtering = false
    String cloud_provider
  }

  # docker images
  String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
  String gatk_docker_azure = "terrapublic.azurecr.io/gatk:4.6.1.0"
  String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure

  String gatk_1_3_docker_gcp = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384"
  String gatk_1_3_docker_azure = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384"
  String gatk_1_3_docker = if cloud_provider == "gcp" then gatk_1_3_docker_gcp else gatk_1_3_docker_azure

  String picard_python_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039"
  String picard_python_docker_azure = "dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039"
  String picard_python_docker = if cloud_provider == "gcp" then picard_python_docker_gcp else picard_python_docker_azure

  String picard_cloud_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10"
  String picard_cloud_docker_azure = "dsppipelinedev.azurecr.io/picard-cloud:2.26.10"
  String picard_cloud_docker = if cloud_provider == "gcp" then picard_cloud_docker_gcp else picard_cloud_docker_azure


  # make sure either gcp or azr is supplied as cloud_provider input
  if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
    call Utils.ErrorWithMessage as ErrorMessageIncorrectInput {
      input:
        message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
    }
  }

  parameter_meta {
    make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details."
    run_dragen_mode_variant_calling: "Run variant calling using the DRAGEN-GATK pipeline, false by default."
  }

  if (run_dragen_mode_variant_calling) {
    call DragenTasks.CalibrateDragstrModel as DragstrAutoCalibration {
      input:
        ref_fasta = ref_fasta,
        ref_fasta_idx = ref_fasta_index,
        ref_dict = ref_dict,
        alignment = input_bam,
        alignment_index = input_bam_index,
        str_table_file = select_first([ref_str]),
        docker = gatk_docker
    }
  }


  # Break the calling interval_list into sub-intervals
  # Perform variant calling on the sub-intervals, and then gather the results
  call Utils.ScatterIntervalList as ScatterIntervalList {
    input:
      interval_list = calling_interval_list,
      scatter_count = haplotype_scatter_count,
      break_bands_at_multiples_of = break_bands_at_multiples_of,
      docker = picard_python_docker
  }

  # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
  # If we take the number we are scattering by and reduce by 20 we will have enough disk space
  # to account for the fact that the data is quite uneven across the shards.
  Int potential_hc_divisor = ScatterIntervalList.interval_count - 20
  Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1

  # Call variants in parallel over WGS calling intervals
  scatter (scattered_interval_list in ScatterIntervalList.out) {

    if (use_gatk3_haplotype_caller) {
      call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 {
        input:
          input_bam = input_bam,
          input_bam_index = input_bam_index,
          interval_list = scattered_interval_list,
          gvcf_basename = base_file_name,
          ref_dict = ref_dict,
          ref_fasta = ref_fasta,
          ref_fasta_index = ref_fasta_index,
          contamination = contamination,
          preemptible_tries = agg_preemptible_tries,
          hc_scatter = hc_divisor,
          docker = gatk_1_3_docker
      }
    }

    if (!use_gatk3_haplotype_caller) {
      # Generate GVCF by interval
      call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 {
        input:
          contamination = if run_dragen_mode_variant_calling then 0 else contamination,
          input_bam = input_bam,
          input_bam_index = input_bam_index,
          interval_list = scattered_interval_list,
          vcf_basename = base_file_name,
          ref_dict = ref_dict,
          ref_fasta = ref_fasta,
          ref_fasta_index = ref_fasta_index,
          hc_scatter = hc_divisor,
          make_gvcf = make_gvcf,
          make_bamout = make_bamout,
          run_dragen_mode_variant_calling = run_dragen_mode_variant_calling,
          use_dragen_hard_filtering = use_dragen_hard_filtering,
          use_spanning_event_genotyping = use_spanning_event_genotyping,
          dragstr_model = DragstrAutoCalibration.dragstr_model,
          preemptible_tries = agg_preemptible_tries,
          gatk_docker = gatk_docker
       }

      if (use_dragen_hard_filtering) {
        call Calling.DragenHardFilterVcf as DragenHardFilterVcf {
          input:
            input_vcf = HaplotypeCallerGATK4.output_vcf,
            input_vcf_index = HaplotypeCallerGATK4.output_vcf_index,
            make_gvcf = make_gvcf,
            vcf_basename = base_file_name,
            preemptible_tries = agg_preemptible_tries,
            gatk_docker = gatk_docker
        }
      }

      # If bamout files were created, we need to sort and gather them into one bamout
      if (make_bamout) {
        call BamProcessing.SortSam as SortBamout {
          input:
            input_bam = HaplotypeCallerGATK4.bamout,
            output_bam_basename = final_vcf_base_name,
            preemptible_tries = agg_preemptible_tries,
            compression_level = 2,
            docker = picard_cloud_docker
        }
      }
    }

    File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, DragenHardFilterVcf.output_vcf, HaplotypeCallerGATK4.output_vcf])
    File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, DragenHardFilterVcf.output_vcf_index, HaplotypeCallerGATK4.output_vcf_index])
  }

  # Combine by-interval (g)VCFs into a single sample (g)VCF file
  String hard_filter_suffix = if use_dragen_hard_filtering then ".hard-filtered" else ""
  String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz"
  call Calling.MergeVCFs as MergeVCFs {
    input:
      input_vcfs = vcfs_to_merge,
      input_vcfs_indexes = vcf_indices_to_merge,
      output_vcf_name = final_vcf_base_name + hard_filter_suffix + merge_suffix,
      preemptible_tries = agg_preemptible_tries,
      docker = picard_cloud_docker
  }

  if (make_gvcf && !skip_reblocking) {
    call Calling.Reblock as Reblock {
      input:
        gvcf = MergeVCFs.output_vcf,
        gvcf_index = MergeVCFs.output_vcf_index,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        ref_dict = ref_dict,
        output_vcf_filename = basename(MergeVCFs.output_vcf, ".g.vcf.gz") + ".rb.g.vcf.gz",
        docker_path = gatk_docker
    }
  }

  if (make_bamout) {
    call MergeBamouts {
      input:
        bams = select_all(SortBamout.output_bam),
        output_base_name = final_vcf_base_name
    }
  }

  # Validate the (g)VCF output of HaplotypeCaller
  call QC.ValidateVCF as ValidateVCF {
    input:
      input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]),
      input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]),
      dbsnp_vcf = dbsnp_vcf,
      dbsnp_vcf_index = dbsnp_vcf_index,
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict,
      calling_interval_list = calling_interval_list,
      is_gvcf = make_gvcf,
      extra_args = if (skip_reblocking == false) then "--no-overlaps" else "",
      docker_path = gatk_docker,
      preemptible_tries = agg_preemptible_tries
  }

  # QC the (g)VCF
  call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics {
    input:
      input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]),
      input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]),
      metrics_basename = final_vcf_base_name,
      dbsnp_vcf = dbsnp_vcf,
      dbsnp_vcf_index = dbsnp_vcf_index,
      ref_dict = ref_dict,
      evaluation_interval_list = evaluation_interval_list,
      is_gvcf = make_gvcf,
      preemptible_tries = agg_preemptible_tries,
      docker = picard_cloud_docker
  }

  output {
    File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics
    File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics
    File output_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf])
    File output_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index])
    File? bamout = MergeBamouts.output_bam
    File? bamout_index = MergeBamouts.output_bam_index
  }
  meta {
    allowNestedInputs: true
  }
}

# This task is here because merging bamout files using Picard produces an error.
task MergeBamouts {

  input {
    Array[File] bams
    String output_base_name
  }

  Int disk_size = ceil(size(bams, "GiB") * 2) + 10

  command <<<
    samtools merge ~{output_base_name}.bam ~{sep=" " bams}
    samtools index ~{output_base_name}.bam
    mv ~{output_base_name}.bam.bai ~{output_base_name}.bai
  >>>

  output {
    File output_bam = "~{output_base_name}.bam"
    File output_bam_index = "~{output_base_name}.bai"
  }

  runtime {
    docker: "biocontainers/samtools:1.3.1"
    memory: "4 GiB"
    disks: "local-disk ~{disk_size} HDD"
    preemptible: 3
    cpu: 1
  }
}