version 1.0

workflow BuildIndices {
  input {
    # Genome source can be NCBI or GENCODE
    String genome_source
    # GTF annotation version refers to the version or release listed in the GTF
    String gtf_annotation_version
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    String organism

    File annotations_gtf
    File genome_fa
    File biotypes

    Boolean run_add_introns = false
    Boolean run_mitofinder = false
    Boolean skip_gtf_modification = false

    String?  mito_accession                       # e.g. chimp or ferret mito accession (NC_…)
    File?    mito_ref_gbk                         # path to mitochondrion reference .gbk
    Array[String]? mitofinder_opts                # optional, override extra flags to MitoFinder/add_mito
    File?    annotations_gff                       # gff file for mitofinder
  }

  if (false) {
    String? none = "None"
  }

  # version of this pipeline
  String pipeline_version = "5.0.3"


  parameter_meta {
    annotations_gtf: "the annotation file"
    genome_fa: "the fasta file"
    biotypes: "gene_biotype attributes to include in the gtf file"
  }

    # ---- Append mitochondrial sequence + annotations ----
    # Note: String comparison is case-sensitive.
    if (run_mitofinder) {
      call MitoAnnotate as annotate_with_mitofinder {
        input:
          mito_accession = select_first([mito_accession]),
          mito_ref_gbk   = select_first([mito_ref_gbk]),
          genome_fa      = genome_fa,
          transcript_gff = select_first([annotations_gff]),
          spec_name      = organism,
          mitofinder_opts = mitofinder_opts
      }
      call AppendMitoGTF as append_mito_gtf {
        input:
          original_gtf   = annotations_gtf,
          mito_gtf       = annotate_with_mitofinder.out_gtf
      }
    }

    call BuildStarSingleNucleus {
      input:
        gtf_annotation_version = gtf_annotation_version,
        annotation_gtf = select_first([if run_mitofinder then append_mito_gtf.out_gtf else none, annotations_gtf]),
        genome_fa = select_first([if run_mitofinder then annotate_with_mitofinder.out_fasta else none, genome_fa]),
        biotypes = biotypes,
        genome_build = genome_build,
        genome_source = genome_source,
        organism = organism,
        skip_gtf_modification = skip_gtf_modification,
        mito_accession = select_first([mito_accession])
    }
    call CalculateChromosomeSizes {
      input:
        genome_fa = select_first([if run_mitofinder then annotate_with_mitofinder.out_fasta else none, genome_fa]),
    }
    call BuildBWAreference {
      input:
        genome_fa = select_first([if run_mitofinder then annotate_with_mitofinder.out_fasta else none, genome_fa]),
        chrom_sizes_file = CalculateChromosomeSizes.chrom_sizes,
        genome_source = genome_source,
        genome_build = genome_build,
        gtf_annotation_version = gtf_annotation_version,
        organism = organism,
        mito_accession = select_first([mito_accession])
    }

    call RecordMetadata {
      input:
        pipeline_version = pipeline_version,
        was_mitofinder_run = run_mitofinder,
        organism = organism,
        mito_accession_used = mito_accession,
        mito_ref_gbk_used = mito_ref_gbk,
        mitofinder_opts_used = mitofinder_opts,
        input_files = select_all([if run_mitofinder then annotations_gff else none, annotations_gtf, biotypes, genome_fa]),
        output_files = select_all([
                                  if run_mitofinder then annotate_with_mitofinder.out_fasta else none,
                                  if run_mitofinder then append_mito_gtf.out_gtf else none,
                                  BuildStarSingleNucleus.star_index,
                                  BuildStarSingleNucleus.modified_annotation_gtf,
                                  CalculateChromosomeSizes.chrom_sizes,
                                  BuildBWAreference.reference_bundle
                                  ])
    }

  if (run_add_introns) {
    call SNSS2AddIntronsToGTF {
      input:
      modified_annotation_gtf = BuildStarSingleNucleus.modified_annotation_gtf,
      genome_fa = genome_fa
    }
  }

  output {
    File star_index_tar = BuildStarSingleNucleus.star_index
    String pipeline_version_out = "BuildIndices_v~{pipeline_version}"
    File snSS2_annotation_gtf_modified = BuildStarSingleNucleus.modified_annotation_gtf
    File reference_bundle = BuildBWAreference.reference_bundle
    File chromosome_sizes = CalculateChromosomeSizes.chrom_sizes
    File metadata = RecordMetadata.metadata_file
    File? snSS2_annotation_gtf_with_introns = SNSS2AddIntronsToGTF.modified_annotation_gtf_with_introns
    # Optional outputs from the mito step
    File? mito_annotated_fasta = annotate_with_mitofinder.out_fasta
    File? mito_annotated_gtf = append_mito_gtf.out_gtf
    File? star_index_with_introns = SNSS2AddIntronsToGTF.star_index_with_introns
  }
}

task MitoAnnotate {
  input {
    String mito_accession
    File   mito_ref_gbk
    File   genome_fa
    File   transcript_gff
    String spec_name
    Array[String]? mitofinder_opts
  }

  command <<<
    set -euo pipefail
    set -x

    # WDL/Cromwell already localizes File inputs; these are valid inside the container.
    GBK="~{mito_ref_gbk}"
    FA="~{genome_fa}"
    GFF_IN="~{transcript_gff}"

    # Make a temp dir in the container for any conversions/decompression
    tmpDir="$(mktemp -d)"
    chmod 777 "$tmpDir"

    echo "Container PWD: $(pwd)"
    echo "Checking mounted /cromwell_root (should exist in Cromwell docker backends):"
    ls -ld /cromwell_root || true

    echo "Preflight: show inputs"
    ls -l "$GBK" || (echo "Missing GBK at $GBK" && exit 1)
    ls -l "$FA"  || (echo "Missing FASTA at $FA" && exit 1)
    ls -l "$GFF_IN" || (echo "Missing transcript file at $GFF_IN" && exit 1)

    GFF="$GFF_IN"

    # If add_mito can't read .gz,  decompress to a working copy
    if gzip -t "$GFF_IN" 2>/dev/null; then
      echo "Decompressing transcript annotations to GFF (stream-safe)..."
      zcat -f "$GFF_IN" > "$tmpDir/transcripts.gff"
      GFF="$tmpDir/transcripts.gff"
    fi


    echo "Running add_mito with:"

    # Run add_mito using ONLY the localized, in-container paths
    add_mito \
      -a "~{mito_accession}" \
      -r "$GBK" \
      -g "$FA" \
      -t "$GFF" \
      -n "~{spec_name}" \
      ~{sep=' ' mitofinder_opts}


    FA_OUT=$(find /mnt/disks/cromwell_root -name '*_mito.fasta' | head -n1)
    GTF_OUT=$(find /mnt/disks/cromwell_root -name '*_mito.gtf' | head -n1)

    cp "$FA_OUT" ./genome_mito.fasta
    cp "$GTF_OUT" ./transcripts_mito.gtf

  >>>

  output {
    File out_fasta = "genome_mito.fasta"
    File out_gtf   = "transcripts_mito.gtf"
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/add_mito:1.0.0-1.4.2"
    memory: "8 GiB"
    disks: "local-disk 50 HDD"
    cpu: 2
  }
}

task AppendMitoGTF {
  input {
    File original_gtf
    File mito_gtf
  }

  command <<<
    set -euo pipefail

    # grep mitofinder in the mito_gtf and append those lines to the original gtf
    grep "mitofinder" ~{mito_gtf} > mito_only.gtf

    # Check if original GTF is gzipped and decompress if needed
    if [[ "~{original_gtf}" == *.gz ]]; then
      echo "Original GTF is gzipped, decompressing..."
      gunzip -c ~{original_gtf} > original_decompressed.gtf
      ORIGINAL_FILE="original_decompressed.gtf"
    else
      ORIGINAL_FILE="~{original_gtf}"
    fi

    # Concatenate the original GTF and the mito GTF
    echo "Combining GTF files..."
    cat "${ORIGINAL_FILE}" mito_only.gtf > combined_annotations.gtf

  >>>

  output {
    File out_gtf = "combined_annotations.gtf"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "2 GiB"
    disks: "local-disk 10 HDD"
    cpu: 1
  }
}


task CalculateChromosomeSizes {
  input {
    File genome_fa
  }
  command <<<
    samtools faidx ~{genome_fa}
    cut -f1,2 "~{genome_fa}.fai" > chrom.sizes
  >>>
  runtime {
    docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616"
    preemptible: 3
    memory: "3 GiB"
    cpu: "1"
    disks: "local-disk 50 HDD"
  }
  output {
    File chrom_sizes = "chrom.sizes"
  }
}

task BuildStarSingleNucleus {
  input {
    # GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
    String gtf_annotation_version
    # Genome source can be NCBI or GENCODE
    String genome_source
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    # Organism can be Macaque, Mouse, Human, etc.
    String organism
    File genome_fa
    File annotation_gtf
    File biotypes
    Boolean skip_gtf_modification
    Int disk = 100
    String? mito_accession
  }

  meta {
    description: "Modify GTF files and build reference index files for STAR aligner"
  }

  String gtf_prefix = if skip_gtf_modification then "" else "modified_"
  String ref_name = "star2.7.10a-~{organism}-~{genome_source}-build-~{genome_build}-~{gtf_annotation_version}"
  String star_index_name = "~{gtf_prefix}~{ref_name}.tar"
  String annotation_gtf_modified = "~{gtf_prefix}v~{gtf_annotation_version}.annotation.gtf"

  command <<<
    # Decompress GTF if it's gzipped
    if [[ "~{annotation_gtf}" == *.gz ]]; then
        echo "Detected gzipped GTF file, decompressing..."
        gunzip -c ~{annotation_gtf} > annotation.gtf
        GTF_FILE="annotation.gtf"
    else
        echo "GTF file is not compressed"
        GTF_FILE="~{annotation_gtf}"
    fi

    # Fix missing gene_name attributes
    echo "Checking and fixing gene_name attributes in GTF..."
    awk -F'\t' 'BEGIN { OFS="\t" }
      /^#/ { print; next }
      {
        gene_id = ""; gene_name = "";
        if ($9 ~ /gene_id/) {
          n = split($9, a, /gene_id "/)
          if (n > 1) {
            split(a[2], b, "\"")
            gene_id = b[1]
          }
        }

        # Check if gene_name is missing and add it
        if ($9 !~ /gene_name/ && gene_id != "") {
          sub(/[[:space:]]*;[[:space:]]*$/, "", $9)  # remove trailing semicolons/spaces
          $9 = $9 "; gene_name \"" gene_id "\";"
        }

        print
      }' "$GTF_FILE" > fixed_annotation.gtf

    # Use the fixed GTF for downstream processing
    GTF_FILE="fixed_annotation.gtf"
    echo "GTF gene_name fix complete"

    # First check for marmoset GTF and modify header
    echo "checking for marmoset"
    if [[ "~{organism}" == "marmoset" || "~{organism}" == "Marmoset" ]]
    then
        echo "marmoset is detected, running header modification"
        python3 /script/create_marmoset_header_mt_genes.py \
            ${GTF_FILE} > "/cromwell_root/header.gtf"
    else
        echo "marmoset is not detected"

        # Check that input GTF files contain input genome source, genome build version, and annotation version
        if head -10 ${GTF_FILE} | grep -qi ~{genome_build}
        then
            echo Genome version found in the GTF file
        else
            echo Error: Input genome version does not match version in GTF file
            exit 1;
        fi

        # Check that GTF file contains correct build source info in the first 10 lines of the GTF
        if head -10 ${GTF_FILE} | grep -qi ~{genome_source}
        then
            echo Source of genome build identified in the GTF file
        else
            echo Error: Source of genome build not identified in the GTF file
            exit 1;
        fi
        set -eo pipefail
    fi

    if [ "~{skip_gtf_modification}" = "false" ]; then
        if [[ "~{organism}" == "marmoset" || "~{organism}" == "Marmoset" ]]
        then
            echo "marmoset detected, running marmoset GTF modification"
            echo "Listing files to check for head.gtf"
            ls
            python3 /script/modify_gtf_marmoset.py \
                --input-gtf "/cromwell_root/header.gtf" \
                --output-gtf ~{annotation_gtf_modified} \
                --species ~{organism}
            echo "listing files, should see modified gtf"
            ls
        else
            echo "running GTF modification for non-marmoset"
            python3 /script/modify_gtf.py \
                --input-gtf ${GTF_FILE} \
                --output-gtf ~{annotation_gtf_modified} \
                --biotypes ~{biotypes}
        fi
    else
        echo "Skipping GTF modification — using original GTF for STAR index"
        cp ${GTF_FILE} ~{annotation_gtf_modified}
    fi

    # --- Remove duplicate mito contig if mito_accession is set
    if [ -n "~{mito_accession}" ]; then
      echo "mito_accession provided: ~{mito_accession}"

      if grep -q "^>~{mito_accession}$" ~{genome_fa}; then
        echo "Removing duplicate contig ~{mito_accession} from FASTA..."

        awk -v acc="~{mito_accession}" '
          BEGIN { deleted = 0 }
          $0 == ">" acc && deleted == 0 { deleted = 1; skip = 1; next }
          /^>/ { skip = 0 }
          !skip
          ' ~{genome_fa} > genome_mito.filtered.fasta
        mv genome_mito.filtered.fasta ~{genome_fa}
      else
        echo "Contig ~{mito_accession} not found; skipping removal."
      fi
    else
        echo "No mito_accession provided, skipping contig removal."
    fi

    mkdir star
    STAR --runMode genomeGenerate \
    --genomeDir star \
    --genomeFastaFiles ~{genome_fa} \
    --sjdbGTFfile ~{annotation_gtf_modified} \
    --sjdbOverhang 100 \
    --runThreadN 16 \
    --limitGenomeGenerateRAM=43375752629

    tar -cvf ~{star_index_name} star
  >>>

  output {
    File star_index = star_index_name
    File modified_annotation_gtf = annotation_gtf_modified
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:2.1.0"
    memory: "64 GiB"
    disks: "local-disk ${disk} HDD"
    disk: disk + " GB" # TES
    cpu:"16"
  }
}

task BuildBWAreference {
  input {
    File genome_fa
    File? chrom_sizes_file

    # GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
    String gtf_annotation_version
    # Genome source can be NCBI or GENCODE
    String genome_source
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    # Organism can be Macaque, Mouse, Human, etc.
    String organism
    String? mito_accession
  }

String reference_name = "bwa-mem2-2.2.1-~{organism}-~{genome_source}-build-~{genome_build}"

  command <<<
    mkdir genome
    mv ~{chrom_sizes_file} genome/chrom.sizes
    file=~{genome_fa}
    if [ ${file: -3} == ".gz" ]
      then
      gunzip -c ~{genome_fa} > genome/genome.fa
    else
      mv ~{genome_fa} genome/genome.fa
    fi

    # --- Remove duplicate contig if mito_accession is provided ---
    if [ -n "~{mito_accession}" ]; then
      echo "mito_accession provided: ~{mito_accession}"

      if grep -q "^>~{mito_accession}$" genome/genome.fa; then
        echo "Removing duplicate contig ~{mito_accession} from FASTA..."

        awk -v acc="~{mito_accession}" '
          BEGIN { deleted = 0 }
          $0 == ">" acc && deleted == 0 { deleted = 1; skip = 1; next }
          /^>/ { skip = 0 }
          !skip
        ' genome/genome.fa > genome/genome.filtered.fa

        mv genome/genome.filtered.fa genome/genome.fa
      else
        echo "Contig ~{mito_accession} not found in genome.fa, no removal needed."
      fi
    else
        echo "No mito_accession provided, skipping contig removal."
    fi

    bwa-mem2 index genome/genome.fa
    tar --dereference -cvf - genome/ > ~{reference_name}.tar
  >>>

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/samtools-bwa-mem-2:1.0.0-2.2.1_x64-linux-1685469504"
    memory: "96GB"
    disks: "local-disk 100 HDD"
    disk: "100 GB" # TES
    cpu: "4"
  }

  output {
    File reference_bundle = "~{reference_name}.tar"
  }
}


task RecordMetadata {
  input {
    String pipeline_version
    Array[File] input_files
    Array[File] output_files
    # New inputs for logging mito info
    Boolean was_mitofinder_run
    String organism
    String? mito_accession_used
    File? mito_ref_gbk_used
    Array[String]? mitofinder_opts_used
  }

  command <<<
    set -euo pipefail

    # create metadata file
    echo "Pipeline Version: ~{pipeline_version}" > metadata.txt
    echo "Date of Workflow Run: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> metadata.txt
    echo "" >> metadata.txt

    echo "--- MitoFinder Details ---" >> metadata.txt
    # Check if the boolean flag is true
    if [[ "~{was_mitofinder_run}" == "true" ]]; then
      echo "MitoFinder was run for organism: ~{organism}" >> metadata.txt
      # Check for and report the specific parameters used, handling optional inputs.
      if [ "~{mito_accession_used}" != "" ]; then
        echo "Mitochondrial Accession: ~{mito_accession_used}" >> metadata.txt
      fi
      if [ "~{mito_ref_gbk_used}" != "" ]; then
        echo "Mitochondrial Reference GBK: ~{mito_ref_gbk_used} (md5sum: $(md5sum "~{mito_ref_gbk_used}" | awk '{print $1}'))" >> metadata.txt
      fi
      if [ "~{sep=' ' mitofinder_opts_used}" != "" ]; then
        echo "MitoFinder Extra Options: ~{sep=' ' mitofinder_opts_used}" >> metadata.txt
      fi
    else
      echo "MitoFinder was not run." >> metadata.txt
    fi
    echo "" >> metadata.txt

    # echo paths and md5sums for input files
    echo "Input Files and their md5sums:" >> metadata.txt
    for file in ~{sep=" " input_files}; do
      gs_path=$(echo "$file" | sed 's|^/mnt/disks/cromwell_root/|gs://|')
      echo "$gs_path : $(md5sum "$file" | awk '{print $1}')" >> metadata.txt
    done
    echo "" >> metadata.txt

    # echo paths and md5sums for input files
    echo "Output Files and their md5sums:" >> metadata.txt
    for file in ~{sep=" " output_files}; do
      gs_path=$(echo "$file" | sed 's|^/mnt/disks/cromwell_root/|gs://|')
      echo "$gs_path : $(md5sum "$file" | awk '{print $1}')" >> metadata.txt
    done
    echo "" >> metadata.txt

    # grab workspace bucket
    file="~{output_files[0]}"
    workspace_bucket=$(echo $file | awk -F'/' '{print $3}')
    echo "Workspace Bucket: $workspace_bucket" >> metadata.txt

    # grab submission ID
    submission_id=$(echo $file | awk -F'/' '{print $5}')
    echo "Submission ID: $submission_id" >> metadata.txt

    # grab workflow ID
    workflow_id=$(echo $file | awk -F'/' '{print $7}')
    echo "Workflow ID: $workflow_id" >> metadata.txt

    echo "" >> metadata.txt
  >>>

  output {
    File metadata_file = "metadata.txt"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "5 GiB"
    disks: "local-disk 100 HDD"
    cpu: "1"
  }
}

  task SNSS2AddIntronsToGTF {
  input {
    File modified_annotation_gtf
    File genome_fa
  }
    String basename = basename(modified_annotation_gtf, ".gtf")
    String star_index_name = "~{basename}_intron.tar"

  command <<<

  python3  /script/add-introns-to-gtf.py  \
    --input-gtf "~{modified_annotation_gtf}" \
    --output-gtf "~{basename}_with_introns.gtf"

  mkdir star
  STAR --runMode genomeGenerate \
  --genomeDir star \
  --genomeFastaFiles ~{genome_fa} \
  --sjdbGTFfile ~{basename}_with_introns.gtf \
  --sjdbOverhang 100 \
  --runThreadN 16

  tar -cvf ~{star_index_name} star
  >>>

  output {
    File modified_annotation_gtf_with_introns = "~{basename}_with_introns.gtf"
    File star_index_with_introns = star_index_name
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:4.2.1"
    memory: "50 GiB"
    disks: "local-disk 100 HDD"
    disk: 100 + " GB" # TES
  }
}