version 1.0

workflow BuildIndices {
  input {
    # Genome source can be NCBI or GENCODE
    String genome_source
    # GTF annotation version refers to the version or release listed in the GTF
    String gtf_annotation_version
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    String organism

    File annotations_gtf
    File genome_fa
    File? biotypes                                # required only when run_modify_gtf=true for non-marmoset

    Boolean run_add_introns = false
    Boolean run_mitofinder = false
    Boolean run_modify_gtf

    String?  mito_accession                       # e.g. chimp or ferret mito accession (NC_…)
    File?    mito_ref_gbk                         # path to mitochondrion reference .gbk
    Array[String]? mitofinder_opts                # optional, override extra flags to MitoFinder/add_mito
    File?    annotations_gff                      # gff file for mitofinder
  }

  # version of this pipeline
  String pipeline_version = "5.1.0"

  parameter_meta {
    annotations_gtf: "the annotation file"
    genome_fa: "the fasta file"
    biotypes: "gene_biotype attributes to include in the gtf file; required when run_modify_gtf=true for non-marmoset organisms"
  }

  # ---- Mitofinder block (completely isolated; no effect when run_mitofinder = false) ----
  if (run_mitofinder) {
    call MitoAnnotate as annotate_with_mitofinder {
      input:
        mito_accession = select_first([mito_accession]), # check to make sure mito_accession is provided before running MitoAnnotate
        mito_ref_gbk   = select_first([mito_ref_gbk]), # check to make sure mito_ref_gbk is provided before running MitoAnnotate
        genome_fa      = genome_fa,
        transcript_gff = select_first([annotations_gff]),
        spec_name      = organism,
        mitofinder_opts = mitofinder_opts
    }

    call AppendMitoGTF as append_mito_gtf {
      input:
        original_gtf   = annotations_gtf,
        mito_gtf       = annotate_with_mitofinder.out_gtf
    }

    # Remove the old mito contig from the combined genome to avoid duplicates
    call RemoveDuplicateMitoContig {
      input:
        genome_fa      = annotate_with_mitofinder.out_fasta,
        mito_accession = select_first([mito_accession])
    }
  }

  # When run_mitofinder = true, use mito-processed files; otherwise pass through originals unchanged
  File final_genome_fa = select_first([RemoveDuplicateMitoContig.cleaned_fasta, genome_fa])
  File final_annotations_gtf = select_first([append_mito_gtf.out_gtf, annotations_gtf])

  # ---- Always-run GTF gene_name fix ----
  call FixGeneNames {
    input:
      annotation_gtf = final_annotations_gtf
  }

  # ---- Conditional GTF modification block ----
  Boolean is_marmoset = (organism == "marmoset" || organism == "Marmoset")

  # ---- Always-run GTF validation (non-marmoset only) ----
  # In the original monolithic BuildStarSingleNucleus, these checks ran
  # unconditionally for non-marmoset organisms—even when skip_gtf_modification
  # was true.  Extracting them here preserves that behaviour after the task
  # was split into separate steps.
  if (!is_marmoset) {
    call ValidateGTF {
      input:
        annotation_gtf = FixGeneNames.fixed_gtf,
        genome_source = genome_source,
        genome_build = genome_build
    }
  }

  if (run_modify_gtf && !is_marmoset) {
    call ModifyGTF {
      input:
        annotation_gtf = FixGeneNames.fixed_gtf,
        biotypes = select_first([biotypes])
    }
  }

  if (run_modify_gtf && is_marmoset) {
    call ModifyGTFMarmoset {
      input:
        annotation_gtf = FixGeneNames.fixed_gtf,
        organism = organism
    }
  }

  File gtf_for_star = select_first([ModifyGTF.modified_gtf, ModifyGTFMarmoset.modified_gtf, FixGeneNames.fixed_gtf])

  call BuildStarSingleNucleus {
    input:
      gtf_annotation_version = gtf_annotation_version,
      annotation_gtf = gtf_for_star,
      genome_fa = final_genome_fa,
      genome_build = genome_build,
      genome_source = genome_source,
      organism = organism,
      run_modify_gtf = run_modify_gtf
  }

  call CalculateChromosomeSizes {
    input:
      genome_fa = final_genome_fa
  }

  call BuildBWAreference {
    input:
      genome_fa = final_genome_fa,
      chrom_sizes_file = CalculateChromosomeSizes.chrom_sizes,
      genome_source = genome_source,
      genome_build = genome_build,
      gtf_annotation_version = gtf_annotation_version,
      organism = organism
  }

  # Centralize what goes into metadata
  # All optional outputs can be passed directly to select_all
  Array[File] recorded_inputs = select_all([
    annotations_gff,  # File? - already optional
    annotations_gtf,
    biotypes,
    genome_fa
  ])

  Array[File] recorded_outputs = select_all([
    annotate_with_mitofinder.out_fasta,  # File? from conditional block
    append_mito_gtf.out_gtf,              # File? from conditional block
    FixGeneNames.fixed_gtf,               # Always-run gene_name fix
    ModifyGTF.modified_gtf,               # File? from conditional block
    ModifyGTFMarmoset.modified_gtf,       # File? from conditional block
    BuildStarSingleNucleus.star_index,
    BuildStarSingleNucleus.modified_annotation_gtf,
    CalculateChromosomeSizes.chrom_sizes,
    BuildBWAreference.reference_bundle
  ])

  call RecordMetadata {
    input:
      pipeline_version = pipeline_version,
      organism = organism,
      genome_source = genome_source,
      genome_build = genome_build,
      gtf_annotation_version = gtf_annotation_version,
      run_mitofinder = run_mitofinder,
      run_modify_gtf = run_modify_gtf,
      run_add_introns = run_add_introns,
      is_marmoset = is_marmoset,
      input_files = recorded_inputs,
      output_files = recorded_outputs,
      input_annotations_gtf = annotations_gtf,
      input_genome_fa = genome_fa,
      mito_annotated_fasta = annotate_with_mitofinder.out_fasta,
      mito_appended_gtf = append_mito_gtf.out_gtf,
      fixed_gtf = FixGeneNames.fixed_gtf,
      modified_gtf = ModifyGTF.modified_gtf,
      modified_gtf_marmoset = ModifyGTFMarmoset.modified_gtf,
      star_annotation_gtf = BuildStarSingleNucleus.modified_annotation_gtf,
      star_index = BuildStarSingleNucleus.star_index
  }

  if (run_add_introns) {
    call SNSS2AddIntronsToGTF {
      input:
        modified_annotation_gtf = BuildStarSingleNucleus.modified_annotation_gtf,
        genome_fa = final_genome_fa
    }
  }

  output {
    File star_index_tar = BuildStarSingleNucleus.star_index
    String pipeline_version_out = "BuildIndices_v~{pipeline_version}"
    File snSS2_annotation_gtf_modified = BuildStarSingleNucleus.modified_annotation_gtf
    File reference_bundle = BuildBWAreference.reference_bundle
    File chromosome_sizes = CalculateChromosomeSizes.chrom_sizes
    File metadata = RecordMetadata.metadata_file
    File? snSS2_annotation_gtf_with_introns = SNSS2AddIntronsToGTF.modified_annotation_gtf_with_introns
    # Optional outputs from the mito step
    File? mito_annotated_fasta = annotate_with_mitofinder.out_fasta
    File? mito_annotated_gtf = append_mito_gtf.out_gtf
    File? star_index_with_introns = SNSS2AddIntronsToGTF.star_index_with_introns
  }
}

task MitoAnnotate {
  input {
    String mito_accession
    File   mito_ref_gbk
    File   genome_fa
    File   transcript_gff
    String spec_name
    Array[String]? mitofinder_opts
  }

  command <<<
    set -euo pipefail
    set -x

    # WDL/Cromwell already localizes File inputs; these are valid inside the container.
    GBK="~{mito_ref_gbk}"
    FA="~{genome_fa}"
    GFF_IN="~{transcript_gff}"

    # Make a temp dir in the container for any conversions/decompression
    tmpDir="$(mktemp -d)"
    chmod 777 "$tmpDir"

    echo "Container PWD: $(pwd)"
    echo "Checking mounted /cromwell_root (should exist in Cromwell docker backends):"
    ls -ld /cromwell_root || true

    echo "Preflight: show inputs"
    ls -l "$GBK" || (echo "Missing GBK at $GBK" && exit 1)
    ls -l "$FA"  || (echo "Missing FASTA at $FA" && exit 1)
    ls -l "$GFF_IN" || (echo "Missing transcript file at $GFF_IN" && exit 1)

    GFF="$GFF_IN"

    # If add_mito can't read .gz,  decompress to a working copy
    if gzip -t "$GFF_IN" 2>/dev/null; then
      echo "Decompressing transcript annotations to GFF (stream-safe)..."
      zcat -f "$GFF_IN" > "$tmpDir/transcripts.gff"
      GFF="$tmpDir/transcripts.gff"
    fi


    echo "Running add_mito with:"

    # Run add_mito using ONLY the localized, in-container paths
    add_mito \
      -a "~{mito_accession}" \
      -r "$GBK" \
      -g "$FA" \
      -t "$GFF" \
      -n "~{spec_name}" \
      ~{sep=' ' mitofinder_opts}


    FA_OUT=$(find /mnt/disks/cromwell_root -name '*_mito.fasta' | head -n1)
    GTF_OUT=$(find /mnt/disks/cromwell_root -name '*_mito.gtf' | head -n1)

    cp "$FA_OUT" ./genome_mito.fasta
    cp "$GTF_OUT" ./transcripts_mito.gtf

  >>>

  output {
    File out_fasta = "genome_mito.fasta"
    File out_gtf   = "transcripts_mito.gtf"
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/add_mito:1.0.0-1.4.2"
    memory: "8 GiB"
    disks: "local-disk 50 HDD"
    cpu: 2
  }
}

task AppendMitoGTF {
  input {
    File original_gtf
    File mito_gtf
  }

  command <<<
    set -euo pipefail

    # grep mitofinder in the mito_gtf and append those lines to the original gtf
    grep "mitofinder" ~{mito_gtf} > mito_only.gtf

    # Check if original GTF is gzipped and decompress if needed
    if [[ "~{original_gtf}" == *.gz ]]; then
      echo "Original GTF is gzipped, decompressing..."
      gunzip -c ~{original_gtf} > original_decompressed.gtf
      ORIGINAL_FILE="original_decompressed.gtf"
    else
      ORIGINAL_FILE="~{original_gtf}"
    fi

    # Concatenate the original GTF and the mito GTF
    echo "Combining GTF files..."
    cat "${ORIGINAL_FILE}" mito_only.gtf > combined_annotations.gtf

  >>>

  output {
    File out_gtf = "combined_annotations.gtf"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "2 GiB"
    disks: "local-disk 10 HDD"
    cpu: 1
  }
}

task RemoveDuplicateMitoContig {
  input {
    File genome_fa
    String mito_accession
  }

  command <<<
    set -euo pipefail

    cp ~{genome_fa} genome_input.fasta

    if grep -q "^>~{mito_accession}$" genome_input.fasta; then
      echo "Removing duplicate contig ~{mito_accession} from FASTA..."
      awk -v acc="~{mito_accession}" '
        BEGIN { deleted = 0 }
        $0 == ">" acc && deleted == 0 { deleted = 1; skip = 1; next }
        /^>/ { skip = 0 }
        !skip
      ' genome_input.fasta > cleaned_genome.fasta
    else
      echo "Contig ~{mito_accession} not found; no removal needed."
      cp genome_input.fasta cleaned_genome.fasta
    fi
  >>>

  output {
    File cleaned_fasta = "cleaned_genome.fasta"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "4 GiB"
    disks: "local-disk 50 HDD"
    cpu: 1
  }
}


task CalculateChromosomeSizes {
  input {
    File genome_fa
  }
  command <<<
    samtools faidx ~{genome_fa}
    cut -f1,2 "~{genome_fa}.fai" > chrom.sizes
  >>>
  runtime {
    docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616"
    preemptible: 3
    memory: "3 GiB"
    cpu: "1"
    disks: "local-disk 50 HDD"
  }
  output {
    File chrom_sizes = "chrom.sizes"
  }
}

task FixGeneNames {
  input {
    File annotation_gtf
  }

  meta {
    description: "Decompress GTF if needed and fix missing gene_name attributes by copying from gene_id"
  }

  command <<<
    set -eo pipefail

    # Decompress GTF if gzipped
    if [[ "~{annotation_gtf}" == *.gz ]]; then
        echo "Detected gzipped GTF file, decompressing..."
        gunzip -c ~{annotation_gtf} > annotation.gtf
        GTF_FILE="annotation.gtf"
    else
        echo "GTF file is not compressed"
        GTF_FILE="~{annotation_gtf}"
    fi

    # Fix missing gene_name attributes
    echo "Checking and fixing gene_name attributes in GTF..."
    awk -F'\t' 'BEGIN { OFS="\t" }
      /^#/ { print; next }
      {
        gene_id = ""; gene_name = "";
        if ($9 ~ /gene_id/) {
          n = split($9, a, /gene_id "/)
          if (n > 1) {
            split(a[2], b, "\"")
            gene_id = b[1]
          }
        }

        # Check if gene_name is missing and add it
        if ($9 !~ /gene_name/ && gene_id != "") {
          sub(/[[:space:]]*;[[:space:]]*$/, "", $9)  # remove trailing semicolons/spaces
          $9 = $9 "; gene_name \"" gene_id "\";"
        }

        print
      }' "$GTF_FILE" > fixed_annotation.gtf

    echo "GTF gene_name fix complete"
  >>>

  output {
    File fixed_gtf = "fixed_annotation.gtf"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "4 GiB"
    disks: "local-disk 50 HDD"
    cpu: 1
  }
}

task ValidateGTF {
  input {
    File annotation_gtf
    String genome_source
    String genome_build
  }

  meta {
    description: "Validate that the GTF header contains the expected genome build and source. Runs for non-marmoset organisms regardless of run_modify_gtf."
  }

  command <<<
    set -eo pipefail

    GTF_FILE="~{annotation_gtf}"

    # Check that GTF contains expected genome build
    if head -10 ${GTF_FILE} | grep -qi ~{genome_build}
    then
        echo Genome version found in the GTF file
    else
        echo Error: Input genome version does not match version in GTF file
        exit 1;
    fi

    # Check that GTF contains expected genome source
    if head -10 ${GTF_FILE} | grep -qi ~{genome_source}
    then
        echo Source of genome build identified in the GTF file
    else
        echo Error: Source of genome build not identified in the GTF file
        exit 1;
    fi

    echo "GTF validation passed"
  >>>

  output {
    Boolean validation_passed = true
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "2 GiB"
    disks: "local-disk 10 HDD"
    cpu: 1
  }
}

task BuildStarSingleNucleus {
  input {
    # GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
    String gtf_annotation_version
    # Genome source can be NCBI or GENCODE
    String genome_source
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    # Organism can be Macaque, Mouse, Human, etc.
    String organism
    File genome_fa
    File annotation_gtf
    Boolean run_modify_gtf
    Int disk = 100
  }

  meta {
    description: "Build reference index files for STAR aligner"
  }

  String gtf_prefix = if run_modify_gtf then "modified_" else ""
  String ref_name = "star2.7.10a-~{organism}-~{genome_source}-build-~{genome_build}-~{gtf_annotation_version}"
  String star_index_name = "~{gtf_prefix}~{ref_name}.tar"
  String annotation_gtf_modified = "~{gtf_prefix}v~{gtf_annotation_version}.annotation.gtf"

  command <<<
    # Decompress GTF if gzipped, otherwise copy to expected output name
    if [[ "~{annotation_gtf}" == *.gz ]]; then
        echo "Detected gzipped GTF file, decompressing..."
        gunzip -c ~{annotation_gtf} > ~{annotation_gtf_modified}
    else
        echo "GTF file is not compressed, copying..."
        cp ~{annotation_gtf} ~{annotation_gtf_modified}
    fi

    mkdir star
    STAR --runMode genomeGenerate \
    --genomeDir star \
    --genomeFastaFiles ~{genome_fa} \
    --sjdbGTFfile ~{annotation_gtf_modified} \
    --sjdbOverhang 100 \
    --runThreadN 16 \
    --limitGenomeGenerateRAM=43375752629

    tar -cvf ~{star_index_name} star
  >>>

  output {
    File star_index = star_index_name
    File modified_annotation_gtf = annotation_gtf_modified
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:2.1.0"
    memory: "64 GiB"
    disks: "local-disk ${disk} HDD"
    disk: disk + " GB" # TES
    cpu:"16"
  }
}

task ModifyGTF {
  input {
    File annotation_gtf
    File biotypes
  }

  meta {
    description: "Modify GTF annotation file for non-marmoset organisms using biotype filtering"
  }

  command <<<
    set -eo pipefail

    GTF_FILE="~{annotation_gtf}"

    # Run standard GTF modification
    echo "Running GTF modification"
    python3 /script/modify_gtf.py \
        --input-gtf ${GTF_FILE} \
        --output-gtf modified.annotation.gtf \
        --biotypes ~{biotypes}
  >>>

  output {
    File modified_gtf = "modified.annotation.gtf"
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:2.1.0"
    memory: "8 GiB"
    disks: "local-disk 50 HDD"
    cpu: 2
  }
}

task ModifyGTFMarmoset {
  input {
    File annotation_gtf
    String organism
  }

  meta {
    description: "Modify GTF annotation file for marmoset organisms"
  }

  command <<<
    set -eo pipefail

    GTF_FILE="~{annotation_gtf}"

    # Create marmoset header
    echo "Marmoset detected, running header modification"
    python3 /script/create_marmoset_header_mt_genes.py \
        ${GTF_FILE} > /cromwell_root/header.gtf

    # Run marmoset-specific GTF modification
    echo "Running marmoset GTF modification"
    python3 /script/modify_gtf_marmoset.py \
        --input-gtf /cromwell_root/header.gtf \
        --output-gtf modified.annotation.gtf \
        --species ~{organism}
  >>>

  output {
    File modified_gtf = "modified.annotation.gtf"
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:2.1.0"
    memory: "8 GiB"
    disks: "local-disk 50 HDD"
    cpu: 2
  }
}

task BuildBWAreference {
  input {
    File genome_fa
    File? chrom_sizes_file

    # GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
    String gtf_annotation_version
    # Genome source can be NCBI or GENCODE
    String genome_source
    # Genome build is the assembly accession (NCBI) or version (GENCODE)
    String genome_build
    # Organism can be Macaque, Mouse, Human, etc.
    String organism
  }

String reference_name = "bwa-mem2-2.2.1-~{organism}-~{genome_source}-build-~{genome_build}"

  command <<<
    mkdir genome
    mv ~{chrom_sizes_file} genome/chrom.sizes
    file=~{genome_fa}
    if [ ${file: -3} == ".gz" ]
      then
      gunzip -c ~{genome_fa} > genome/genome.fa
    else
      mv ~{genome_fa} genome/genome.fa
    fi

    bwa-mem2 index genome/genome.fa
    tar --dereference -cvf - genome/ > ~{reference_name}.tar
  >>>

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/samtools-bwa-mem-2:1.0.0-2.2.1_x64-linux-1685469504"
    memory: "96GB"
    disks: "local-disk 100 HDD"
    disk: "100 GB" # TES
    cpu: "4"
  }

  output {
    File reference_bundle = "~{reference_name}.tar"
  }
}


task RecordMetadata {
  input {
    String pipeline_version
    String organism
    String genome_source
    String genome_build
    String gtf_annotation_version
    Boolean run_mitofinder
    Boolean run_modify_gtf
    Boolean run_add_introns
    Boolean is_marmoset
    Array[File] input_files
    Array[File] output_files

    # Original inputs for tracking
    File input_annotations_gtf
    File input_genome_fa

    # Optional modification outputs for tracking which steps ran
    File? mito_annotated_fasta
    File? mito_appended_gtf
    File fixed_gtf
    File? modified_gtf
    File? modified_gtf_marmoset
    File star_annotation_gtf
    File star_index
  }

  command <<<
    set -euo pipefail

    # Helper: convert cromwell paths to gs:// paths
    to_gs() { echo "$1" | sed 's|^/mnt/disks/cromwell_root/|gs://|'; }

    # ---- Header ----
    echo "========================================" > metadata.txt
    echo "BuildIndices Pipeline Metadata" >> metadata.txt
    echo "========================================" >> metadata.txt
    echo "Pipeline Version: ~{pipeline_version}" >> metadata.txt
    echo "Date of Workflow Run: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> metadata.txt
    echo "" >> metadata.txt

    # ---- Reference Genome Info ----
    echo "Reference Genome Configuration:" >> metadata.txt
    echo "  Organism: ~{organism}" >> metadata.txt
    echo "  Genome Source: ~{genome_source}" >> metadata.txt
    echo "  Genome Build: ~{genome_build}" >> metadata.txt
    echo "  GTF Annotation Version: ~{gtf_annotation_version}" >> metadata.txt
    echo "  Is Marmoset: ~{is_marmoset}" >> metadata.txt
    echo "" >> metadata.txt

    # ---- Pipeline Options ----
    echo "Pipeline Options:" >> metadata.txt
    echo "  run_mitofinder: ~{run_mitofinder}" >> metadata.txt
    echo "  run_modify_gtf: ~{run_modify_gtf}" >> metadata.txt
    echo "  run_add_introns: ~{run_add_introns}" >> metadata.txt
    echo "" >> metadata.txt

    # ---- Modifications Applied ----
    echo "Modifications Applied:" >> metadata.txt

    # MitoFinder
    if [ "~{run_mitofinder}" = "true" ]; then
      echo "  [MitoFinder] Ran mitochondrial annotation" >> metadata.txt
      echo "    Input genome FASTA: $(to_gs '~{input_genome_fa}')" >> metadata.txt
      if [ -n "~{default='NONE' mito_annotated_fasta}" ] && [ "~{default='NONE' mito_annotated_fasta}" != "NONE" ]; then
        echo "    Output mito-annotated FASTA: $(to_gs '~{mito_annotated_fasta}')" >> metadata.txt
      fi
      echo "    Input annotations GTF: $(to_gs '~{input_annotations_gtf}')" >> metadata.txt
      if [ -n "~{default='NONE' mito_appended_gtf}" ] && [ "~{default='NONE' mito_appended_gtf}" != "NONE" ]; then
        echo "    Output mito-appended GTF: $(to_gs '~{mito_appended_gtf}')" >> metadata.txt
      fi
    else
      echo "  [MitoFinder] Skipped" >> metadata.txt
    fi

    # FixGeneNames (always runs)
    echo "  [FixGeneNames] Fixed missing gene_name attributes" >> metadata.txt
    echo "    Output fixed GTF: $(to_gs '~{fixed_gtf}')" >> metadata.txt

    # GTF Modification
    if [ "~{run_modify_gtf}" = "true" ]; then
      if [ "~{is_marmoset}" = "true" ]; then
        echo "  [ModifyGTFMarmoset] Ran marmoset-specific GTF modification" >> metadata.txt
        echo "    Input GTF: $(to_gs '~{input_annotations_gtf}')" >> metadata.txt
        if [ -n "~{default='NONE' modified_gtf_marmoset}" ] && [ "~{default='NONE' modified_gtf_marmoset}" != "NONE" ]; then
          echo "    Output modified GTF: $(to_gs '~{modified_gtf_marmoset}')" >> metadata.txt
        fi
      else
        echo "  [ModifyGTF] Ran standard GTF modification" >> metadata.txt
        echo "    Input GTF: $(to_gs '~{input_annotations_gtf}')" >> metadata.txt
        if [ -n "~{default='NONE' modified_gtf}" ] && [ "~{default='NONE' modified_gtf}" != "NONE" ]; then
          echo "    Output modified GTF: $(to_gs '~{modified_gtf}')" >> metadata.txt
        fi
      fi
    else
      echo "  [ModifyGTF] Skipped" >> metadata.txt
    fi

    # STAR Index
    echo "  [BuildStarSingleNucleus] Built STAR index" >> metadata.txt
    echo "    Input GTF: $(to_gs '~{star_annotation_gtf}')" >> metadata.txt
    echo "    Output STAR index: $(to_gs '~{star_index}')" >> metadata.txt

    echo "" >> metadata.txt

    # ---- Input Files ----
    echo "Input Files and their md5sums:" >> metadata.txt
    for file in ~{sep=" " input_files}; do
      gs_path=$(to_gs "$file")
      echo "  $gs_path : $(md5sum "$file" | awk '{print $1}')" >> metadata.txt
    done
    echo "" >> metadata.txt

    # ---- Output Files ----
    echo "Output Files and their md5sums:" >> metadata.txt
    for file in ~{sep=" " output_files}; do
      gs_path=$(to_gs "$file")
      echo "  $gs_path : $(md5sum "$file" | awk '{print $1}')" >> metadata.txt
    done
    echo "" >> metadata.txt

    # ---- Cromwell Execution Info ----
    file="~{output_files[0]}"
    workspace_bucket=$(echo $file | awk -F'/' '{print $3}')
    echo "Workspace Bucket: $workspace_bucket" >> metadata.txt

    submission_id=$(echo $file | awk -F'/' '{print $5}')
    echo "Submission ID: $submission_id" >> metadata.txt

    workflow_id=$(echo $file | awk -F'/' '{print $7}')
    echo "Workflow ID: $workflow_id" >> metadata.txt

    echo "" >> metadata.txt
  >>>

  output {
    File metadata_file = "metadata.txt"
  }

  runtime {
    docker: "ubuntu:20.04"
    memory: "5 GiB"
    disks: "local-disk 100 HDD"
    cpu: "1"
  }
}

task SNSS2AddIntronsToGTF {
  input {
    File modified_annotation_gtf
    File genome_fa
  }
    String basename = basename(modified_annotation_gtf, ".gtf")
    String star_index_name = "~{basename}_intron.tar"

  command <<<

  python3  /script/add-introns-to-gtf.py  \
    --input-gtf "~{modified_annotation_gtf}" \
    --output-gtf "~{basename}_with_introns.gtf"

  mkdir star
  STAR --runMode genomeGenerate \
  --genomeDir star \
  --genomeFastaFiles ~{genome_fa} \
  --sjdbGTFfile ~{basename}_with_introns.gtf \
  --sjdbOverhang 100 \
  --runThreadN 16

  tar -cvf ~{star_index_name} star
  >>>

  output {
    File modified_annotation_gtf_with_introns = "~{basename}_with_introns.gtf"
    File star_index_with_introns = star_index_name
  }

  runtime {
    docker: "us.gcr.io/broad-gotc-prod/build-indices:4.2.1"
    memory: "50 GiB"
    disks: "local-disk 100 HDD"
    disk: 100 + " GB" # TES
  }
}