version 1.0

##########################################################################################
# LRAA Single-Cell RNA-seq Workflow
##########################################################################################
#
# This workflow performs isoform discovery and/or quantification on single-cell long-read
# RNA-seq data with two primary execution modes and multiple optimization paths.
#
# ===== EXECUTION MODES =====
#
# 1. BASIC MODE (single_cell_pipe_mode = "basic")
#    - Runs initial discovery on the full BAM to generate a preliminary GTF
#    - Builds single-cell sparse matrices (gene, isoform, splice pattern) from read tracking
#    - Optionally filters good cells from empty droplets
#    - Clusters cells using Seurat on gene-level expression
#    - Outputs: initial GTF, single-cell matrices, cluster assignments, UMAP visualizations
#    - Use when: You want a quick initial analysis without per-cluster refinement
#
# 2. CLUSTER-GUIDED MODE (single_cell_pipe_mode = "cluster-guided")
#    - Performs all basic mode steps PLUS:
#    - Partitions BAM by cell clusters
#    - Re-runs LRAA discovery independently on each cluster's reads
#    - Merges cluster-specific GTFs into a refined final GTF
#    - Re-quantifies all cells against the final GTF
#    - Builds final single-cell sparse matrices with improved isoform definitions
#    - Outputs: refined final GTF, improved single-cell matrices, per-cluster pseudobulk data
#    - Use when: You want the highest quality isoform definitions tailored to cell types
#
# ===== QUANTIFICATION-ONLY MODE =====
#
# Set quant_only = true to skip isoform discovery and quantify against a known annotation:
#    - In BASIC mode with quant_only = true:
#      * Requires: initial_annot_gtf (the annotation to quantify against)
#      * Skips: Initial discovery phase
#      * Runs: Quantification, matrix building, cell filtering, clustering
#      * Outputs: Single-cell matrices and clusters based on the provided annotation
#
#    - In CLUSTER-GUIDED mode with quant_only = true:
#      * Requires: initial_annot_gtf
#      * Skips: Both initial discovery AND per-cluster discovery
#      * Runs: Clustering (if not precomputed), per-cluster quantification only
#      * Outputs: Single-cell matrices from cluster-guided quantification
#
# ===== PRECOMPUTED INPUTS (OPTIMIZATION PATHS) =====
#
# You can skip expensive computation steps by providing precomputed files from prior runs:
#
# 1. Skip Initial Discovery:
#    - Provide: precomputed_init_gtf + precomputed_init_quant_tracking
#    - Effect: Skips LRAA_init, uses your GTF and tracking files directly
#    - Use case: Re-run clustering with different parameters without re-discovering isoforms
#
# 2. Skip Initial Discovery + Matrix Building + Clustering:
#    - Provide: precomputed_init_gtf + precomputed_cluster_assignments_tsv
#    - Effect: Jumps directly to cluster-guided refinement (if mode = "cluster-guided")
#             or just outputs the precomputed results (if mode = "basic")
#    - Use case: Test cluster-guided refinement with different discovery parameters
#
# 3. Kickstart Cluster-Guided Mode:
#    - Provide: precomputed_init_gtf + precomputed_cluster_assignments_tsv
#    - Set: single_cell_pipe_mode = "cluster-guided"
#    - Flow: Uses precomputed GTF as the annotation for cluster-guided discovery
#            (passed as annot_gtf to guide per-cluster LRAA runs)
#    - Use case: You have initial results and want refined cluster-specific isoforms
#               without re-running the expensive initial discovery
#
# 4. Cluster-Guided Quantification-Only:
#    - Provide: precomputed_cluster_assignments_tsv + initial_annot_gtf
#    - Set: single_cell_pipe_mode = "cluster-guided", quant_only = true
#    - Flow: Skips all discovery, quantifies each cluster against initial_annot_gtf
#    - Use case: Quantify known isoforms with cluster-aware quantification
#
# ===== GENE SYMBOL INCORPORATION =====
#
# Optionally provide ref_annot_gtf_source_gene_symbols to incorporate gene symbols:
#    - Runs gffcompare between LRAA GTF and reference annotation
#    - Updates all single-cell matrices with gene symbols from matched loci
#    - Only runs when: All required sparse matrices are available (from either basic or
#                      cluster-guided mode, and not skipped due to missing precomputed inputs)
#
# ===== TYPICAL WORKFLOWS =====
#
# Quick exploration:
#   single_cell_pipe_mode = "basic"
#   → Get initial GTF, matrices, and clusters quickly
#
# High-quality production run:
#   single_cell_pipe_mode = "cluster-guided"
#   → Full pipeline with cluster-specific isoform refinement
#
# Iterative refinement:
#   1st run: mode = "basic" (save init GTF and cluster assignments)
#   2nd run: mode = "cluster-guided" + precomputed files
#   → Skip expensive initial phase, focus on cluster-specific refinement
#
# Quantify known annotation:
#   quant_only = true + initial_annot_gtf
#   → Fast quantification without discovery
#
##########################################################################################

import "LRAA.wdl" as LRAA
import "subwdls/LRAA-build_sparse_matrices_from_tracking.wdl" as BuildMatrices
import "subwdls/LRAA-filter_good_cells.wdl" as FilterCells
import "subwdls/LRAA-gene_sparseM_to_seurat_clusters.wdl" as Seurat
import "subwdls/Incorporate_gene_symbols.wdl" as GeneSymbols
import "LRAA-cell_cluster_guided.wdl" as ClusterGuided

workflow LRAA_singlecell_wf {
  input {
    # Core inputs
    String sample_id
    File referenceGenome
    File inputBAM

    # Optional: guide initial discovery with an annotation
    File? initial_annot_gtf

    # Single-cell pipeline mode: 'basic' or 'cluster-guided'
    String single_cell_pipe_mode = "basic"
    
    # Quantification-only mode: skip discovery, requires initial_annot_gtf
    Boolean quant_only = false

    # Platform/options
    Boolean HiFi = false
    String oversimplify = "chrM"   # e.g., "chrM" or "chrM,M"
    String main_chromosomes = ""  # if empty, runs without partitioning
    String? region                 # e.g., "chr1:100000-200000"; forces direct mode

    # Optional: reuse outputs from a prior initial discovery run and skip LRAA_init
    File? precomputed_init_quant_tracking
    File? precomputed_init_gtf
    
    # Optional: provide precomputed cluster assignments to skip steps 1-3
    File? precomputed_cluster_assignments_tsv

    # Single-cell barcode and UMI tags
    String cell_barcode_tag = "CB"
    String read_umi_tag = "XM"

    # Resources and docker (propagated to subcalls where applicable)
    Int numThreadsPerWorker = 5
    Int numThreadsPerWorkerScattered = 5
    Int num_parallel_contigs = 3
    Int memoryGB = 64
    Int memoryGBPerWorkerScattered = 32
    Int memoryGBbuildSparseMatrices = 32
    Int memoryGBFilterCells = 32
    Int memoryGBSeurat = 32
    Int memoryGBmergeGTFs = 32
    Int memoryGBquantFinal = 32
    Int memoryGBscSparseMatrices = 32
    Int diskSizeGB = 256
    String docker = "us-central1-docker.pkg.dev/methods-dev-lab/lraa/lraa:latest"

    # Seurat clustering parameters (forwarded to Seurat subworkflow)
    Int min_cells = 10
    Int min_features = 1000
    Float percent_mt_max = 20.0
    String mt_pattern = "^MT-"
    Int npcs = 12
    Float resolution = 0.6
    Int n_variable_features = 2000
    Int seed = 1

    # Filter good cells parameters
    Boolean enable_filter_good_cells = true
    Float fdr_threshold = 0.01
    Int? lower_threshold

    File? ref_annot_gtf_source_gene_symbols # used as source for gene symbol assignment at the end.
  }

  # Only skip the initial discovery call when the downstream-critical artifacts are provided
  Boolean has_precomputed_init = defined(precomputed_init_gtf) && (defined(precomputed_init_quant_tracking) || defined(precomputed_cluster_assignments_tsv))
  Boolean has_precomputed_clusters = defined(precomputed_cluster_assignments_tsv)
  Boolean run_initial_phase = !has_precomputed_init
  Boolean run_clustering_phase = !has_precomputed_clusters
  Boolean run_cluster_guided = single_cell_pipe_mode == "cluster-guided"

  # 1) Initial transcript discovery + quantification on the full BAM (skipped when precomputed inputs are provided)
  if (run_initial_phase) {
    call LRAA.LRAA_wf as LRAA_init {
      input:
        sample_id = sample_id,
        referenceGenome = referenceGenome,
        inputBAM = inputBAM,
        annot_gtf = initial_annot_gtf,
        HiFi = HiFi,
        oversimplify = oversimplify,
        main_chromosomes = main_chromosomes,
        region = region,
        cell_barcode_tag = cell_barcode_tag,
        read_umi_tag = read_umi_tag,
        numThreadsPerWorker = numThreadsPerWorker,
        numThreadsPerWorkerScattered = numThreadsPerWorkerScattered,
        num_parallel_contigs = num_parallel_contigs,
        memoryGB = memoryGB,
        memoryGBPerWorkerScattered = memoryGBPerWorkerScattered,
        diskSizeGB = diskSizeGB,
        docker = docker,
        quant_only = quant_only
    }
  }

  File? init_quant_expr_file = LRAA_init.mergedQuantExpr
  File? init_quant_tracking_generated = LRAA_init.mergedQuantTracking
  File? init_quant_tracking_file = if (!run_initial_phase && defined(precomputed_init_quant_tracking)) then select_first([precomputed_init_quant_tracking]) else init_quant_tracking_generated
  File? init_gtf_generated = LRAA_init.mergedGTF
  File? init_gtf_file = if (!run_initial_phase && defined(precomputed_init_gtf)) then select_first([precomputed_init_gtf]) else init_gtf_generated

  # 2) Build single-cell sparse matrices from the initial tracking (skipped when precomputed clusters are provided)
  if (run_clustering_phase) {
    call BuildMatrices.BuildSparseMatricesFromTracking as build_sc_from_init_tracking {
      input:
        sample_id = sample_id,
        tracking_file = select_first([init_quant_tracking_file]),
        docker = docker,
        memoryGB = memoryGBbuildSparseMatrices
    }

    # 2.5) Filter good cells from the gene-level sparse matrix (optional)
    if (enable_filter_good_cells) {
      call FilterCells.FilterGoodCells as filter_good_cells {
        input:
          sample_id = sample_id,
          gene_sparse_tar_gz = build_sc_from_init_tracking.gene_sparse_dir_tgz,
          isoform_sparse_tar_gz = build_sc_from_init_tracking.isoform_sparse_dir_tgz,
          splice_pattern_sparse_tar_gz = build_sc_from_init_tracking.splice_pattern_sparse_dir_tgz,
          docker = docker,
          memoryGB = memoryGBFilterCells,
          fdr_threshold = fdr_threshold,
          lower_threshold = lower_threshold
      }
    }

    # 3) Cluster cells from the gene-level sparse matrix (filtered or unfiltered)
    File gene_sparse_for_clustering = if enable_filter_good_cells then select_first([filter_good_cells.filtered_gene_sparse_tar_gz]) else build_sc_from_init_tracking.gene_sparse_dir_tgz
    
    call Seurat.GeneSparseM_To_SeuratClusters as cluster_cells {
      input:
        sample_id = sample_id,
        gene_sparse_tar_gz = gene_sparse_for_clustering,
        docker = docker,
        memoryGB = memoryGBSeurat,
        min_cells = min_cells,
        min_features = min_features,
        percent_mt_max = percent_mt_max,
        mt_pattern = mt_pattern,
        npcs = npcs,
        resolution = resolution,
        n_variable_features = n_variable_features,
        seed = seed
    }
  }

  # Select cluster assignments: use precomputed if provided, otherwise use generated
  File? cluster_assignments_generated = cluster_cells.cluster_assignments_tsv
  File cluster_assignments_file = if (has_precomputed_clusters) then select_first([precomputed_cluster_assignments_tsv]) else select_first([cluster_assignments_generated])

  # 4) Cluster-guided reconstruction + final single-cell matrices (only if mode is 'cluster-guided')
  if (run_cluster_guided) {
    call ClusterGuided.LRAA_cell_cluster_guided as cluster_guided {
      input:
        sample_id = sample_id,
        referenceGenome = referenceGenome,
        inputBAM = inputBAM,
        cell_clusters_info = cluster_assignments_file,
        annot_gtf = if quant_only then initial_annot_gtf else init_gtf_file,
        HiFi = HiFi,
        oversimplify = oversimplify,
        main_chromosomes = main_chromosomes,
        cell_barcode_tag = cell_barcode_tag,
        read_umi_tag = read_umi_tag,
        numThreadsPerWorker = numThreadsPerWorker,
        numThreadsPerWorkerScattered = numThreadsPerWorkerScattered,
        num_parallel_contigs = num_parallel_contigs,
        memoryGB = memoryGB,
        memoryGBPerWorkerScattered = memoryGBPerWorkerScattered,
        memoryGBmergeGTFs = memoryGBmergeGTFs,
        memoryGBquantFinal = memoryGBquantFinal,
        memoryGBscSparseMatrices = memoryGBscSparseMatrices,
        diskSizeGB = diskSizeGB,
        docker = docker,
        quant_only_cluster_guided = quant_only
    }
  }

  # 5) Incorporate gene symbols: use cluster-guided outputs if available, otherwise use filtered (if enabled) or unfiltered good cell outputs
  # In quant_only mode, these GTF values will naturally be undefined since discovery doesn't produce them
  File? gtf_for_symbols = if run_cluster_guided then cluster_guided.LRAA_final_gtf else init_gtf_file
  File? gene_sparse_for_symbols = if run_cluster_guided then cluster_guided.sc_gene_sparse_tar_gz else (if enable_filter_good_cells then filter_good_cells.filtered_gene_sparse_tar_gz else build_sc_from_init_tracking.gene_sparse_dir_tgz)
  File? isoform_sparse_for_symbols = if run_cluster_guided then cluster_guided.sc_isoform_sparse_tar_gz else (if enable_filter_good_cells then filter_good_cells.filtered_isoform_sparse_tar_gz else build_sc_from_init_tracking.isoform_sparse_dir_tgz)
  File? splice_pattern_sparse_for_symbols = if run_cluster_guided then cluster_guided.sc_splice_pattern_sparse_tar_gz else (if enable_filter_good_cells then filter_good_cells.filtered_splice_pattern_sparse_tar_gz else build_sc_from_init_tracking.splice_pattern_sparse_dir_tgz)
  File? mapping_for_symbols = if run_cluster_guided then cluster_guided.sc_gene_transcript_splicehash_mapping else build_sc_from_init_tracking.mapping_file

  if (defined(ref_annot_gtf_source_gene_symbols) && defined(gene_sparse_for_symbols) && defined(isoform_sparse_for_symbols) && defined(splice_pattern_sparse_for_symbols) && defined(mapping_for_symbols)) {
    call GeneSymbols.Incorporate_gene_symbols as add_gene_symbols {
      input:
        sample_id = sample_id,
        reference_gtf = select_first([ref_annot_gtf_source_gene_symbols]),
        final_gtf = gtf_for_symbols,
        final_sc_gene_sparse_tar_gz = select_first([gene_sparse_for_symbols]),
        final_sc_isoform_sparse_tar_gz = select_first([isoform_sparse_for_symbols]),
        final_sc_splice_pattern_sparse_tar_gz = select_first([splice_pattern_sparse_for_symbols]),
        final_sc_gene_transcript_splicehash_mapping = select_first([mapping_for_symbols]),
        docker = docker
    }
  }

  output {
    # Initial discovery outputs
    File? init_quant_expr = init_quant_expr_file
    File? init_quant_tracking = init_quant_tracking_generated
    File? init_gtf = init_gtf_generated

    # Initial single-cell matrices and clustering inputs/outputs
    File? init_sc_gene_sparse_tar_gz = build_sc_from_init_tracking.gene_sparse_dir_tgz
    File? init_sc_isoform_sparse_tar_gz = build_sc_from_init_tracking.isoform_sparse_dir_tgz
    File? init_sc_splice_pattern_sparse_tar_gz = build_sc_from_init_tracking.splice_pattern_sparse_dir_tgz
    File? init_sc_gene_transcript_splicehash_mapping = build_sc_from_init_tracking.mapping_file
    
    
    # Filter empty droplets
    File? filtered_gene_sparse_tar_gz = filter_good_cells.filtered_gene_sparse_tar_gz
    File? filtered_isoform_sparse_tar_gz = filter_good_cells.filtered_isoform_sparse_tar_gz
    File? filtered_splice_pattern_sparse_tar_gz = filter_good_cells.filtered_splice_pattern_sparse_tar_gz
    File? good_cell_barcodes = filter_good_cells.good_cell_barcodes
    File? filtering_summary = filter_good_cells.filtering_summary
    
    # Seurat for gene-based filtering, cell clustering, and umap
    File? seurat_umap_pdf = cluster_cells.umap_pdf
    File? seurat_umap_with_clusters_tsv = cluster_cells.umap_with_clusters_tsv
    File? seurat_rds_initial = cluster_cells.seurat_rds_initial
    File? seurat_rds = cluster_cells.seurat_rds
    File? seurat_cluster_assignments = cluster_assignments_generated

    # Final cluster-guided LRAA outputs (main deliverables)
    File? final_gtf = cluster_guided.LRAA_final_gtf
    File? final_gtf_tracking = cluster_guided.LRAA_final_gtf_tracking
    File? final_tracking = cluster_guided.LRAA_final_tracking
    File? final_sc_gene_sparse_tar_gz = cluster_guided.sc_gene_sparse_tar_gz
    File? final_sc_isoform_sparse_tar_gz = cluster_guided.sc_isoform_sparse_tar_gz
    File? final_sc_splice_pattern_sparse_tar_gz = cluster_guided.sc_splice_pattern_sparse_tar_gz
    File? final_sc_gene_transcript_splicehash_mapping = cluster_guided.sc_gene_transcript_splicehash_mapping

    # Convenience tarballs and matrices from the cluster-guided phase
    File? partitioned_cluster_bams_tar = cluster_guided.LRAA_partitioned_cluster_bams_tar
    File? final_cluster_exprs_tar = cluster_guided.LRAA_final_cluster_exprs_tar
    File? final_cluster_trackings_tar = cluster_guided.LRAA_final_cluster_trackings_tar
    
    # Preliminary intermediate outputs from cluster-guided discovery phase
    File? prelim_cluster_gtfs = cluster_guided.LRAA_prelim_cluster_gtfs
    File? prelim_cluster_read_trackings = cluster_guided.LRAA_prelim_cluster_read_trackings
    File? prelim_cluster_pseudobulk_exprs = cluster_guided.LRAA_prelim_cluster_pseudobulk_exprs
    
    File? cluster_gene_counts_matrix = cluster_guided.cluster_gene_counts_matrix
    File? cluster_gene_TPM_matrix = cluster_guided.cluster_gene_TPM_matrix
    File? cluster_isoform_counts_matrix = cluster_guided.cluster_isoform_counts_matrix
    File? cluster_isoform_TPM_matrix = cluster_guided.cluster_isoform_TPM_matrix
    File? cluster_isoform_counts_forDiffIsoUsage = cluster_guided.cluster_isoform_counts_forDiffIsoUsage

    File? incl_gene_symbols_gffcompare_tracking = add_gene_symbols.gffcompare_tracking
    File? incl_gene_symbols_gffcompare_stats = add_gene_symbols.gffcompare_stats
    File? incl_gene_symbols_updated_gtf = add_gene_symbols.updated_gtf_with_gene_symbols
    File? incl_gene_symbols_updated_id_mappings = add_gene_symbols.updated_id_mappings
    File? incl_gene_symbols_gene_sparse_tar_gz = add_gene_symbols.updated_gene_sparse_tar_gz
    File? incl_gene_symbols_isoform_sparse_tar_gz = add_gene_symbols.updated_isoform_sparse_tar_gz
    File? incl_gene_symbols_splice_pattern_sparse_tar_gz = add_gene_symbols.updated_splice_pattern_sparse_tar_gz
  }
}