version 1.0

# scANVI label-transfer pipeline.
#
# Two tasks run in sequence on the same Docker image:
#   1. PreprocessFilter      (CPU) - resolves/downloads the three h5ad inputs
#                                    and writes three preprocessed h5ad files.
#   2. MultiomeLabelTransfer (GPU) - consumes the preprocessed files and writes
#                                    the annotated / prediction h5ad outputs.
workflow scANVI {

  meta {
    description: "Pipeline for cell type label transfer on Multiome data using SCVI and SCANVI models. Integrates single-cell RNA (GEX) and ATAC data with an annotated reference to transfer cell type labels via semi-supervised deep generative models."
    allowNestedInputs: true
  }

  input {
    # GCS bucket path containing input h5ad files (e.g., gs://bucket/path/to/inputs)
    String? input_bucket

    # Optional direct h5ad file inputs. When gex_h5ad is set, PreprocessFilter
    # uses the three direct files instead of the bucket, so all three must be
    # supplied together.
    File? gex_h5ad
    File? atac_h5ad
    File? ref_h5ad

    # Expected filenames in the input bucket
    String gex_filename = "gex.h5ad"
    String atac_filename = "atac.h5ad"
    String ref_filename = "ref.h5ad"

    # Unique identifier prepended to all output filenames
    String input_id
  }

  String pipeline_version = "1.0.0"

  # Docker image (same container for both tasks; only Task 2 gets GPUs attached)
  String docker = "us.gcr.io/broad-gotc-prod/scvi-scanvi@sha256:81fe915a045bd2929a1c457f4a0061055c6ea42fa3f88e9352b618e4a6e47b58"

  # Step 1: CPU-only preprocessing and filtering of all three h5ad inputs
  call PreprocessFilter {
    input:
      input_bucket = input_bucket,
      gex_h5ad = gex_h5ad,
      atac_h5ad = atac_h5ad,
      ref_h5ad = ref_h5ad,
      gex_filename = gex_filename,
      atac_filename = atac_filename,
      ref_filename = ref_filename,
      input_id = input_id,
      docker = docker
  }

  # Step 2: GPU-accelerated SCVI/SCANVI model training and label transfer
  call MultiomeLabelTransfer {
    input:
      gex_h5ad = PreprocessFilter.preprocessed_gex_h5ad,
      atac_activity_h5ad = PreprocessFilter.preprocessed_atac_activity_h5ad,
      ref_h5ad = PreprocessFilter.preprocessed_ref_h5ad,
      input_id = input_id,
      docker = docker
  }

  output {
    File scanvi_predictions_h5ad = MultiomeLabelTransfer.scanvi_predictions_h5ad
    File atac_annotated_h5ad = MultiomeLabelTransfer.atac_annotated_h5ad
    File gex_annotated_h5ad = MultiomeLabelTransfer.gex_annotated_h5ad
    String pipeline_version_out = pipeline_version
  }
}

# ──────────────────────────────────────────────────────────────────────────────
# Task 1: PreprocessFilter (CPU-only)
#
# Handles all h5ad preprocessing and filtering before model training:
#   - Patches missing columns (star_IsCell, gex_barcodes)
#   - Filters GEX to STARsolo cell calls and min count thresholds
#   - Reindexes ATAC barcodes to match GEX
#   - Subsets to shared barcodes across GEX and ATAC
#   - Assigns batch labels and modality tags
#   - Converts ATAC cell-by-bin matrix to gene activity matrix (snapatac2)
#   - Outputs three preprocessed h5ad files ready for model training
# ──────────────────────────────────────────────────────────────────────────────
task PreprocessFilter {

  input {
    # GCS bucket path containing input h5ad files
    String? input_bucket

    # Optional direct h5ad file inputs
    File? gex_h5ad
    File? atac_h5ad
    File? ref_h5ad

    # Expected filenames in the input bucket
    String gex_filename = "gex.h5ad"
    String atac_filename = "atac.h5ad"
    String ref_filename = "ref.h5ad"

    # Output prefix and container image (supplied by the workflow)
    String input_id
    String docker

    # Runtime sizing defaults
    Int disk_size = 1000 # bigger disk before cell filtering
    Int mem_size = 120
    Int nthreads = 32
  }

  parameter_meta {
    input_bucket: "GCS bucket path containing input h5ad files (e.g., gs://bucket/path/to/inputs)."
    gex_h5ad: "Gene expression AnnData h5ad file from Multiome/Optimus pipeline output."
    atac_h5ad: "ATAC cell-by-bin AnnData h5ad file from Multiome/PeakCalling pipeline output."
    ref_h5ad: "Annotated reference AnnData h5ad file with cell type labels in obs['final_annotation']."
    gex_filename: "Expected GEX h5ad filename in the input bucket."
    atac_filename: "Expected ATAC h5ad filename in the input bucket."
    ref_filename: "Expected reference h5ad filename in the input bucket."
    input_id: "Unique identifier prepended to all output filenames."
    docker: "Docker image containing the scvi-scanvi runtime environment."
    disk_size: "Disk size in GB."
    mem_size: "Memory size in GB."
    nthreads: "Number of CPU cores requested for the task."
  }

  # NOTE(review): in this view the embedded Python heredoc is truncated to
  # `python3 <>>` (the script body and the closing `>>>` of the command section
  # are not visible). That text is reproduced verbatim below; confirm against
  # the original file before applying this edit.
  command <<<
    set -euo pipefail

    # ── Resolve input file paths ──────────────────────────────────────────
    if [ -n "~{default='' gex_h5ad}" ]; then
      # Direct File inputs: Cromwell already localized them
      GEX_FILE="~{default='' gex_h5ad}"
      ATAC_FILE="~{default='' atac_h5ad}"
      REF_FILE="~{default='' ref_h5ad}"

      # Verify all three localized files are present and non-empty
      for f in "$GEX_FILE" "$ATAC_FILE" "$REF_FILE"; do
        # An empty path means an optional File input was left unset while
        # gex_h5ad was given; report that explicitly instead of printing
        # "input file not found: " with a blank path.
        if [ -z "$f" ]; then
          echo "ERROR: gex_h5ad, atac_h5ad and ref_h5ad must all be provided together when using direct file inputs." >&2; exit 1
        fi
        if [ ! -f "$f" ]; then
          echo "ERROR: input file not found: $f" >&2; exit 1
        fi
        if [ ! -s "$f" ]; then
          echo "ERROR: input file is empty: $f" >&2; exit 1
        fi
      done
    elif [ -n "~{default='' input_bucket}" ]; then
      # Bucket mode: construct GCS paths and download
      BUCKET="~{default='' input_bucket}"
      BUCKET="${BUCKET%/}"   # drop one trailing slash so joined paths have no "//"
      GEX_FILE="${BUCKET}/~{gex_filename}"
      ATAC_FILE="${BUCKET}/~{atac_filename}"
      REF_FILE="${BUCKET}/~{ref_filename}"

      # Verify all three objects exist in the bucket before downloading
      for gs_path in "$GEX_FILE" "$ATAC_FILE" "$REF_FILE"; do
        if ! gsutil -q stat "$gs_path"; then
          echo "ERROR: GCS object not found: $gs_path" >&2; exit 1
        fi
      done

      echo "Downloading inputs from bucket..."
      gsutil cp "$GEX_FILE" gex_input.h5ad
      gsutil cp "$ATAC_FILE" atac_input.h5ad
      gsutil cp "$REF_FILE" ref_input.h5ad

      # From here on, point at the local copies
      GEX_FILE="gex_input.h5ad"
      ATAC_FILE="atac_input.h5ad"
      REF_FILE="ref_input.h5ad"
    else
      echo "ERROR: must provide either direct file inputs (gex_h5ad, atac_h5ad, ref_h5ad) or input_bucket." >&2
      exit 1
    fi

    # Exported so the Python step can read the resolved paths from its environment
    export GEX_FILE ATAC_FILE REF_FILE

    # Symlink the gene annotation file (needed by snapatac2 make_gene_matrix)
    ln -sf /usr/local/gencode.v41.basic.annotation.gff3.gz .

    # ── Run preprocessing in Python ───────────────────────────────────────
    python3 <>>

  runtime {
    docker: docker
    bootDiskSizeGb: 20
    disks: "local-disk ~{disk_size} SSD"
    memory: "~{mem_size} GiB"
    cpu: nthreads
    maxRetries: 1
  }

  output {
    File preprocessed_gex_h5ad = "~{input_id}_preprocessed_gex.h5ad"
    File preprocessed_atac_activity_h5ad = "~{input_id}_preprocessed_atac_activity.h5ad"
    File preprocessed_ref_h5ad = "~{input_id}_preprocessed_ref.h5ad"
  }
}

# ──────────────────────────────────────────────────────────────────────────────
# Task 2: MultiomeLabelTransfer (GPU)
#
# Receives preprocessed h5ad files and runs model training + label transfer:
#   - SCVI unsupervised latent space learning
#   - SCANVI semi-supervised label transfer
#   - Outputs annotated GEX, ATAC, and SCANVI prediction h5ad files
# ──────────────────────────────────────────────────────────────────────────────
task MultiomeLabelTransfer {

  input {
    # Preprocessed h5ad files from PreprocessFilter
    File gex_h5ad
    File atac_activity_h5ad
    File ref_h5ad

    # Output prefix and container image (supplied by the workflow)
    String input_id
    String docker

    # Runtime sizing defaults
    Int disk_size = 500
    Int mem_size = 120
    Int nthreads = 32
  }

  parameter_meta {
    gex_h5ad: "Preprocessed gene expression h5ad file (filtered, batch-labeled, modality-tagged)."
    atac_activity_h5ad: "Preprocessed ATAC gene activity h5ad file (converted from cell-by-bin, batch-labeled, modality-tagged)."
    ref_h5ad: "Preprocessed reference h5ad file with cell type labels and modality tag."
    input_id: "Unique identifier prepended to all output filenames."
    docker: "Docker image containing the scvi-scanvi runtime environment."
    disk_size: "Disk size in GB."
    mem_size: "Memory size in GB."
    nthreads: "Number of CPU cores requested for the task."
  }

  # NOTE(review): in this view the embedded Python heredoc is truncated to
  # `python3 <>>` (the script body and the closing `>>>` of the command section
  # are not visible). That text is reproduced verbatim below; confirm against
  # the original file before applying this edit.
  command <<<
    set -euo pipefail

    python3 <>>

  runtime {
    docker: docker
    bootDiskSizeGb: 20
    disks: "local-disk ~{disk_size} SSD"
    memory: "~{mem_size} GiB"
    cpu: nthreads
    # Cromwell's GPU runtime attributes are gpuType, gpuCount and
    # nvidiaDriverVersion; other key names are not applied to the VM request.
    gpuType: "nvidia-tesla-t4" # known to work with Terra
    gpuCount: 2
    nvidiaDriverVersion: "535.104.05" # compatible with CUDA 12.x and T4 GPUs, known to work with Terra
    maxRetries: 1
  }

  output {
    File scanvi_predictions_h5ad = "~{input_id}_SCANVI_predictions.h5ad"
    File atac_annotated_h5ad = "~{input_id}_atac_annotated_matrix.h5ad"
    File gex_annotated_h5ad = "~{input_id}_gex_annotated_matrix.h5ad"
  }
}