version 1.0 import "../../../tasks/broad/Utilities.wdl" as utils workflow snm3C { input { Array[File] fastq_input_read1 Array[File] fastq_input_read2 File random_primer_indexes String plate_id String cloud_provider # mapping inputs File tarred_index_files File genome_fa File chromosome_sizes String r1_adapter = "AGATCGGAAGAGCACACGTCTGAAC" String r2_adapter = "AGATCGGAAGAGCGTCGTGTAGGGA" Int r1_left_cut = 10 Int r1_right_cut = 10 Int r2_left_cut = 10 Int r2_right_cut = 10 Int min_read_length = 30 Int num_upstr_bases = 0 Int num_downstr_bases = 2 Int compress_level = 5 Int batch_number = 6 } #docker images String m3c_yap_hisat_docker = "m3c-yap-hisat:2.4" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" String acr_docker_prefix = "dsppipelinedev.azurecr.io/" String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix String cromwell_root_dir = if cloud_provider == "gcp" then "/mnt/disks/cromwell_root" else "/cromwell-executions" # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { call utils.ErrorWithMessage as ErrorMessageIncorrectInput { input: message = "cloud_provider must be supplied with either 'gcp' or 'azure'." } } # version of the pipeline String pipeline_version = "4.1.0" call Demultiplexing { input: fastq_input_read1 = fastq_input_read1, fastq_input_read2 = fastq_input_read2, random_primer_indexes = random_primer_indexes, plate_id = plate_id, batch_number = batch_number, docker = docker_prefix + m3c_yap_hisat_docker, } scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { call Hisat_paired_end as Hisat_paired_end { input: tarred_demultiplexed_fastqs = tar, tarred_index_files = tarred_index_files, genome_fa = genome_fa, chromosome_sizes = chromosome_sizes, min_read_length = min_read_length, r1_adapter = r1_adapter, r2_adapter = r2_adapter, r1_left_cut = r1_left_cut, r1_right_cut = r1_right_cut, r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider, } call Hisat_single_end as Hisat_single_end { input: split_fq_tar = Hisat_paired_end.split_fq_tar, tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider } call Merge_sort_analyze as Merge_sort_analyze { input: paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, genome_fa = genome_fa, num_upstr_bases = num_upstr_bases, num_downstr_bases = num_downstr_bases, compress_level = compress_level, chromosome_sizes = chromosome_sizes, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider } } call Summary_PerCellOutput { input: name_sorted_bams = Merge_sort_analyze.name_sorted_bam, unique_reads_cgn_extraction_allc = Merge_sort_analyze.allc, unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi, all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts, unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar, unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir } call Summary { input: trimmed_stats = Hisat_paired_end.trim_stats_tar, hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, dedup_stats = Merge_sort_analyze.dedup_stats_tar, chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider } meta { allowNestedInputs: true } output { File MappingSummary = Summary.mapping_summary Array[File] reference_version = Hisat_paired_end.reference_version Array[File] name_sorted_bam_array = Summary_PerCellOutput.name_sorted_bam_array Array[File] unique_reads_cgn_extraction_allc_array = Summary_PerCellOutput.unique_reads_cgn_extraction_allc_array Array[File] unique_reads_cgn_extraction_tbi_array = Summary_PerCellOutput.unique_reads_cgn_extraction_tbi_array Array[File] all_reads_3C_contacts_array = Summary_PerCellOutput.all_reads_3C_contacts_array Array[File] unique_reads_cgn_extraction_allc_extract_array = Summary_PerCellOutput.unique_reads_cgn_extraction_allc_extract_array Array[File] unique_reads_cgn_extraction_tbi_extract_array = Summary_PerCellOutput.unique_reads_cgn_extraction_tbi_extract_array } } task Demultiplexing { input { Array[File] fastq_input_read1 Array[File] fastq_input_read2 File random_primer_indexes String plate_id Int batch_number Int min_threshold = 100 Int max_threshold = 6000000 String docker Int disk_size = 1000 Int mem_size = 10 Int preemptible_tries = 2 Int cpu = 8 } command <<< set -euo pipefail WORKING_DIR=`pwd` # Cat files for each r1, r2 cat ~{sep=' ' fastq_input_read1} > $WORKING_DIR/r1.fastq.gz cat ~{sep=' ' fastq_input_read2} > $WORKING_DIR/r2.fastq.gz # Run cutadapt /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ -g file:~{random_primer_indexes} \ -o ~{plate_id}-{name}-R1.fq.gz \ -p ~{plate_id}-{name}-R2.fq.gz \ $WORKING_DIR/r1.fastq.gz \ $WORKING_DIR/r2.fastq.gz \ > $WORKING_DIR/~{plate_id}.stats.txt # Remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz python3 < ~{max_threshold}: print("Removing ", file_path, " with count equal to ", adapter_counts[adapter_name]) os.remove(file_path) CODE # Check if the number of *R1.fq.gz files is 0 if [[ $(ls | grep "\-R1.fq.gz" | wc -l) -eq 0 ]]; then echo "Error: No files found. All fastq files were removed. Exiting." exit 1 fi # Batch the fastq files into folders of batch_number size R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz")) R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz")) batch_number=~{batch_number} total_files=${#R1_files[@]} echo "Total files: $total_files" if [[ $total_files -lt $batch_number ]]; then echo "Warning: Number of files is less than the batch number. Updating batch number to $total_files." batch_number=$total_files fi for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs done # Counter for the folder index and create emptycells file folder_index=1 # Distribute the FASTQ files and create TAR files for file in "${R1_files[@]}"; do sample_id=$(basename "$file" "-R1.fq.gz") r2_file="${sample_id}-R2.fq.gz" mv $WORKING_DIR/$file batch$((folder_index))/$file mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file # Increment the counter folder_index=$(( (folder_index % $batch_number) + 1 )) done # Tar up files per batch echo "TAR files" for i in $(seq 1 "${batch_number}"); do tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz done >>> runtime { docker: docker disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" preemptible: preemptible_tries } output { Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") File stats = "~{plate_id}.stats.txt" } } task Hisat_paired_end { input { File tarred_demultiplexed_fastqs File tarred_index_files File genome_fa File chromosome_sizes String plate_id String docker String cromwell_root_dir String cloud_provider String r1_adapter String r2_adapter Int r1_left_cut Int r1_right_cut Int r2_left_cut Int r2_right_cut Int min_read_length Int disk_size = 1000 Int cpu = 48 Int mem_size = 64 Int preemptible_tries = 2 String cpu_platform = "Intel Ice Lake" } command <<< set -euo pipefail set -x lscpu WORKING_DIR=`pwd` # check genomic reference version and print to output txt file STRING=~{genome_fa} BASE=$(basename $STRING .fa) echo "The reference is $BASE" > ~{plate_id}.reference_version.txt # untar the index files for hisat task start=$(date +%s) echo "Untarring tarred_index_files" pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" # get the basename of the genome_fa file cp ~{genome_fa} . genome_fa_basename=$(basename ~{genome_fa} .fa) start=$(date +%s) echo "samtools faidx $genome_fa_basename.fa" samtools faidx $genome_fa_basename.fa end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to samtools faidx: $elapsed seconds" min_read_length=~{min_read_length} # untar the demultiplexed fastqs for sort and trim task start=$(date +%s) echo "Untar demultiplexed fastqs" pigz -dc ~{tarred_demultiplexed_fastqs} | tar -xf - end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" if [ ~{cloud_provider} = "gcp" ]; then batch_dir="~{cromwell_root_dir}~{cromwell_root_dir}/batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" fi task() { local file=$1 sample_id=$(basename "$file" "-R1.fq.gz") echo $sample_id r2_file="${sample_id}-R2.fq.gz" r1_file="${sample_id}-R1.fq.gz" cp $batch_dir/"$r1_file" . cp $batch_dir/"$r2_file" . # sort start=$(date +%s) echo "Run sort r1" zcat "$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" # sort start=$(date +%s) echo "Run sort r2" zcat "$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" # trim using cutadapt start=$(date +%s) echo "Run cutadapt" /opt/conda/bin/cutadapt \ -a R1Adapter=~{r1_adapter} \ -A R2Adapter=~{r2_adapter} \ --report=minimal -O 6 -q 20 \ -u ~{r1_left_cut} \ -u -~{r1_right_cut} \ -U ~{r2_left_cut} \ -U -~{r2_right_cut} \ -Z \ -m ${min_read_length}:${min_read_length} \ --pair-filter 'both' \ -o ${sample_id}-R1_trimmed.fq.gz \ -p ${sample_id}-R2_trimmed.fq.gz \ ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ > ${sample_id}.trimmed.stats.txt end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" # hisat run start=$(date +%s) echo "Run hisat" if [ ~{cloud_provider} = "gcp" ]; then hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" fi hisat-3n $hisat_index_file_dir \ -q \ -1 ${sample_id}-R1_trimmed.fq.gz \ -2 ${sample_id}-R2_trimmed.fq.gz \ --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ --summary-file ${sample_id}.hisat3n_dna_summary.txt \ --threads 8 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run hisat: $elapsed seconds" # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" rm ${sample_id}-R1.fq.gz ${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam rm ${sample_id}.hisat3n_dna.unmapped.fastq } R1_files=($(ls $batch_dir | grep "\-R1.fq.gz")) R2_files=($(ls $batch_dir | grep "\-R2.fq.gz")) # run 6 instances of task in parallel for file in "${R1_files[@]}"; do ( echo "starting task $file.." task "$file" sleep $(( (RANDOM % 3) + 1)) ) & # allow to execute up to 6 jobs in parallel if [[ $(jobs -r -p | wc -l) -ge 6 ]]; then wait -n fi done wait echo "Tasks all done." du -h * #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of *.hisat3n_dna.unique_aligned.bam files bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) fastq_counts=$(find . -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} # Check if the count of *.hisat3n_dna.unique_aligned.bam files matches the length of the array ${R1_files[@]} if [ "$bam_count" -ne "$array_length" ]; then echo "Error: Number of BAM files does not match the length of the array." exit 1 fi # Check if the count of FASTQ files matches the length of the array ${R1_files[@]} if [ "$fastq_counts" -ne "$((2 * array_length))" ]; then echo "Error: Number of FASTQ files: $fastq_count does not match the 2 * length of the array: ${#R1_files[@]}." exit 1 fi echo "Number of BAM and FASTQ files matches the length of the array." #################################### # tar up stats echo "Tar up stats" start=$(date +%s) tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz tar -cf - *.hisat3n_dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar stats $elapsed seconds" # tar up the uniqe bams echo "Tar up unique bams" start=$(date +%s) tar -cf - *.hisat3n_dna.unique_aligned.bam | pigz > ~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar unique bams $elapsed seconds" # tar up the split fastq files echo "Tar up fastqs" start=$(date +%s) tar -cf - *.split_reads*.fastq | pigz > ~{plate_id}.hisat3n_paired_end_split_fastq_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar fastqs $elapsed seconds" >>> runtime { docker: docker disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" cpuPlatform: cpu_platform preemptible: preemptible_tries } output { File trim_stats_tar = "~{plate_id}.trimmed_stats_files.tar.gz" File hisat3n_paired_end_stats_tar = "~{plate_id}.hisat3n_paired_end_stats_files.tar.gz" File unique_bam_tar = "~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz" File split_fq_tar = "~{plate_id}.hisat3n_paired_end_split_fastq_files.tar.gz" File reference_version = "~{plate_id}.reference_version.txt" } } task Hisat_single_end { input { File split_fq_tar File genome_fa File tarred_index_files String plate_id String docker String cromwell_root_dir String cloud_provider Int disk_size = 1000 Int mem_size = 64 Int cpu = 32 Int preemptible_tries = 2 String cpu_platform = "Intel Ice Lake" } command <<< set -euo pipefail set -x lscpu WORKING_DIR=`pwd` # untar the tarred index files echo "Untar tarred_index_files" start=$(date +%s) pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" cp ~{genome_fa} . #get the basename of the genome_fa file echo "samtools faidx" start=$(date +%s) genome_fa_basename=$(basename ~{genome_fa} .fa) samtools faidx $genome_fa_basename.fa end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to samtools faidx: $elapsed seconds" # untar the unmapped fastq files echo "Untar split_fq_tar" start=$(date +%s) pigz -dc ~{split_fq_tar} | tar -xf - rm ~{split_fq_tar} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar split_fq_tar: $elapsed seconds" # make directories mkdir -p ~{cromwell_root_dir}/merged_sort_bams mkdir -p ~{cromwell_root_dir}/read_overlap # define lists of r1 and r2 fq files R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) R2_files=($(ls | grep "\.hisat3n_dna.split_reads.R2.fastq")) task() { BASE=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") echo $BASE echo "Running hisat on sample_id_R1" $BASE echo "Hisat 3n R1" start=$(date +%s) if [ ~{cloud_provider} = "gcp" ]; then hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" fi # hisat on R1 single end hisat-3n $hisat_index_file_dir \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R1.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R1.sam --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ --summary-file ${BASE}.hisat3n_dna_split_reads_summary.R1.txt \ --threads 8 end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run $elapsed seconds" echo "Finish running hisat on sample_id_R1" $BASE echo "Hisat 3n R2" start=$(date +%s) echo "Running hisat on sample_id_R2" $BASE # hisat on R2 single end hisat-3n $hisat_index_file_dir \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R2.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R2.sam --directional-mapping --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t --new-summary \ --summary-file ${BASE}.hisat3n_dna_split_reads_summary.R2.txt \ --threads 8 end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run $elapsed seconds" echo "Finish running hisat on sample_id_R2" $BASE # samtools merge echo "samtools merge R1 and R2" start=$(date +%s) samtools merge -o ${BASE}.name_merged.sam ${BASE}.hisat3n_dna.split_reads.R1.sam ${BASE}.hisat3n_dna.split_reads.R2.sam -@8 end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run samtools merge $elapsed seconds" # samtools sort echo "samtools sort R1 and R2" start=$(date +%s) samtools sort -n -@8 -m1g ${BASE}.name_merged.sam -o ${BASE}.name_sorted.bam end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run samtools sort $elapsed seconds" # samtools filter bam echo "samtools -q 10" start=$(date +%s) samtools view -q 10 ${BASE}.name_sorted.bam -o ${BASE}.name_sorted.filtered.bam end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run samtools -q 10 $elapsed seconds" if [ ~{cloud_provider} = "gcp" ]; then bam_path_prefix="~{cromwell_root_dir}" else bam_path_prefix=$WORKING_DIR fi # remove_overlap_read_parts echo "call remove_overlap_read_parts" start=$(date +%s) python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path="'"$BASE"'.name_sorted.filtered.bam",out_bam_path="'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam")' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" # remove files rm ${BASE}.hisat3n_dna.split_reads.R1.fastq ${BASE}.hisat3n_dna.split_reads.R2.fastq rm ${BASE}.hisat3n_dna.split_reads.R1.sam ${BASE}.hisat3n_dna.split_reads.R2.sam rm ${BASE}.name_merged.sam rm ${BASE}.name_sorted.bam rm ${BASE}.name_sorted.filtered.bam } # run 4 instances in parallel each with 8 threads for file in "${R1_files[@]}"; do ( echo "starting task $file.." task "$file" sleep $(( (RANDOM % 3) + 1)) ) & if [[ $(jobs -r -p | wc -l) -ge 4 ]]; then wait -n fi done wait echo "All done running tasks." du -h * #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of bam files bam_count=$(find . -maxdepth 1 -type f -name '*read_overlap.bam' | wc -l) # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} # Check if the count of bams matches the length of the array ${R1_files[@]} if [ "$bam_count" -ne "$array_length" ]; then echo "Error: Number of BAM files does not match the length of the array." exit 1 fi echo "Number of BAM files matches the length of the array." #################################### echo "Tar up summary text files" start=$(date +%s) # tar up the r1 and r2 stats files -p to set number of threads tar -cf - *.hisat3n_dna_split_reads_summary.R1.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz tar -cf - *.hisat3n_dna_split_reads_summary.R2.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar summary text files $elapsed seconds" # tar up read overlap files echo "Tar up read_overlap bams" start=$(date +%s) tar -cf - *read_overlap.bam | pigz > ~{plate_id}.remove_overlap_read_parts.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to tar read_overlap bams $elapsed seconds" >>> runtime { docker: docker disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" cpuPlatform: cpu_platform preemptible: preemptible_tries } output { File hisat3n_dna_split_reads_summary_R1_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz" File hisat3n_dna_split_reads_summary_R2_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz" File remove_overlaps_output_bam_tar = "~{plate_id}.remove_overlap_read_parts.tar.gz" } } task Merge_sort_analyze { input { String plate_id File paired_end_unique_tar File read_overlap_tar String docker String cromwell_root_dir String cloud_provider #input for allcools bam-to-allc File genome_fa String genome_base = basename(genome_fa) Int num_upstr_bases Int num_downstr_bases Int compress_level File chromosome_sizes String cpu_platform = "Intel Ice Lake" Int disk_size = 1000 Int mem_size = 40 Int cpu = 16 Int preemptible_tries = 2 } command <<< set -euo pipefail set -x lscpu WORKING_DIR=`pwd` # unzip tars echo "Untar paired_end_unique_tar" start=$(date +%s) pigz -dc ~{paired_end_unique_tar} | tar -xf - rm ~{paired_end_unique_tar} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar paired_end_unique_tar: $elapsed seconds" echo "Untar read_overlap_tar" start=$(date +%s) pigz -dc ~{read_overlap_tar} | tar -xf - rm ~{read_overlap_tar} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar read_overlap_tar: $elapsed seconds" # reference and index start=$(date +%s) echo "Reference and index fasta" mkdir reference cp ~{genome_fa} reference ls reference samtools faidx reference/*.fa end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to index fasta $elapsed seconds" # define lists of r1 and r2 fq files UNIQUE_BAMS=($(ls | grep "\.hisat3n_dna.unique_aligned.bam")) SPLIT_BAMS=($(ls | grep "\.hisat3n_dna.split_reads.read_overlap.bam")) # for allcools bam-to-allc if [ ~{num_upstr_bases} -eq 0 ]; then mcg_context=CGN else mcg_context=HCGN fi # make directories mkdir ~{cromwell_root_dir}/output_bams mkdir ~{cromwell_root_dir}/temp mkdir ~{cromwell_root_dir}/allc-${mcg_context} task() { local file=$1 sample_id=$(basename "$file" ".hisat3n_dna.unique_aligned.bam") echo $sample_id start=$(date +%s) echo "Merge all unique_aligned and read_overlap" samtools merge -f "${sample_id}.hisat3n_dna.all_reads.bam" "${sample_id}.hisat3n_dna.unique_aligned.bam" "${sample_id}.hisat3n_dna.split_reads.read_overlap.bam" -@4 end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run merge $elapsed seconds" start=$(date +%s) echo "Sort all reads by name" samtools sort -n -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" "${sample_id}.hisat3n_dna.all_reads.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort by name $elapsed seconds" start=$(date +%s) echo "Sort all reads by position" samtools sort -O BAM -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.pos_sort.bam" "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort by pos $elapsed seconds" start=$(date +%s) echo "Call Picard remove duplicates" name=${sample_id}.hisat3n_dna.all_reads.deduped picard MarkDuplicates I=${sample_id}.hisat3n_dna.all_reads.pos_sort.bam O=~{cromwell_root_dir}/output_bams/${name}.bam \ M=~{cromwell_root_dir}/output_bams/${name}.matrix.txt \ REMOVE_DUPLICATES=true TMP_DIR=~{cromwell_root_dir}/temp end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run picard $elapsed seconds" start=$(date +%s) echo "Call samtools index" samtools index ~{cromwell_root_dir}/output_bams/${name}.bam end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to samtools index $elapsed seconds" start=$(date +%s) echo "Call chromatin contacts from name sorted bams" python3 -c 'from cemba_data.hisat3n import *;import os;import glob;call_chromatin_contacts(bam_path="'"$sample_id"'.hisat3n_dna.all_reads.name_sort.bam",contact_prefix="'"$sample_id"'.hisat3n_dna.all_reads",save_raw=False,save_hic_format=True)' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to chromatin contacts $elapsed seconds" if [ ~{cloud_provider} = "gcp" ]; then reference_fasta="~{cromwell_root_dir}/reference/~{genome_base}" else reference_fasta="$WORKING_DIR/reference/~{genome_base}" fi start=$(date +%s) echo "Call allcools bam-to-allc from deduped.bams" /opt/conda/bin/allcools bam-to-allc \ --bam_path ~{cromwell_root_dir}/output_bams/${name}.bam \ --reference_fasta $reference_fasta \ --output_path "${sample_id}.allc.tsv.gz" \ --num_upstr_bases ~{num_upstr_bases} \ --num_downstr_bases ~{num_downstr_bases} \ --compress_level ~{compress_level} \ --save_count_df \ --convert_bam_strandness end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to allcools bam-to-allc $elapsed seconds" start=$(date +%s) echo "Call allcools extract-all" allcools extract-allc --strandness merge \ --allc_path ${sample_id}.allc.tsv.gz \ --output_prefix ~{cromwell_root_dir}/allc-${mcg_context}/${sample_id} \ --mc_contexts ${mcg_context} \ --chrom_size_path ~{chromosome_sizes} end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to allcools extract-all $elapsed seconds" echo "Remove some bams" rm ${sample_id}.hisat3n_dna.all_reads.bam rm ${sample_id}.hisat3n_dna.all_reads.pos_sort.bam rm ~{cromwell_root_dir}/${sample_id}.hisat3n_dna.split_reads.read_overlap.bam rm ~{cromwell_root_dir}/${sample_id}.hisat3n_dna.unique_aligned.bam } # run 4 instances of task in parallel for file in "${UNIQUE_BAMS[@]}"; do ( echo "starting task $file.." task "$file" sleep $(( (RANDOM % 3) + 1)) ) & # allow to execute up to 4 jobs in parallel if [[ $(jobs -r -p | wc -l) -ge 4 ]]; then wait -n fi done wait echo "Tasks all done." du -h * #################################### ## make sure that the number of output bams equals the length of UNIQUE_BAMS # Count the number of *.hisat3n_dna.unique_aligned.bam files bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.all_reads.name_sort.bam' | wc -l) contact_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.all_reads.3C.contact.tsv.gz' | wc -l) # Get the length of the array ${UNIQUE_BAMS[@]} array_length=${#UNIQUE_BAMS[@]} # Check if the count of bams matches the length of the array ${UNIQUE_BAMS[@]} if [ "$bam_count" -ne "$array_length" ]; then echo "Error: Number of BAM files does not match the length of the array." exit 1 fi # Check if the count of tsv files matches the length of the array ${UNIQUE_BAMS[@]} if [ "$contact_count" -ne "$array_length" ]; then echo "Error: Number of hisat3n_dna.all_reads.3C.contact.tsv.gz files does not match the length of the array." exit 1 fi echo "Number of output files matches the length of the array." #################################### echo "recursively ls'sing cromwell root again" ls -lR ~{cromwell_root_dir} echo "Tar files." tar -cf - ~{cromwell_root_dir}/output_bams/*.matrix.txt | pigz > ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz tar -cf - *.hisat3n_dna.all_reads.name_sort.bam | pigz > ~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz # tar outputs of call_chromatin_contacts tar -cf - *.hisat3n_dna.all_reads.3C.contact.tsv.gz | pigz > ~{plate_id}.hisat3n_dna.all_reads.3C.contact.tar.gz tar -cf - *.hisat3n_dna.all_reads.dedup_contacts.tsv.gz | pigz > ~{plate_id}.hisat3n_dna.all_reads.dedup_contacts.tar.gz tar -cf - *.hisat3n_dna.all_reads.contact_stats.csv | pigz > ~{plate_id}.chromatin_contact_stats.tar.gz # tar outputs of allcools tar -cf - *.allc.tsv.gz | pigz > ~{plate_id}.allc.tsv.tar.gz tar -cf - *.allc.tsv.gz.tbi | pigz > ~{plate_id}.allc.tbi.tar.gz tar -cf - *.allc.tsv.gz.count.csv | pigz > ~{plate_id}.allc.count.tar.gz tar -cf - ~{cromwell_root_dir}/allc-${mcg_context}/*.gz | pigz > ~{plate_id}.extract-allc.tar.gz tar -cf - ~{cromwell_root_dir}/allc-${mcg_context}/*.tbi | pigz > ~{plate_id}.extract-allc_tbi.tar.gz >>> runtime { docker: docker disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" cpuPlatform: cpu_platform preemptible: preemptible_tries } output { File allc = "~{plate_id}.allc.tsv.tar.gz" File tbi = "~{plate_id}.allc.tbi.tar.gz" File all_reads_dedup_contacts = "~{plate_id}.hisat3n_dna.all_reads.dedup_contacts.tar.gz" File all_reads_3C_contacts = "~{plate_id}.hisat3n_dna.all_reads.3C.contact.tar.gz" File name_sorted_bam = "~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz" File dedup_stats_tar = "~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz" File chromatin_contact_stats = "~{plate_id}.chromatin_contact_stats.tar.gz" File allc_uniq_reads_stats = "~{plate_id}.allc.count.tar.gz" File extract_allc_output_tbi_tar = "~{plate_id}.extract-allc_tbi.tar.gz" File extract_allc_output_allc_tar = "~{plate_id}.extract-allc.tar.gz" } } task Summary_PerCellOutput { input { # to generate list of files per file type Array[File] name_sorted_bams Array[File] unique_reads_cgn_extraction_allc Array[File] unique_reads_cgn_extraction_tbi Array[File] all_reads_3C_contacts Array[File] unique_reads_cgn_extraction_allc_extract Array[File] unique_reads_cgn_extraction_tbi_extract String cromwell_root_dir String docker String plate_id Int disk_size = 1000 Int mem_size = 30 Int cpu = 16 } command <<< set -euo pipefail set -x # Set root_dir to current working directory root_dir=$(pwd) echo "This is the root directory " $root_dir extract_and_remove() { if [ $# -eq 0 ]; then echo "No files exist" return fi for tarred_file in "${@}"; do dir_name=`basename "${tarred_file%.tar.gz}"` echo $dir_name mkdir -p "$root_dir"/"$dir_name" pigz -dc "$tarred_file" | tar -xvf - -C "$root_dir"/"$dir_name" rm "$tarred_file" done } # output files at a cell level echo "Untar files needed at per cell level" extract_and_remove ~{sep=' ' name_sorted_bams} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_allc} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} extract_and_remove ~{sep=' ' all_reads_3C_contacts} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_allc_extract} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi_extract} >>> runtime { docker: docker disks: "local-disk ${disk_size} SSD" cpu: cpu memory: "${mem_size} GiB" } output { Array[File] name_sorted_bam_array = glob("~{plate_id}.hisat3n_dna.all_reads.name_sort/*") Array[File] unique_reads_cgn_extraction_allc_array = glob("~{plate_id}.allc.tsv/*") Array[File] unique_reads_cgn_extraction_tbi_array = glob("~{plate_id}.allc.tbi/*") Array[File] all_reads_3C_contacts_array = glob("~{plate_id}.hisat3n_dna.all_reads.3C.contact/*") Array[File] unique_reads_cgn_extraction_allc_extract_array = glob("~{plate_id}.extract-allc/~{cromwell_root_dir}/allc-CGN/*") Array[File] unique_reads_cgn_extraction_tbi_extract_array = glob("~{plate_id}.extract-allc_tbi/~{cromwell_root_dir}/allc-CGN/*") } } task Summary { input { Array[File] trimmed_stats Array[File] hisat3n_stats Array[File] r1_hisat3n_stats Array[File] r2_hisat3n_stats Array[File] dedup_stats Array[File] chromatin_contact_stats Array[File] allc_uniq_reads_stats Array[File] unique_reads_cgn_extraction_tbi String plate_id String cromwell_root_dir String cloud_provider String docker Int disk_size = 80 Int mem_size = 5 Int preemptible_tries = 3 Int cpu = 4 } command <<< set -euo pipefail WORKING_DIR=`pwd` if [ ~{cloud_provider} = "gcp" ]; then base_directory=~{cromwell_root_dir} matrix_files_dir="~{cromwell_root_dir}~{cromwell_root_dir}/output_bams" allc_index_dir="~{cromwell_root_dir}~{cromwell_root_dir}/allc-*" else base_directory=$WORKING_DIR matrix_files_dir="$WORKING_DIR~{cromwell_root_dir}/output_bams" allc_index_dir="$WORKING_DIR~{cromwell_root_dir}/allc-*" fi mkdir $base_directory/fastq mkdir $base_directory/bam mkdir $base_directory/allc mkdir $base_directory/hic extract_and_remove() { if [ $# -eq 0 ]; then echo "No files exist" return fi for tar in "${@}"; do tar -xvf "$tar" rm "$tar" done } extract_and_remove ~{sep=' ' trimmed_stats} extract_and_remove ~{sep=' ' hisat3n_stats} extract_and_remove ~{sep=' ' r1_hisat3n_stats} extract_and_remove ~{sep=' ' r2_hisat3n_stats} extract_and_remove ~{sep=' ' dedup_stats} extract_and_remove ~{sep=' ' chromatin_contact_stats} extract_and_remove ~{sep=' ' allc_uniq_reads_stats} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} mv *.trimmed.stats.txt $base_directory/fastq mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt $base_directory/bam mv $matrix_files_dir/*.hisat3n_dna.all_reads.deduped.matrix.txt $base_directory/bam mv *.hisat3n_dna.all_reads.contact_stats.csv $base_directory/hic mv *.allc.tsv.gz.count.csv $base_directory/allc mv $allc_index_dir/*.allc.tsv.gz.tbi $base_directory/allc python3 -c 'from cemba_data.hisat3n import *;snm3c_summary()' mv MappingSummary.csv.gz ~{plate_id}_MappingSummary.csv.gz >>> runtime { docker: docker disks: "local-disk ${disk_size} HDD" cpu: cpu memory: "${mem_size} GiB" preemptible: preemptible_tries } output { File mapping_summary = "~{plate_id}_MappingSummary.csv.gz" } }