version 1.0 # Exactly one of input_cram and input_bam should be supplied to this workflow. If an input_cram is supplied, a ref_fasta # and ref_fasta_index must also be supplied. The ref_fasta and ref_fasta_index are used to generate a bam, so if an # input_cram is not supplied, the input_bam is used instead and the ref_fasta and ref_fasta_index are not needed. # If the output_map file is provided, it is expected to be a tab-separated file containing a list of all the read group ids # found in the input_cram / input_bam and the desired name of the unmapped bams generated for each. # If the file is not provided, the output names of the unmapped bams will be the read_group_id workflow CramToUnmappedBams { String pipeline_version = "1.1.3" input { File? input_cram File? input_bam File? ref_fasta File? ref_fasta_index File? output_map String base_file_name String unmapped_bam_suffix = ".unmapped.bam" Int additional_disk = 20 } if (defined(input_cram)) { Float cram_size = size(input_cram, "GiB") String bam_from_cram_name = basename(input_cram_path, ".cram") String input_cram_path = select_first([input_cram]) call CramToBam { input: ref_fasta = select_first([ref_fasta]), ref_fasta_index = select_first([ref_fasta_index]), cram_file = select_first([input_cram]), output_basename = bam_from_cram_name, disk_size = ceil(cram_size * 6) + additional_disk } } File input_file = select_first([CramToBam.output_bam, input_bam]) Float input_size = size(input_file, "GiB") if (!defined(output_map)) { call GenerateOutputMap { input: input_bam = input_file, unmapped_bam_suffix = unmapped_bam_suffix, disk_size = ceil(input_size) + additional_disk } } call SplitUpOutputMapFile { input: read_group_map_file = select_first([output_map, GenerateOutputMap.output_map]) } scatter (rg_map_file in SplitUpOutputMapFile.rg_to_ubam_file) { call SplitOutUbamByReadGroup { input: input_bam = input_file, rg_to_ubam_file = rg_map_file, disk_size = ceil(input_size * 2) + additional_disk } String unmapped_bam_filename = basename(SplitOutUbamByReadGroup.output_bam) call RevertSam { input: input_bam = SplitOutUbamByReadGroup.output_bam, output_bam_filename = unmapped_bam_filename, disk_size = ceil(input_size * 3) + additional_disk } call SortSam { input: input_bam = RevertSam.output_bam, output_bam_filename = unmapped_bam_filename } Float unmapped_bam_size = size(SortSam.output_bam, "GiB") call ValidateSamFile { input: input_bam = SortSam.output_bam, report_filename = unmapped_bam_filename + ".validation_report", disk_size = ceil(unmapped_bam_size) + additional_disk } } output { Array[File] validation_report = ValidateSamFile.report Array[File] unmapped_bams = SortSam.output_bam } meta { allowNestedInputs: true } } task RevertSam { input { File input_bam String output_bam_filename Int disk_size Int memory_in_MiB = 3000 } Int java_mem = memory_in_MiB - 1000 Int max_heap = memory_in_MiB - 500 command <<< java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ RevertSam \ --INPUT ~{input_bam} \ --OUTPUT ~{output_bam_filename} \ --VALIDATION_STRINGENCY LENIENT \ --ATTRIBUTE_TO_CLEAR FT \ --ATTRIBUTE_TO_CLEAR CO \ --ATTRIBUTE_TO_CLEAR PA \ --ATTRIBUTE_TO_CLEAR OA \ --ATTRIBUTE_TO_CLEAR XA \ --SORT_ORDER coordinate >>> runtime { docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" preemptible: 3 } output { File output_bam = output_bam_filename } } # This task is slower than converting straight from cram to bam (note we stream through sam format in between cram and bam) # This is currently necessary due to a difference in the way the NM tag is calculated in order to produce a valid bam. task CramToBam { input { File ref_fasta File ref_fasta_index File cram_file String output_basename Int disk_size Int memory_in_MiB = 7000 } command <<< set -e set -o pipefail samtools view -h -T ~{ref_fasta} ~{cram_file} | samtools view -b -o ~{output_basename}.bam - samtools index -b ~{output_basename}.bam mv ~{output_basename}.bam.bai ~{output_basename}.bai >>> runtime { docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" cpu: 3 memory: "~{memory_in_MiB} MiB" disks: "local-disk " + disk_size + " HDD" preemptible: 3 } output { File output_bam = "~{output_basename}.bam" File output_bam_index = "~{output_basename}.bai" } } task GenerateOutputMap { input { File input_bam String unmapped_bam_suffix Int disk_size Int memory_in_MiB = 3000 } command <<< set -e samtools view -H ~{input_bam} | grep '^@RG' | cut -f2 | sed s/ID:// > readgroups.txt echo -e "#READ_GROUP_ID\tOUTPUT" > output_map.tsv for rg in `cat readgroups.txt`; do echo -e "$rg\t$rg~{unmapped_bam_suffix}" >> output_map.tsv done >>> runtime { docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" preemptible: 3 } output { File output_map = "output_map.tsv" } } task SplitUpOutputMapFile { input { File read_group_map_file Int disk_size = 10 Int memory_in_MiB = 3000 } command <<< mkdir rgtemp cd rgtemp # splits list of mappings into single files. One line each. grep -v '^#' ~{read_group_map_file} | split -l 1 - rg_to_ubam_ >>> runtime { docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" } output { Array[File] rg_to_ubam_file = glob("rgtemp/rg_to_ubam_*") } } task SplitOutUbamByReadGroup { input { File input_bam File rg_to_ubam_file Int disk_size Int memory_in_MiB = 30000 } Array[Array[String]] tmp = read_tsv(rg_to_ubam_file) command <<< echo "Read Group ~{tmp[0][0]} from ~{input_bam} is being written to ~{tmp[0][1]}" samtools view -b -h -r ~{tmp[0][0]} -o ~{tmp[0][1]} ~{input_bam} >>> output { File output_bam = tmp[0][1] } runtime { docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" cpu: 2 disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" preemptible: 3 } } task ValidateSamFile { input { File input_bam String report_filename Int disk_size Int memory_in_MiB = 3000 } Int java_mem = memory_in_MiB - 1000 Int max_heap = memory_in_MiB - 500 command <<< java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ ValidateSamFile \ --INPUT ~{input_bam} \ --OUTPUT ~{report_filename} \ --MODE VERBOSE \ --IS_BISULFITE_SEQUENCED false >>> runtime { docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" preemptible: 3 } output { File report = "~{report_filename}" } } task SortSam { input { File input_bam String output_bam_filename Int memory_in_MiB = 7000 Float sort_sam_disk_multiplier = 6 } # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 Int java_mem = memory_in_MiB - 1000 Int max_heap = memory_in_MiB - 500 command <<< java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ SortSam \ --INPUT ~{input_bam} \ --OUTPUT ~{output_bam_filename} \ --SORT_ORDER queryname \ --MAX_RECORDS_IN_RAM 300000 >>> runtime { docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" disks: "local-disk " + disk_size + " HDD" memory: "~{memory_in_MiB} MiB" preemptible: 3 } output { File output_bam = output_bam_filename } }