version 1.0 ## Copyright Broad Institute, 2018 ## ## This WDL pipeline implements data pre-processing and initial variant calling (GVCF ## generation) according to the GATK Best Practices (June 2016) for germline SNP and ## Indel discovery in human whole-genome data. ## ## Requirements/expectations : ## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format ## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) ## - Input uBAM files must additionally comply with the following requirements: ## - - filenames all have the same suffix (we use ".unmapped.bam") ## - - files must pass validation by ValidateSamFile ## - - reads are provided in query-sorted order ## - - all reads must have an RG tag ## - GVCF output names must end in ".g.vcf.gz" ## - Reference genome must be Hg38 with ALT contigs ## ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. ## For program versions, see docker containers. ## ## LICENSING : ## This script is released under the WDL source code license (BSD-3) (see LICENSE in ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may ## be subject to different licenses. Users are responsible for checking that they are ## authorized to run all programs before running this script. Please see the docker ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed ## licensing information pertaining to the included programs. import "../../../../../../tasks/wdl/UnmappedBamToAlignedBam.wdl" as ToBam import "../../../../../../tasks/wdl/AggregatedBamQC.wdl" as AggregatedQC import "../../../../../../tasks/wdl/Qc.wdl" as QC import "../../../../../../tasks/wdl/BamToCram.wdl" as ToCram import "../../../../../../tasks/wdl/Utilities.wdl" as Utilities import "../../variant_calling/VariantCalling.wdl" as ToGvcf import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" # WORKFLOW DEFINITION workflow WholeGenomeGermlineSingleSample { String pipeline_version = "3.3.7" input { SampleAndUnmappedBams sample_and_unmapped_bams DNASeqSingleSampleReferences references DragmapReference? dragmap_reference VariantCallingScatterSettings scatter_settings PapiSettings papi_settings File? fingerprint_genotypes_file File? fingerprint_genotypes_index File wgs_coverage_interval_list Boolean provide_bam_output = false Boolean use_gatk3_haplotype_caller = true Boolean dragen_functional_equivalence_mode = false Boolean dragen_maximum_quality_mode = false Boolean run_dragen_mode_variant_calling = false Boolean use_spanning_event_genotyping = true Boolean unmap_contaminant_reads = true Boolean perform_bqsr = true Boolean use_bwa_mem = true Boolean allow_empty_ref_alt = false Boolean use_dragen_hard_filtering = false String cloud_provider } if (dragen_functional_equivalence_mode && dragen_maximum_quality_mode) { call Utilities.ErrorWithMessage as PresetArgumentsError { input: message = "Both dragen_functional_equivalence_mode and dragen_maximum_quality_mode have been set to true, however, they are mutually exclusive. You can set either of them to true, or set them both to false and adjust the arguments individually." } } if (run_dragen_mode_variant_calling && use_gatk3_haplotype_caller) { call Utilities.ErrorWithMessage as DragenModeVariantCallingAndGATK3Error { input: message = "DRAGEN mode variant calling has been activated, however, the HaplotypeCaller version has been set to use GATK 3. Please set use_gatk3_haplotype_caller to false to use DRAGEN mode variant calling." } } # Set DRAGEN-related arguments according to the preset arguments Boolean run_dragen_mode_variant_calling_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else run_dragen_mode_variant_calling Boolean use_spanning_event_genotyping_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else use_spanning_event_genotyping) Boolean unmap_contaminant_reads_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else unmap_contaminant_reads) Boolean perform_bqsr_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else perform_bqsr Boolean use_bwa_mem_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_bwa_mem Boolean use_gatk3_haplotype_caller_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_gatk3_haplotype_caller Boolean use_dragen_hard_filtering_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else use_dragen_hard_filtering # Not overridable: Float lod_threshold = -20.0 String cross_check_fingerprints_by = "READGROUP" String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" String final_gvcf_base_name = select_first([sample_and_unmapped_bams.final_gvcf_base_name, sample_and_unmapped_bams.base_file_name]) call ToBam.UnmappedBamToAlignedBam { input: sample_and_unmapped_bams = sample_and_unmapped_bams, references = references, dragmap_reference = dragmap_reference, papi_settings = papi_settings, contamination_sites_ud = references.contamination_sites_ud, contamination_sites_bed = references.contamination_sites_bed, contamination_sites_mu = references.contamination_sites_mu, cross_check_fingerprints_by = cross_check_fingerprints_by, haplotype_database_file = references.haplotype_database_file, lod_threshold = lod_threshold, recalibrated_bam_basename = recalibrated_bam_basename, perform_bqsr = perform_bqsr_, use_bwa_mem = use_bwa_mem_, unmap_contaminant_reads = unmap_contaminant_reads_, allow_empty_ref_alt = allow_empty_ref_alt } call AggregatedQC.AggregatedBamQC { input: base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, base_name = sample_and_unmapped_bams.base_file_name, sample_name = sample_and_unmapped_bams.sample_name, recalibrated_bam_base_name = recalibrated_bam_basename, haplotype_database_file = references.haplotype_database_file, references = references, fingerprint_genotypes_file = fingerprint_genotypes_file, fingerprint_genotypes_index = fingerprint_genotypes_index, papi_settings = papi_settings } call ToCram.BamToCram as BamToCram { input: input_bam = UnmappedBamToAlignedBam.output_bam, ref_fasta = references.reference_fasta.ref_fasta, ref_fasta_index = references.reference_fasta.ref_fasta_index, ref_dict = references.reference_fasta.ref_dict, duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, base_file_name = sample_and_unmapped_bams.base_file_name, agg_preemptible_tries = papi_settings.agg_preemptible_tries } # QC the sample WGS metrics (stringent thresholds) call QC.CollectWgsMetrics as CollectWgsMetrics { input: input_bam = UnmappedBamToAlignedBam.output_bam, input_bam_index = UnmappedBamToAlignedBam.output_bam_index, metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", ref_fasta = references.reference_fasta.ref_fasta, ref_fasta_index = references.reference_fasta.ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, preemptible_tries = papi_settings.agg_preemptible_tries } # QC the sample raw WGS metrics (common thresholds) call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { input: input_bam = UnmappedBamToAlignedBam.output_bam, input_bam_index = UnmappedBamToAlignedBam.output_bam_index, metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", ref_fasta = references.reference_fasta.ref_fasta, ref_fasta_index = references.reference_fasta.ref_fasta_index, wgs_coverage_interval_list = wgs_coverage_interval_list, preemptible_tries = papi_settings.agg_preemptible_tries } call ToGvcf.VariantCalling as BamToGvcf { input: run_dragen_mode_variant_calling = run_dragen_mode_variant_calling_, use_spanning_event_genotyping = use_spanning_event_genotyping_, calling_interval_list = references.calling_interval_list, evaluation_interval_list = references.evaluation_interval_list, haplotype_scatter_count = scatter_settings.haplotype_scatter_count, break_bands_at_multiples_of = scatter_settings.break_bands_at_multiples_of, contamination = UnmappedBamToAlignedBam.contamination, input_bam = UnmappedBamToAlignedBam.output_bam, input_bam_index = UnmappedBamToAlignedBam.output_bam_index, ref_fasta = references.reference_fasta.ref_fasta, ref_fasta_index = references.reference_fasta.ref_fasta_index, ref_dict = references.reference_fasta.ref_dict, ref_str = references.reference_fasta.ref_str, dbsnp_vcf = references.dbsnp_vcf, dbsnp_vcf_index = references.dbsnp_vcf_index, base_file_name = sample_and_unmapped_bams.base_file_name, final_vcf_base_name = final_gvcf_base_name, agg_preemptible_tries = papi_settings.agg_preemptible_tries, use_gatk3_haplotype_caller = use_gatk3_haplotype_caller_, use_dragen_hard_filtering = use_dragen_hard_filtering_, cloud_provider = cloud_provider } if (provide_bam_output) { File provided_output_bam = UnmappedBamToAlignedBam.output_bam File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index } # Outputs that will be retained when execution is complete output { Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics File selfSM = UnmappedBamToAlignedBam.selfSM Float contamination = UnmappedBamToAlignedBam.contamination File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics File wgs_metrics = CollectWgsMetrics.metrics File raw_wgs_metrics = CollectRawWgsMetrics.metrics File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics File? output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics File? output_bam = provided_output_bam File? output_bam_index = provided_output_bam_index File output_cram = BamToCram.output_cram File output_cram_index = BamToCram.output_cram_index File output_cram_md5 = BamToCram.output_cram_md5 File validate_cram_file_report = BamToCram.validate_cram_file_report File output_vcf = BamToGvcf.output_vcf File output_vcf_index = BamToGvcf.output_vcf_index } meta { allowNestedInputs: true } }