version 1.0

# This script is under review. It is not actively tested or maintained at this time.

# Workflow: BeagleImputationValidation
#
# Steps, as wired below:
#   1. RunBeagleGtStats   - runs gt-stats.jar on the reference panel VCF.
#   2. SelectVariantType  - called twice (SNP, INDEL) to subset the truth and
#                           test VCFs with GATK SelectVariants.
#   3. RunBeagleImputedR2 - called twice (SNPs, INDELs) to run imputed-r2.jar on
#                           the gt-stats output and the matching truth/test subset.
#
# Outputs: the gt-stats TSV and one imputed-r2 TSV per variant type.
workflow BeagleImputationValidation {
    input {
        File ref_panel_vcf
        File ref_panel_vcf_index

        # NOTE(review): truth_vcf_index and test_vcf_index are accepted here but are
        # not passed to any call in this workflow; SelectVariantType produces fresh
        # indices for the subsetted VCFs. Kept so existing input JSONs stay valid.
        File truth_vcf
        File truth_vcf_index
        File test_vcf
        File test_vcf_index

        # Optional file forwarded to GATK's --sample-name argument.
        File? sample_names_to_select

        String output_basename
    }

    call RunBeagleGtStats {
        input:
            ref_panel_vcf = ref_panel_vcf,
            ref_panel_vcf_index = ref_panel_vcf_index,
            output_basename = output_basename
    }

    call SelectVariantType as SelectSnps {
        input:
            truth_vcf = truth_vcf,
            test_vcf = test_vcf,
            select_type_string = "SNP",
            sample_names_to_select = sample_names_to_select
    }

    call SelectVariantType as SelectIndels {
        input:
            truth_vcf = truth_vcf,
            test_vcf = test_vcf,
            select_type_string = "INDEL",
            sample_names_to_select = sample_names_to_select
    }

    call RunBeagleImputedR2 as RunBeagleImputedR2Snps {
        input:
            gt_stats = RunBeagleGtStats.gt_stats_output,
            truth_vcf = SelectSnps.truth_output_vcf,
            truth_vcf_index = SelectSnps.truth_output_vcf_index,
            test_vcf = SelectSnps.test_output_vcf,
            test_vcf_index = SelectSnps.test_output_vcf_index,
            output_basename = output_basename + ".SNPs"
    }

    call RunBeagleImputedR2 as RunBeagleImputedR2Indels {
        input:
            gt_stats = RunBeagleGtStats.gt_stats_output,
            truth_vcf = SelectIndels.truth_output_vcf,
            truth_vcf_index = SelectIndels.truth_output_vcf_index,
            test_vcf = SelectIndels.test_output_vcf,
            test_vcf_index = SelectIndels.test_output_vcf_index,
            output_basename = output_basename + ".INDELS"
    }

    output {
        File gt_stats_output = RunBeagleGtStats.gt_stats_output
        File imputed_r2_output_snps = RunBeagleImputedR2Snps.imputed_r2_output
        File imputed_r2_output_indels = RunBeagleImputedR2Indels.imputed_r2_output
    }
}

# Task: RunBeagleGtStats
#
# Decompresses the reference panel VCF with gunzip and pipes it into
# /beagle_jars/gt-stats.jar, capturing the jar's stdout as
# "<output_basename>_gt_stats.tsv".
#
# ref_panel_vcf_index is declared as an input but is not referenced by the command.
task RunBeagleGtStats {
    input {
        File ref_panel_vcf
        File ref_panel_vcf_index
        String output_basename

        Int disk_size_gb = ceil(size(ref_panel_vcf, "GiB")) + 20
        Int cpu = 16
        Int memory_mb = 96000
    }

    # JVM heap bounds are kept below the requested task memory:
    # initial heap = memory_mb - 6000, maximum heap = memory_mb - 4000.
    Int command_mem = memory_mb - 6000
    Int max_heap = memory_mb - 4000

    command {
        set -e -o pipefail

        gunzip -c ~{ref_panel_vcf} | java -Xms~{command_mem}m -Xmx~{max_heap}m -jar /beagle_jars/gt-stats.jar > ~{output_basename}_gt_stats.tsv
    }

    output {
        File gt_stats_output = "~{output_basename}_gt_stats.tsv"
    }

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/jsotobroad/beagle_validation:1.0.1"
        disks: "local-disk ${disk_size_gb} HDD"
        memory: "${memory_mb} MiB"
        cpu: cpu
    }
}

# Task: RunBeagleImputedR2
#
# Runs /beagle_jars/imputed-r2.jar with three positional arguments (the gt-stats
# file, the truth VCF, the test VCF) and captures stdout as
# "<output_basename>.imputed_stats.tsv".
#
# The two index inputs are declared but are not referenced by the command.
task RunBeagleImputedR2 {
    input {
        File gt_stats
        File truth_vcf
        File truth_vcf_index
        File test_vcf
        File test_vcf_index
        String output_basename

        Int disk_size_gb = ceil(size(truth_vcf, "GiB")) + ceil(size(test_vcf, "GiB")) + 20
        Int cpu = 16
        Int memory_mb = 100000
    }

    # JVM heap bounds: initial heap = memory_mb - 12000, maximum heap = memory_mb - 10000.
    Int command_mem = memory_mb - 12000
    Int max_heap = memory_mb - 10000

    command {
        set -e -o pipefail

        java -Xms~{command_mem}m -Xmx~{max_heap}m -jar /beagle_jars/imputed-r2.jar ~{gt_stats} \
            ~{truth_vcf} \
            ~{test_vcf} > ~{output_basename}.imputed_stats.tsv
    }

    output {
        File imputed_r2_output = "~{output_basename}.imputed_stats.tsv"
    }

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/jsotobroad/beagle_validation:1.0.1"
        disks: "local-disk ${disk_size_gb} HDD"
        memory: "${memory_mb} MiB"
        cpu: cpu
    }
}

# Task: SelectVariantType
#
# Runs GATK SelectVariants twice, once on the truth VCF and once on the test VCF,
# keeping only variants of `select_type_string` and, when provided, restricting to
# the samples given by `sample_names_to_select`. Both runs use --preserve-alleles.
#
# Outputs are "<basename>_<select_type_string>.vcf.gz" plus the matching ".tbi"
# for each of the two inputs, where <basename> is the input file name with a
# trailing ".vcf.gz" removed.
task SelectVariantType {
    input {
        File truth_vcf
        File test_vcf
        String select_type_string = "SNP"
        File? sample_names_to_select

        Int disk_size_gb = ceil(2 * (size(truth_vcf, "GiB") + size(test_vcf, "GiB"))) + 10
        Int cpu = 1
        Int memory_mb = 6000
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
    }

    # JVM heap bounds: initial heap = memory_mb - 1500, maximum heap = memory_mb - 1000.
    Int command_mem = memory_mb - 1500
    Int max_heap = memory_mb - 1000

    String truth_basename = basename(truth_vcf, ".vcf.gz")
    String test_basename_raw = basename(test_vcf, ".vcf.gz")

    # If the truth and test inputs share a file name (e.g. same name in different
    # directories or buckets), both SelectVariants runs would write to the same
    # path: the second run would overwrite the first and both outputs would point
    # at the test subset. Disambiguate only in that case so output names are
    # unchanged whenever the basenames already differ.
    String test_basename = if test_basename_raw == truth_basename then test_basename_raw + ".test" else test_basename_raw

    command {
        set -euo pipefail

        gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \
            SelectVariants \
            -select-type ~{select_type_string} \
            ~{"--sample-name " + sample_names_to_select} \
            --preserve-alleles \
            -V ~{truth_vcf} \
            -O ~{truth_basename}_~{select_type_string}.vcf.gz

        gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \
            SelectVariants \
            -select-type ~{select_type_string} \
            ~{"--sample-name " + sample_names_to_select} \
            --preserve-alleles \
            -V ~{test_vcf} \
            -O ~{test_basename}_~{select_type_string}.vcf.gz
    }

    runtime {
        docker: gatk_docker
        disks: "local-disk ${disk_size_gb} HDD"
        memory: "${memory_mb} MiB"
        cpu: cpu
    }

    # localization_optional lets the engine hand the original path to the command
    # instead of copying the file into the task's working directory.
    # NOTE(review): this relies on GATK being able to read that path directly - confirm
    # for the backends this workflow is run on.
    parameter_meta {
        truth_vcf: {
            description: "truth vcf",
            localization_optional: true
        }
        test_vcf: {
            description: "test vcf",
            localization_optional: true
        }
    }

    output {
        File truth_output_vcf = "~{truth_basename}_~{select_type_string}.vcf.gz"
        File truth_output_vcf_index = "~{truth_basename}_~{select_type_string}.vcf.gz.tbi"
        File test_output_vcf = "~{test_basename}_~{select_type_string}.vcf.gz"
        File test_output_vcf_index = "~{test_basename}_~{select_type_string}.vcf.gz.tbi"
    }
}