version 1.0 workflow CleanSVs { input { File input_vcf File? input_vcf_index String? output_name Int min_size = 50 # Minimum size of SV events Boolean split_MA = true Boolean add_annotations = true Boolean normalize_and_clean = true Boolean convert_to_abstract = true } if (split_MA) { call SplitMASites { input: input_vcf=input_vcf, input_vcf_index=input_vcf_index, output_name=output_name } } if (add_annotations) { call AddAnnotations { input: input_vcf=select_first([SplitMASites.output_vcf, input_vcf]), input_vcf_index=select_first([SplitMASites.output_vcf_index, input_vcf_index]), min_size=min_size, output_name=output_name } } if (normalize_and_clean) { call NormalizeAndClean { input: input_vcf=select_first([AddAnnotations.output_vcf, SplitMASites.output_vcf, input_vcf]), input_vcf_index=select_first([AddAnnotations.output_vcf_index, SplitMASites.output_vcf_index, input_vcf_index]), output_name=output_name } } if (convert_to_abstract) { call ConvertToAbstract { input: input_vcf=select_first([NormalizeAndClean.output_vcf, AddAnnotations.output_vcf, SplitMASites.output_vcf, input_vcf]), input_vcf_index=select_first([NormalizeAndClean.output_vcf_index, AddAnnotations.output_vcf_index, SplitMASites.output_vcf_index, input_vcf_index]), output_name=output_name } } output { File output_vcf = select_first([ConvertToAbstract.output_vcf, NormalizeAndClean.output_vcf, AddAnnotations.output_vcf, SplitMASites.output_vcf, input_vcf]) File output_vcf_index = select_first([ConvertToAbstract.output_vcf_index, NormalizeAndClean.output_vcf_index, AddAnnotations.output_vcf_index, SplitMASites.output_vcf_index, input_vcf_index]) } } # Splits MA sites into biallelics task SplitMASites { input { File input_vcf File? input_vcf_index String output_name = "split" String output_suffix = "-splitMA" File? restriction_bed # Runtime parameters Int disk_size = ceil(2 * size(input_vcf, "GB")) + 100 Int cpu = 4 Int memory_ram = 16 } command <<< set -xe if [ -z "~{input_vcf_index}" ]; then bcftools index -t ~{input_vcf} fi bcftools norm -m -any -o "~{output_name}~{output_suffix}.vcf.gz" ~{"-R " + restriction_bed} ~{input_vcf} bcftools index -t -f "~{output_name}~{output_suffix}.vcf.gz" >>> runtime { docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.0" disks: "local-disk " + disk_size + " HDD" memory: memory_ram + " GB" cpu: cpu } output { File output_vcf = "~{output_name}~{output_suffix}.vcf.gz" File output_vcf_index = "~{output_name}~{output_suffix}.vcf.gz.tbi" } } # Used to add SVTYPE, SVLEN, and END fields to VCF containing just large INDELs task AddAnnotations { input { File input_vcf File input_vcf_index String output_name = "annotated" String output_suffix = "-sv_annotated" Int min_size = 50 # Runtime parameters Int disk_size = ceil(2 * size(input_vcf, "GB")) + 100 Int cpu = 4 Int memory_ram = 16 } command <<< set -xe # Add SVTYPE and SVLEN fields truvari anno svinfo -m ~{min_size} -o truvari-annotated.vcf.gz ~{input_vcf} bcftools view -i 'INFO/SVTYPE!="."' -o truvari-SVs.vcf.gz truvari-annotated.vcf.gz bcftools index -t -f truvari-SVs.vcf.gz # Add END for INDELs using bcftools because pysam doesn't handle these well # See: https://github.com/pysam-developers/pysam/issues/1200 bcftools query -f'%CHROM\t%POS\t%ID\t%REF\t%ALT\t%SVLEN\t%SVTYPE\n' truvari-SVs.vcf.gz > query.tsv # Process updated END coordinate from POS and SVLEN python3 << CODE import pandas as pd import numpy as np df = pd.read_csv('query.tsv', sep='\t', names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVLEN', 'SVTYPE']) df['SVLEN'] = df['SVLEN'].replace('.', 0).astype(int).apply(np.abs) # df['DIST'] = df.apply(lambda x: 1 if x['SVTYPE'] == 'INS' else x['SVLEN'], axis=1) # df['END'] = df['POS'] + df['DIST'] df['END'] = df['POS'] + df['SVLEN'] df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'END', 'SVLEN']].to_csv('annotations.tsv', sep='\t', index=False, header=False) CODE # Add annotations using bcftools # Start by removing old SVLEN from annotations; see this issue: https://github.com/samtools/bcftools/issues/1998 bcftools annotate -x INFO/SVLEN truvari-SVs.vcf.gz -o truvari-SVs-noSVLEN.vcf.gz bgzip annotations.tsv tabix -s1 -b2 -e2 annotations.tsv.gz echo '##INFO=' > annotations.hdr echo '##INFO=' >> annotations.hdr bcftools annotate -a annotations.tsv.gz -h annotations.hdr -c CHROM,POS,~ID,REF,ALT,INFO/END,INFO/SVLEN -o "~{output_name}~{output_suffix}.vcf.gz" truvari-SVs-noSVLEN.vcf.gz bcftools index -t -f "~{output_name}~{output_suffix}.vcf.gz" >>> runtime { docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.0" disks: "local-disk " + disk_size + " HDD" memory: memory_ram + " GB" cpu: cpu } output { File output_vcf = "~{output_name}~{output_suffix}.vcf.gz" File output_vcf_index = "~{output_name}~{output_suffix}.vcf.gz.tbi" } } # Removes SVs other than INS, DEL, and DUP, i.e. no BND, UNK, etc. and any less than min length # Also converts '.' FILTER to 'PASS' which is needed for Witty.er evaluation task NormalizeAndClean { input { File input_vcf File input_vcf_index String output_name = "output" String output_suffix = "-normalized" Int min_size = 50 Boolean restrict_sv_types = true Array[String] sv_types = ['DEL', 'INS', 'DUP'] # Restrict to just given types Boolean normalize_missing_filter_to_pass = true # Convert . FILTER to PASS Boolean strip_filters = false # Remove any filters regardless of . or not and replace with PASS # Runtime parameters Int disk_size = ceil(2 * size(input_vcf, "GB")) + 100 Int cpu = 4 Int memory_ram = 16 } command <<< set -xe python3 << CODE import pysam import numpy as np with pysam.VariantFile("~{input_vcf}") as vcf: with pysam.VariantFile("~{output_name}~{output_suffix}.vcf.gz", "w", header=vcf.header) as output_vcf: for record in vcf: # If SVTYPE has Number=A in header, record.info["SVTYPE"] would be a tuple if isinstance(record.info['SVTYPE'], tuple): svtype = record.info['SVTYPE'][0] # Multiallelics already split, so just one entry else: svtype = record.info['SVTYPE'] # Should be string in this case if (('~{restrict_sv_types}' == 'true') and (svtype in ["~{sep="\", \"" sv_types}"]) or ('~{restrict_sv_types}' == 'false')): # Normalize FILTER to be PASS rather than missing if ((len(record.filter.values()) == 0) and ('~{normalize_missing_filter_to_pass}' == 'true')) or ('~{strip_filters}' == 'true'): record.filter.clear() record.filter.add('PASS') ref = record.alleles[0] alt = record.alleles[1] if (np.abs(len(ref) - len(alt)) >= ~{min_size}) or (record.info['SVLEN'] >= ~{min_size}): if record.info['SVLEN'] > 0: # sniffles actually will write variants with abstract alleles SVLEN = 0, which causes problem in Wittyer... output_vcf.write(record) CODE bcftools index -t -f "~{output_name}~{output_suffix}.vcf.gz" >>> runtime { docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.0" disks: "local-disk " + disk_size + " HDD" memory: memory_ram + " GB" cpu: cpu } output { File output_vcf = "~{output_name}~{output_suffix}.vcf.gz" File output_vcf_index = "~{output_name}~{output_suffix}.vcf.gz.tbi" } } # Replaces explicit INDEL sequences with abstractions, e.g. or task ConvertToAbstract { input { File input_vcf File input_vcf_index String output_name = "converted" String output_suffix = "-converted_abs" # Runtime parameters Int disk_size = ceil(2 * size(input_vcf, "GB")) + 100 Int cpu = 4 Int memory_ram = 16 } command <<< set -xe python3 << CODE import pysam import numpy as np with pysam.VariantFile("~{input_vcf}") as vcf: with pysam.VariantFile("~{output_name}~{output_suffix}.vcf.gz", "w", header=vcf.header) as output_vcf: for record in vcf: ## Remove alt sequence to use abstract alleles; assumes split MA sites already ref = record.alleles[0] alt = record.alleles[1] # Check relative difference in size meets size threshold or already abstract # Also convert BND notation using ] or [ to abstract allele if (alt[0] == '<') or ('[' in alt) or (']' in alt): # If SVTYPE has Number=A in header, record.info["SVTYPE"] would be a tuple if isinstance(record.info['SVTYPE'], tuple): svtype = record.info['SVTYPE'][0] # Multiallelics already split, so just one entry else: svtype = record.info['SVTYPE'] # Should be string in this case # Replace REF with its first char, and ALT with abstraction # If SVTYPE is UNK, try to infer from alt alleles if svtype == "UNK": allele_type = alt.split('<')[-1].split('>')[0] record.info["SVTYPE"] = allele_type else: new_alt = f'<{svtype}>' if alt[0] != '<' else alt record.alleles = (record.alleles[0][0].upper(), new_alt) # upper since Wittyer dislikes lowercase ref... output_vcf.write(record) CODE bcftools index -t -f "~{output_name}~{output_suffix}.vcf.gz" >>> runtime { docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.0" disks: "local-disk " + disk_size + " HDD" memory: memory_ram + " GB" cpu: cpu } output { File output_vcf = "~{output_name}~{output_suffix}.vcf.gz" File output_vcf_index = "~{output_name}~{output_suffix}.vcf.gz.tbi" } }