version 1.0

import "https://raw.githubusercontent.com/broadinstitute/palantir-workflows/main/Utilities/WDLs/CreateIGVSession.wdl" as IGV
import "tasks.wdl" as SV_tasks

struct RuntimeAttributes {
    Int disk_size
    Int cpu
    Int memory
}

workflow BenchmarkSVs {
    input {
        File base_vcf
        File base_vcf_index
        String base_sample_name

        File comp_vcf
        File comp_vcf_index
        String comp_sample_name

        String? experiment

        File ref_fasta
        File ref_fai

        File? evaluation_bed
        Float? evaluation_pct    # Defaults to checking at least one base overlap evaluation_bed; between 0 and 1

        Array[File] bed_regions = []
        Array[String] bed_labels = []
        Int breakpoint_padding = 20

        Array[Int] svlen_bin_cutoffs = [100, 250, 1000, 2500, 10000, 25000]

        Boolean collect_closest_stats = true
        Boolean create_igv_session = true
        Array[File]? optional_igv_bams
    }

    # Subset to evaluation regions for remainder of analysis
    if (defined(evaluation_bed)) {
        call SV_tasks.SubsetEvaluation as SubsetComp {
            input:
                input_vcf=comp_vcf,
                input_vcf_index=comp_vcf_index,
                evaluation_bed=select_first([evaluation_bed]),
                evaluation_pct=evaluation_pct
        }

        if (defined(base_vcf)) {
            call SV_tasks.SubsetEvaluation as SubsetBase {
                input:
                    input_vcf=select_first([base_vcf]),
                    input_vcf_index=select_first([base_vcf_index]),
                    evaluation_bed=select_first([evaluation_bed]),
                    evaluation_pct=evaluation_pct
            }
        }
    }

    # Replace input files with subset versions
    File subset_comp_vcf = select_first([SubsetComp.output_vcf, comp_vcf])
    File subset_comp_vcf_index = select_first([SubsetComp.output_vcf_index, comp_vcf_index])
    File subset_base_vcf = select_first([SubsetBase.output_vcf, base_vcf])
    File subset_base_vcf_index = select_first([SubsetBase.output_vcf_index, base_vcf_index])

    # Benchmarking - Truvari tasks
    ## Run Truvari over each sample name pair
    call RunTruvari {
        input:
            base_vcf=subset_base_vcf,
            base_vcf_index=subset_base_vcf_index,
            base_sample_name=base_sample_name,
            comp_vcf=subset_comp_vcf,
            comp_vcf_index=subset_comp_vcf_index,
            comp_sample_name=comp_sample_name,
            experiment=experiment,
            ref_fasta=ref_fasta,
            ref_fai=ref_fai
    }

    call SV_tasks.AddIntervalOverlapStats as TruvariTpBaseIntervals {
        input:
            input_table=RunTruvari.tp_base_table,
            table_label="truvari_bench",
            output_name=base_sample_name+"-tp_base_intervals",
            bed_regions=bed_regions,
            bed_labels=bed_labels,
            ref_fai=ref_fai,
            breakpoint_padding=breakpoint_padding
    }

    call SV_tasks.AddIntervalOverlapStats as TruvariTpCompIntervals {
        input:
            input_table=RunTruvari.tp_comp_table,
            table_label="truvari_bench",
            output_name=comp_sample_name+"-tp_comp_intervals",
            bed_regions=bed_regions,
            bed_labels=bed_labels,
            ref_fai=ref_fai,
            breakpoint_padding=breakpoint_padding
    }

    call SV_tasks.AddIntervalOverlapStats as TruvariFpIntervals {
        input:
            input_table=RunTruvari.fp_table,
            table_label="truvari_bench",
            output_name=comp_sample_name+"-fp_intervals",
            bed_regions=bed_regions,
            bed_labels=bed_labels,
            ref_fai=ref_fai,
            breakpoint_padding=breakpoint_padding
    }

    call SV_tasks.AddIntervalOverlapStats as TruvariFnIntervals {
        input:
            input_table=RunTruvari.fn_table,
            table_label="truvari_bench",
            output_name=base_sample_name+"-fn_intervals",
            bed_regions=bed_regions,
            bed_labels=bed_labels,
            ref_fai=ref_fai,
            breakpoint_padding=breakpoint_padding
    }

    call ComputeTruvariIntervalSummaryStats {
        input:
            base_sample_name=base_sample_name,
            comp_sample_name=comp_sample_name,
            tp_base_intervals=TruvariTpBaseIntervals.output_table,
            tp_comp_intervals=TruvariTpCompIntervals.output_table,
            fp_intervals=TruvariFpIntervals.output_table,
            fn_intervals=TruvariFnIntervals.output_table,
            bed_labels=bed_labels,
            experiment=experiment,
            svlen_bin_cutoffs=svlen_bin_cutoffs
    }

    if (collect_closest_stats) {
        call CollectTruvariClosestStats {
            input:
                base_table=RunTruvari.base_table,
                comp_table=RunTruvari.comp_table,
                fp_table=RunTruvari.fp_table,
                fn_table=RunTruvari.fn_table,
                base_sample_name=base_sample_name,
                comp_sample_name=comp_sample_name,
                experiment=experiment,
                ref_fai=ref_fai
        }
    }

    if (create_igv_session) {
        call IGV.CreateIGVSession as IGVSession {
            input:
                bams=optional_igv_bams,
                vcfs=[RunTruvari.tp_base, RunTruvari.tp_comp, RunTruvari.fp, RunTruvari.fn],
                interval_files=bed_regions,
                reference=ref_fasta
        }
    }

    output {
        # Truvari outputs
        File truvari_bench_summary = ComputeTruvariIntervalSummaryStats.full_summary
        File? truvari_fp_closest = CollectTruvariClosestStats.fp_closest
        File? truvari_fn_closest = CollectTruvariClosestStats.fn_closest

        File truvari_tp_base = RunTruvari.tp_base
        File truvari_tp_base_index = RunTruvari.tp_base_index
        File truvari_fn = RunTruvari.fn
        File truvari_fn_index = RunTruvari.fn_index
        File truvari_tp_comp = RunTruvari.tp_comp
        File truvari_tp_comp_index = RunTruvari.tp_comp_index
        File truvari_fp = RunTruvari.fp
        File truvari_fp_index = RunTruvari.fp_index

        # IGV Session
        File? igv_session = IGVSession.igv_session
    }
}

task RunTruvari {
    input {
        File base_vcf
        File base_vcf_index
        String? base_sample_name

        File comp_vcf
        File comp_vcf_index
        String? comp_sample_name

        String experiment = "Experiment"

        File ref_fasta
        File ref_fai

        # Tool arguments
        ## Comparison Threshold
        Int ref_dist = 500
        Float pct_seq = 0.7
        Int min_hap_len = 50
        Float pct_size = 0.7
        Float pct_overlap = 0.0
        Boolean type_ignore = false    # Toggle true to ignore matching SV types for TP
        Boolean dup_to_ins = false    # Toggle true to treat DUP as INS
        Int chunk_size = 1000    # Max ref distance to compare calls
        String num_matches = "ac"   # Num times a variant can match; single, ac, or multi

        ## Filtering
        Int call_size_min = 50
        Int base_size_min = 50
        Int size_max = 50000
        Boolean pass_only = true
        Boolean replace_missing_filter_pass = true
        Array[String] drop_fields = ["FMT/AD"]    # For mismatching headers in refine step

        ## Refine args
        Boolean harmonize_phased_variants = false    # Use truvari refine on phased inputs
        String aligner = "mafft"

        Boolean debug_mode = false

        # Runtime parameters
        RuntimeAttributes runtimeAttributes = {"disk_size": ceil(2 * size(base_vcf, "GB") + 2 * size(comp_vcf, "GB")) + 100, "cpu": 8, "memory": 16}
    }

    command <<<
        set -xueo pipefail

        # Subset to sites for given sample names since Truvari flags don't seem to work right
        if [ -z "~{base_sample_name}~{comp_sample_name}" ]
        then
            mv ~{base_vcf} base-mac1.vcf.gz
            mv ~{base_vcf_index} base-mac1.vcf.gz.tbi
            mv ~{comp_vcf} comp-mac1.vcf.gz
            mv ~{comp_vcf_index} comp-mac1.vcf.gz.tbi
        else
            bcftools view ~{"-s " + base_sample_name} --min-ac 1 -o base-mac1.vcf.gz -Wtbi ~{base_vcf}
            bcftools view ~{"-s " + comp_sample_name} --min-ac 1 -o comp-mac1.vcf.gz -Wtbi ~{comp_vcf}
        fi

        # Split MA sites since truvari doesn't allow
        bcftools norm -m -any -o base-splitMA.vcf.gz -Wtbi base-mac1.vcf.gz
        bcftools norm -m -any -o comp-splitMA.vcf.gz -Wtbi comp-mac1.vcf.gz

        # Drop some fields that can be problematic due to inconsistent headers but irrelevant for comparison
        bcftools annotate -x ~{sep="," drop_fields} -o base.vcf.gz -Wtbi base-splitMA.vcf.gz
        bcftools annotate -x ~{sep="," drop_fields} -o comp.vcf.gz -Wtbi comp-splitMA.vcf.gz

        # Run Truvari for benchmarking
        truvari bench \
            -b base.vcf.gz \
            ~{"--bSample " + base_sample_name} \
            -c comp.vcf.gz \
            ~{"--cSample " + comp_sample_name} \
            -o "output_dir" \
            ~{true="--debug" false="" debug_mode} \
            ~{"--refdist " + ref_dist} \
            ~{"--pctseq " + pct_seq} \
            ~{"--minhaplen " + min_hap_len} \
            ~{"--pctsize " + pct_size} \
            ~{"--pctovl " + pct_overlap} \
            ~{true="--typeignore " false="" type_ignore} \
            ~{true="--dup-to-ins " false="" dup_to_ins} \
            ~{"--chunksize " + chunk_size} \
            ~{"--sizemin " + call_size_min} \
            ~{"--sizefilt " + base_size_min} \
            ~{"--sizemax " + size_max} \
            ~{true="--passonly " false="" pass_only} \
            ~{"--pick " + num_matches}

        # Use truvari refine to harmonize phased variants if requested
        RESULTS_STEM="output_dir/"
        if [ ~{harmonize_phased_variants} = true ]; then
            truvari refine \
                --reference ~{ref_fasta} \
                --regions output_dir/candidate.refine.bed \
                --recount \
                --use-region-coords \
                --use-original-vcfs \
                --align ~{aligner} \
                --threads ~{runtimeAttributes.cpu} \
                output_dir

            truvari ga4gh \
                --input output_dir \
                --output output_dir/combined_result \
                --with-refine

            # Annotate the refined VCFs
            truvari anno svinfo output_dir/combined_result_truth.vcf.gz -o output_dir/anno-combined_result_truth.vcf.gz
            bcftools index -t output_dir/anno-combined_result_truth.vcf.gz
            truvari anno svinfo output_dir/combined_result_query.vcf.gz -o output_dir/anno-combined_result_query.vcf.gz
            bcftools index -t output_dir/anno-combined_result_query.vcf.gz

            bcftools view -i 'FMT/BD="TP"' output_dir/anno-combined_result_truth.vcf.gz -o output_dir/anno-comb-tp-base.vcf.gz -Wtbi
            bcftools view -i 'FMT/BD="FN"' output_dir/anno-combined_result_truth.vcf.gz -o output_dir/anno-comb-fn.vcf.gz -Wtbi
            bcftools view -i 'FMT/BD="TP"' output_dir/anno-combined_result_query.vcf.gz -o output_dir/anno-comb-tp-comp.vcf.gz -Wtbi
            bcftools view -i 'FMT/BD="FP"' output_dir/anno-combined_result_query.vcf.gz -o output_dir/anno-comb-fp.vcf.gz -Wtbi

            RESULTS_STEM="output_dir/anno-comb-"
        fi

        # Use Python to collect the output results and compile across intervals into one table
        python3 << CODE
        import pandas as pd
        import json

        output_dir = 'output_dir' if "~{harmonize_phased_variants}" == "false" else 'output_dir/phab_bench'
        with open(f'{output_dir}/summary.json') as file:
            json_file = json.load(file)

        df = pd.DataFrame({k: v for k,v in json_file.items() if k != 'gt_matrix'}, index=[0])
        df['Base_Name'] = "~{default="" base_sample_name}"
        df['Comp_Name'] = "~{default="" comp_sample_name}"
        df['Experiment'] = "~{experiment}"
        df['Interval'] = "WholeGenome"

        df.to_csv("TruvariSummary.tsv", sep='\t', index=False)

        CODE

        # Create tables for making closest data and for interval overlap stats
        # Transform VCFs into bed files for more convenient analysis
        # Use POS0/END0 for coherence with bed coordinates later
        echo -e "CHROM\tPOS\tEND\tSVLEN\tSVTYPE\tFILTER" > input_header.txt
        INPUT_QUERY="%CHROM\t%POS0\t%END0\t%SVLEN\t%SVTYPE\t%FILTER\n"

        bcftools query -i 'INFO/SVTYPE!="."' -f"${INPUT_QUERY}" base.vcf.gz | bedtools sort -i - > base-preheader.bed
        cat input_header.txt base-preheader.bed > base.bed

        bcftools query -i 'INFO/SVTYPE!="."' -f"${INPUT_QUERY}" comp.vcf.gz | bedtools sort -i - > comp-preheader.bed
        cat input_header.txt comp-preheader.bed > comp.bed

        echo -e "CHROM\tPOS\tEND\tSVLEN\tSVTYPE\tFILTER\tGTMATCH\tGT" > truvari_header.txt
        MAIN_QUERY="%CHROM\t%POS0\t%END0\t%SVLEN\t%SVTYPE\t%FILTER\t%GTMatch\t[%GT]\n"

        bcftools query -i 'INFO/SVTYPE!="."' -f"${MAIN_QUERY}" "${RESULTS_STEM}tp-base.vcf.gz" | bedtools sort -i - > tp-base-preheader.bed
        cat truvari_header.txt tp-base-preheader.bed > tp-base.bed

        bcftools query -i 'INFO/SVTYPE!="."' -f"${MAIN_QUERY}" "${RESULTS_STEM}fn.vcf.gz" | bedtools sort -i - > fn-preheader.bed
        cat truvari_header.txt fn-preheader.bed > fn.bed

        bcftools query -i 'INFO/SVTYPE!="."' -f"${MAIN_QUERY}" "${RESULTS_STEM}tp-comp.vcf.gz" | bedtools sort -i - > tp-comp-preheader.bed
        cat truvari_header.txt tp-comp-preheader.bed > tp-comp.bed

        bcftools query -i 'INFO/SVTYPE!="."' -f"${MAIN_QUERY}" "${RESULTS_STEM}fp.vcf.gz" | bedtools sort -i - > fp-preheader.bed
        cat truvari_header.txt fp-preheader.bed > fp.bed

        if [ ~{replace_missing_filter_pass} = true ]; then
            # Make "." FILTER into "PASS"; relies on 6th column being FILTER from above query
            awk -v OFS="\t" '{ if ($6 == ".") $6 = "PASS"; print }' fn.bed > fn-filtered.bed
            mv fn-filtered.bed fn.bed

            awk -v OFS="\t" '{ if ($6 == ".") $6 = "PASS"; print }' fp.bed > fp-filtered.bed
            mv fp-filtered.bed fp.bed

            awk -v OFS="\t" '{ if ($6 == ".") $6 = "PASS"; print }' tp-base.bed > tp-base-filtered.bed
            mv tp-base-filtered.bed tp-base.bed

            awk -v OFS="\t" '{ if ($6 == ".") $6 = "PASS"; print }' tp-comp.bed > tp-comp-filtered.bed
            mv tp-comp-filtered.bed tp-comp.bed
        fi

        # Organize final outputs together
        mkdir final_outputs
        mv "${RESULTS_STEM}tp-base.vcf.gz" final_outputs/tp-base.vcf.gz
        mv "${RESULTS_STEM}tp-base.vcf.gz.tbi" final_outputs/tp-base.vcf.gz.tbi
        mv "${RESULTS_STEM}fn.vcf.gz" final_outputs/fn.vcf.gz
        mv "${RESULTS_STEM}fn.vcf.gz.tbi" final_outputs/fn.vcf.gz.tbi
        mv "${RESULTS_STEM}tp-comp.vcf.gz" final_outputs/tp-comp.vcf.gz
        mv "${RESULTS_STEM}tp-comp.vcf.gz.tbi" final_outputs/tp-comp.vcf.gz.tbi
        mv "${RESULTS_STEM}fp.vcf.gz" final_outputs/fp.vcf.gz
        mv "${RESULTS_STEM}fp.vcf.gz.tbi" final_outputs/fp.vcf.gz.tbi
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.1"
        disks: "local-disk " + runtimeAttributes.disk_size + " HDD"
        memory: runtimeAttributes.memory + " GB"
        cpu: runtimeAttributes.cpu
    }

    output {
        File tp_base = "final_outputs/tp-base.vcf.gz"
        File tp_base_index = "final_outputs/tp-base.vcf.gz.tbi"
        File fn = "final_outputs/fn.vcf.gz"
        File fn_index = "final_outputs/fn.vcf.gz.tbi"
        File tp_comp = "final_outputs/tp-comp.vcf.gz"
        File tp_comp_index = "final_outputs/tp-comp.vcf.gz.tbi"
        File fp = "final_outputs/fp.vcf.gz"
        File fp_index = "final_outputs/fp.vcf.gz.tbi"

        File base_table = "base.bed"
        File comp_table = "comp.bed"
        File tp_base_table = "tp-base.bed"
        File fn_table = "fn.bed"
        File tp_comp_table = "tp-comp.bed"
        File fp_table = "fp.bed"

        File summary = "TruvariSummary.tsv"
    }
}

task CollectTruvariClosestStats {
    input {
        File base_table
        File comp_table
        File fp_table
        File fn_table

        String? base_sample_name
        String? comp_sample_name

        String experiment = ""

        File ref_fai

        Int k_closest = 3    # Number of close by variants to compare to

        # Runtime parameters
        RuntimeAttributes runtimeAttributes = {"disk_size": ceil(4 * size(base_table, "GB")) + 100, "cpu": 4, "memory": 16}
    }

    command <<<
        set -xueo pipefail

        # Generate "closest" data
        echo -e "BASENAME\tCOMPNAME\tExperiment\tLCHROM\tLPOS\tLEND\tLLEN\tLTYPE\tLFILTER\tLGTMatch\tRCHROM\tRPOS\tREND\tRLEN\tRTYPE\tRFILTER\tDIST" > header.txt

        bedtools closest -a ~{fp_table} -b ~{base_table} -k ~{k_closest} -D ref > fp-closest.bed
        awk -v OFS='\t' '{ print "~{base_sample_name}-Base", "~{comp_sample_name}-Comp", "~{experiment}", $0 }' fp-closest.bed > fp-closest-samples.bed
        cat header.txt fp-closest-samples.bed > fp-closest-final.bed

        bedtools closest -a ~{fn_table} -b ~{comp_table} -k ~{k_closest} -D ref > fn-closest.bed
        awk -v OFS='\t' '{ print "~{base_sample_name}-Base", "~{comp_sample_name}-Comp", "~{experiment}", $0 }' fn-closest.bed > fn-closest-samples.bed
        cat header.txt fn-closest-samples.bed > fn-closest-final.bed

        # Correct POS0/END back to original VCF POS/END coordinates
        python3 << CODE
        import pandas as pd

        for file in ['fp-closest-final.bed', 'fn-closest-final.bed']:
            df = pd.read_csv(file, sep='\t')
            df['LPOS'] = df['LPOS'] + 1
            df['LEND'] = df['LEND'] + 1
            df['RPOS'] = df['RPOS'] + 1
            df['REND'] = df['REND'] + 1

        CODE

    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.1"
        disks: "local-disk " + runtimeAttributes.disk_size + " HDD"
        memory: runtimeAttributes.memory + " GB"
        cpu: runtimeAttributes.cpu
    }

    output {
        File fp_closest = "fp-closest-final.bed"
        File fn_closest = "fn-closest-final.bed"
    }
}

task ComputeTruvariIntervalSummaryStats {
    input {
        File tp_base_intervals
        File tp_comp_intervals
        File fp_intervals
        File fn_intervals

        String base_sample_name = ""
        String comp_sample_name = ""

        Array[String] bed_labels

        String experiment = ""

        Array[Int] svlen_bin_cutoffs = [100, 250, 1000, 2500, 10000, 25000]
        Array[Int] overlap_percents = [0, 25, 50, 75, 100]

        # Runtime Parameters
        RuntimeAttributes runtimeAttributes = {"disk_size": ceil(4 * size(tp_base_intervals, "GB")) + 100, "cpu": 4, "memory": 16}
    }

    command <<<
        set -xueo pipefail

        python3 << CODE
        import pandas as pd
        import re

        tp_base_df = pd.read_csv("~{tp_base_intervals}", sep='\t')
        tp_comp_df = pd.read_csv("~{tp_comp_intervals}", sep='\t')
        fp_df = pd.read_csv("~{fp_intervals}", sep='\t')
        fn_df = pd.read_csv("~{fn_intervals}", sep='\t')

        # Bin counts by SVLEN
        def add_suffix(n):
            if n < 1_000:
                return f'{n}'
            elif n < 1_000_000:
                return f'{n/1000:.1f}k'
            elif n < 1_000_000_000:
                return f'{n/1_000_000:.1f}M'
            else:
                return f'{n}'

        SVLEN_Bin_cutoffs = [~{default="" sep=", " svlen_bin_cutoffs}]
        SVLEN_Bin_start = [f'<{add_suffix(SVLEN_Bin_cutoffs[0])}']
        SVLEN_Bins_middle = [f'{add_suffix(SVLEN_Bin_cutoffs[i])}-{add_suffix(SVLEN_Bin_cutoffs[i+1])}' for i in range(len(SVLEN_Bin_cutoffs)-1)]
        SVLEN_Bin_end = [f'>{add_suffix(SVLEN_Bin_cutoffs[-1])}']
        SVLEN_Bins = SVLEN_Bin_start + SVLEN_Bins_middle + SVLEN_Bin_end

        def bin_svlen(num):
            for i, bin in enumerate(SVLEN_Bin_cutoffs):
                if num < bin:
                    return SVLEN_Bins[i]
            return SVLEN_Bins[-1]

        tp_base_df['SVLEN_Bin'] = tp_base_df['SVLEN'].apply(bin_svlen)
        tp_comp_df['SVLEN_Bin'] = tp_comp_df['SVLEN'].apply(bin_svlen)
        fp_df['SVLEN_Bin'] = fp_df['SVLEN'].apply(bin_svlen)
        fn_df['SVLEN_Bin'] = fn_df['SVLEN'].apply(bin_svlen)

        def label_gt(gt):
            # Split on / or | in gt value
            calls = re.split(r'[|/]', str(gt))    # Fix weird case where pandas reads 1 as int rather than str

            # Check if all calls are equal
            if all(calls[0] == c for c in calls):
                return 'HOM'
            else:
                return 'HET'

        tp_base_df['GT'] = tp_base_df['GT'].apply(label_gt)
        tp_comp_df['GT'] = tp_comp_df['GT'].apply(label_gt)
        fp_df['GT'] = fp_df['GT'].apply(label_gt)
        fn_df['GT'] = fn_df['GT'].apply(label_gt)

        intervals = ["~{default="" sep="\", \"" bed_labels}"]
        overlap_percents = [~{default="" sep=", " overlap_percents}] + ['>0']
        full_counts_df = pd.DataFrame()
        for interval in intervals:
            for pct in overlap_percents:
                merged_df = pd.DataFrame({'SVTYPE': [], 'SVLEN_Bin': [], 'FILTER': [], 'GT': []})
                full_merge_columns = ['SVTYPE', 'SVLEN_Bin', 'FILTER', 'GT', 'Interval', 'Pct_Overlap']
                for label, df in zip(['tp_base', 'tp_comp', 'fp', 'fn'], [tp_base_df, tp_comp_df, fp_df, fn_df]):
                    for overlap_mode in ['full', 'breakpoint']:
                        for gt_mode in ['match', 'no_match']:
                            if gt_mode == 'no_match' or 'tp' in label:    # Skip match GT for FP/FN
                                if overlap_mode == 'full':
                                    count_label = label
                                    if pct == '>0':
                                        subset_condition = df[f'{interval}-overlap'] > 0
                                    else:
                                        subset_condition = df[f'{interval}-overlap'] >= pct/100
                                else:
                                    count_label = f'{label}-BEND'
                                    if pct == '>0':
                                        subset_condition = (df[f'{interval}-LBEND-overlap'] > 0) | (df[f'{interval}-RBEND-overlap'] > 0)
                                    else:
                                        subset_condition = (df[f'{interval}-LBEND-overlap'] >= pct/100) | (df[f'{interval}-RBEND-overlap'] >= pct/100)

                                if gt_mode == 'match':
                                    subset_condition = subset_condition & (df['GTMATCH'] == 0)

                                sub_df = df[subset_condition].copy()
                                if len(sub_df) > 0:
                                    gt_suffix = '-gt' if gt_mode == 'match' else ''
                                    sub_counts = sub_df.groupby(['SVTYPE', 'SVLEN_Bin', 'FILTER', 'GT']).apply(len).reset_index().rename(columns={0: f'{count_label}{gt_suffix}_count'})

                                    svtype_all_counts = sub_counts.groupby(['SVLEN_Bin', 'FILTER', 'GT']).sum(numeric_only=True).reset_index()
                                    svtype_all_counts['SVTYPE'] = 'ALL'
                                    sub_counts = pd.concat([sub_counts, svtype_all_counts])

                                    svlen_all_counts = sub_counts.groupby(['SVTYPE', 'FILTER', 'GT']).sum(numeric_only=True).reset_index()
                                    svlen_all_counts['SVLEN_Bin'] = 'ALL'
                                    sub_counts = pd.concat([sub_counts, svlen_all_counts])

                                    gt_all_counts = sub_counts.groupby(['SVTYPE', 'SVLEN_Bin', 'FILTER']).sum(numeric_only=True).reset_index()
                                    gt_all_counts['GT'] = 'ALL'
                                    sub_counts = pd.concat([sub_counts, gt_all_counts])

                                    sub_counts['Pct_Overlap'] = pct
                                    sub_counts['Interval'] = interval

                                    merge_cols = [c for c in full_merge_columns if c in merged_df.columns]
                                    merged_df = merged_df.merge(sub_counts, on=merge_cols, how='outer').fillna(0)
                full_counts_df = pd.concat([full_counts_df, merged_df])

        # Generate Precision, Recall, F1 stats
        full_counts_df['Precision'] = full_counts_df['tp_comp_count'] / (full_counts_df['tp_comp_count'] + full_counts_df['fp_count'])
        full_counts_df['Precision-BEND'] = full_counts_df['tp_comp-BEND_count'] / (full_counts_df['tp_comp-BEND_count'] + full_counts_df['fp-BEND_count'])
        full_counts_df['Recall'] = full_counts_df['tp_base_count'] / (full_counts_df['tp_base_count'] + full_counts_df['fn_count'])
        full_counts_df['Recall-BEND'] = full_counts_df['tp_base-BEND_count'] / (full_counts_df['tp_base-BEND_count'] + full_counts_df['fn-BEND_count'])
        full_counts_df['F1_Score'] = 2 * full_counts_df['Precision'] * full_counts_df['Recall'] / (full_counts_df['Precision'] + full_counts_df['Recall'])
        full_counts_df['F1_Score-BEND'] = 2 * full_counts_df['Precision-BEND'] * full_counts_df['Recall-BEND'] / (full_counts_df['Precision-BEND'] + full_counts_df['Recall-BEND'])

        full_counts_df['GT_concordance'] = full_counts_df['tp_comp-gt_count'] / (full_counts_df['tp_comp_count'])

        # Add sample names
        full_counts_df['Base_Sample_Name'] = "~{base_sample_name}"
        full_counts_df['Comp_Sample_Name'] = "~{comp_sample_name}"

        full_counts_df['Experiment'] = "~{experiment}"

        column_order = ['Base_Sample_Name', 'Comp_Sample_Name', 'Experiment', 'SVTYPE', 'SVLEN_Bin', 'FILTER', 'GT', 'Interval', 'Pct_Overlap',
                        'tp_base_count', 'tp_base-BEND_count', 'tp_base-gt_count', 'tp_comp_count', 'tp_comp-BEND_count', 'tp_comp-gt_count', 'fp_count', 'fp-BEND_count',
                        'fn_count', 'fn-BEND_count', 'Precision', 'Precision-BEND', 'Recall', 'Recall-BEND', 'F1_Score', 'F1_Score-BEND', 'GT_concordance']

        full_counts_df[column_order].to_csv('FullTruvariSummary.tsv', sep='\t', index=False)

        CODE

    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/sv_docker:v1.1"
        disks: "local-disk " + runtimeAttributes.disk_size + " HDD"
        memory: runtimeAttributes.memory + " GB"
        cpu: runtimeAttributes.cpu
    }

    output {
        File full_summary = "FullTruvariSummary.tsv"
    }
}