A new benchmarking tool (#168)
sampsyo authored Apr 27, 2024
2 parents 5456b60 + b28985b commit a7f542c
Showing 8 changed files with 374 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
@@ -14,7 +14,6 @@ __pycache__/
**/*.og
**/*.out
**/*.flatgfa
*.json
og_to_gfa.py
compute_maxes.py

2 changes: 2 additions & 0 deletions bench/.gitignore
@@ -0,0 +1,2 @@
graphs/
results/
2 changes: 2 additions & 0 deletions bench/Makefile
@@ -0,0 +1,2 @@
%.svg: %.csv bar.vl.json
jq '.data.url |= "$<"' bar.vl.json | npx -p vega -p vega-lite vl2svg > $@
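This rule renders any results CSV into an SVG bar chart: the jq filter .data.url |= "$<" points the chart spec at the prerequisite CSV, and vl2svg (run via npx with the vega and vega-lite packages) renders the patched spec. A rough Python equivalent of the jq step, using a hypothetical results file name:

import json

# Point the Vega-Lite spec at one results CSV (the path here is a hypothetical
# example) and emit the patched spec on stdout, mirroring the jq filter above.
with open("bar.vl.json") as f:
    spec = json.load(f)
spec["data"]["url"] = "results/paths-smoke.csv"
print(json.dumps(spec))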
31 changes: 31 additions & 0 deletions bench/bar.vl.json
@@ -0,0 +1,31 @@
{
"data": {
"url": "FILE.csv",
"format": {
"type": "csv",
"parse": {"mean": "number", "stddev": "number"}
}
},
"layer": [
{
"mark": "bar",
"encoding": {
"x": { "field": "graph", "type": "nominal", "title": null },
"xOffset": { "field": "cmd" },
"y": { "field": "mean", "type": "quantitative",
"title": "running time (seconds)" },
"color": { "field": "cmd", "title": null }
}
},
{
"mark": {"type": "errorbar", "ticks": {"color": "black"}},
"encoding": {
"x": { "field": "graph", "type": "nominal" },
"xOffset": { "field": "cmd" },
"y": { "field": "mean", "type": "quantitative",
"title": "running time (seconds)" },
"yError": { "field": "stddev" }
}
}
]
}
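The spec expects a CSV with graph, cmd, mean, and stddev columns, which matches what bench.py below writes (the extra n column is ignored by the chart). An illustrative input, with made-up timings:

graph,cmd,mean,stddev,n
test.k,odgi,0.012,0.001,10
test.k,flatgfa,0.008,0.001,10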
234 changes: 234 additions & 0 deletions bench/bench.py
@@ -0,0 +1,234 @@
try:
import tomllib
except ImportError:
import tomli as tomllib
import os
import subprocess
from subprocess import PIPE
from shlex import quote
import json
import tempfile
from dataclasses import dataclass
import csv
import argparse
import datetime
import logging
from contextlib import contextmanager
import time

BASE = os.path.dirname(__file__)
GRAPHS_TOML = os.path.join(BASE, "graphs.toml")
CONFIG_TOML = os.path.join(BASE, "config.toml")
GRAPHS_DIR = os.path.join(BASE, "graphs")
RESULTS_DIR = os.path.join(BASE, "results")
ALL_TOOLS = ["slow_odgi", "odgi", "flatgfa"]
DECOMPRESS = {
".gz": ["gunzip"],
".zst": ["zstd", "-d"],
}


def check_wait(popen):
err = popen.wait()
if err:
raise subprocess.CalledProcessError(err, popen.args)


@contextmanager
def logtime(log):
start = time.time()
yield
dur = time.time() - start
log.info("done in %.1f seconds", dur)


@dataclass(frozen=True)
class HyperfineResult:
command: str
mean: float
stddev: float
median: float
min: float
max: float
count: float

@classmethod
def from_json(cls, obj):
return cls(
command=obj["command"],
mean=obj["mean"],
stddev=obj["stddev"],
median=obj["median"],
min=obj["min"],
max=obj["max"],
count=len(obj["times"]),
)


def hyperfine(cmds):
"""Run Hyperfine to compare the commands."""
with tempfile.NamedTemporaryFile(delete=False) as tmp:
tmp.close()
subprocess.run(
["hyperfine", "-N", "-w", "1", "--export-json", tmp.name] + cmds,
check=True,
)
        with open(tmp.name, "rb") as f:
            data = json.load(f)
        # Clean up the temporary JSON file before returning the parsed results.
        os.unlink(tmp.name)
        return [HyperfineResult.from_json(r) for r in data["results"]]


def graph_path(name, ext):
return os.path.join(GRAPHS_DIR, f"{name}.{ext}")


def fetch_file(dest, url):
os.makedirs(GRAPHS_DIR, exist_ok=True)

_, ext = os.path.splitext(url)
if ext in DECOMPRESS:
# Decompress the file while downloading.
with open(dest, "wb") as f:
curl = subprocess.Popen(["curl", "-L", url], stdout=PIPE)
decomp = subprocess.Popen(DECOMPRESS[ext], stdin=curl.stdout, stdout=f)
curl.stdout.close()
check_wait(decomp)
else:
# Just fetch the raw file.
subprocess.run(["curl", "-L", "-o", dest, url], check=True)


class Runner:
def __init__(self, graphs, config):
self.graphs = graphs
self.config = config

# Some shorthands for tool paths.
self.odgi = config["tools"]["odgi"]
self.fgfa = config["tools"]["fgfa"]
self.slow_odgi = config["tools"]["slow_odgi"]

self.log = logging.getLogger("pollen-bench")
self.log.addHandler(logging.StreamHandler())
self.log.setLevel(logging.DEBUG)

@classmethod
def default(cls):
with open(GRAPHS_TOML, "rb") as f:
graphs = tomllib.load(f)
with open(CONFIG_TOML, "rb") as f:
config = tomllib.load(f)
return cls(graphs, config)

def fetch_graph(self, name):
"""Fetch a single graph, given by its <suite>.<graph> name."""
suite, key = name.split(".")
url = self.graphs[suite][key]
dest = graph_path(name, "gfa")

# If the file exists, don't re-download.
if os.path.exists(dest):
self.log.info("gfa already fetched for %s", name)
return

self.log.info("fetching graph %s", name)
fetch_file(dest, url)

def odgi_convert(self, name):
"""Convert a GFA to odgi's `.og` format."""
og = graph_path(name, "og")
if os.path.exists(og):
self.log.info("og exists for %s", name)
return

gfa = graph_path(name, "gfa")
self.log.info("converting %s to og", name)
with logtime(self.log):
            subprocess.run([self.odgi, "build", "-g", gfa, "-o", og], check=True)

def flatgfa_convert(self, name):
"""Convert a GFA to the FlatGFA format."""
flatgfa = graph_path(name, "flatgfa")
if os.path.exists(flatgfa):
self.log.info("flatgfa exists for %s", name)
return

gfa = graph_path(name, "gfa")
self.log.info("converting %s to flatgfa", name)
with logtime(self.log):
            subprocess.run([self.fgfa, "-I", gfa, "-o", flatgfa], check=True)

def compare_paths(self, name, tools):
"""Compare odgi and FlatGFA implementations of path-name extraction."""
commands = {
"odgi": f'{self.odgi} paths -i {quote(graph_path(name, "og"))} -L',
"flatgfa": f'{self.fgfa} -i {quote(graph_path(name, "flatgfa"))} paths',
"slow_odgi": f'{self.slow_odgi} paths {quote(graph_path(name, "gfa"))}',
}
commands = {k: commands[k] for k in tools}

self.log.info("comparing paths for %s", " ".join(tools))
with logtime(self.log):
results = hyperfine(list(commands.values()))
for cmd, res in zip(commands.keys(), results):
yield {
"cmd": cmd,
"mean": res.mean,
"stddev": res.stddev,
"graph": name,
"n": res.count,
}


def run_bench(graph_set, mode, tools, out_csv):
runner = Runner.default()

assert mode == "paths"
graph_names = runner.config["graph_sets"][graph_set]

# Fetch all the graphs and convert them to both odgi and FlatGFA.
for graph in graph_names:
runner.fetch_graph(graph)
runner.odgi_convert(graph)
runner.flatgfa_convert(graph)

runner.log.debug("writing results to %s", out_csv)
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
with open(out_csv, "w") as f:
writer = csv.DictWriter(f, ["graph", "cmd", "mean", "stddev", "n"])
writer.writeheader()
for graph in graph_names:
for row in runner.compare_paths(graph, tools):
writer.writerow(row)


def gen_csv_name(graph_set, mode):
ts = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f")
return os.path.join(RESULTS_DIR, f"{mode}-{graph_set}-{ts}.csv")


def bench_main():
parser = argparse.ArgumentParser(description="benchmarks for GFA stuff")
parser.add_argument(
"--graph-set", "-g", help="name of input graph set", required=True
)
parser.add_argument("--mode", "-m", help="thing to benchmark", required=True)
parser.add_argument("--tool", "-t", help="test this tool", action="append")
parser.add_argument("--output", "-o", help="output CSV")

args = parser.parse_args()
tools = args.tool or ALL_TOOLS
for tool in tools:
assert tool in ALL_TOOLS, "unknown tool name"

run_bench(
graph_set=args.graph_set,
mode=args.mode,
tools=tools,
out_csv=args.output or gen_csv_name(args.graph_set, args.mode),
)


if __name__ == "__main__":
bench_main()
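Assuming hyperfine is on the PATH and the tool paths in config.toml below resolve, the script is driven through the CLI defined in bench_main (for example: python bench/bench.py -g smoke -m paths -t odgi -t flatgfa). Equivalently, a minimal sketch of calling it programmatically:

# Sketch only: assumes bench.py is importable as `bench` (e.g., run from bench/).
from bench import gen_csv_name, run_bench

run_bench(
    graph_set="smoke",          # graph set defined in config.toml
    mode="paths",               # the only mode run_bench currently accepts
    tools=["odgi", "flatgfa"],  # any subset of ALL_TOOLS
    out_csv=gen_csv_name("smoke", "paths"),
)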
9 changes: 9 additions & 0 deletions bench/config.toml
@@ -0,0 +1,9 @@
[tools]
odgi = "odgi"
fgfa = "../flatgfa/target/release/fgfa"
slow_odgi = "../.venv/bin/slow_odgi"

[graph_sets]
smoke = ["test.k"]
mini = ["test.lpa", "test.chr6c4", "hprc.chrM"]
med = ["hprc.chr20", "hprc.chrX", "1000gont.chr16"]
62 changes: 62 additions & 0 deletions bench/graphs.toml
@@ -0,0 +1,62 @@
# From: https://github.com/AndreaGuarracino/1000G-ONT-F100-PGGB/blob/master/data/1000G-ONT-F100-PGGB.gfa.urls.tsv
[1000gont]
chr1 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr1.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr2 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr2.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr3 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr3.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr4 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr4.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr5 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr5.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr6 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr6.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr7 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr7.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr8 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr8.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr9 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr9.30kbp.fa.gz.445f03b.e34d4cd.b691e61.smooth.final.gfa.zst"
chr10 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr10.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr11 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr11.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr12 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr12.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr13 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr13.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr14 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr14.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr15 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr15.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr16 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr16.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr17 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr17.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr18 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr18.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr19 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr19.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr20 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr20.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr21 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr21.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chr22 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr22.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
chrX = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrX.30kbp.fa.gz.a8a102b.eb0f3d3.a58faa8.smooth.final.gfa.zst"
chrY = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrY.30kbp.fa.gz.a8a102b.eb0f3d3.0713820.smooth.final.gfa.zst"

# From: https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=pangenomes/freeze/freeze1/pggb/chroms/
[hprc]
chrY = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrY.hprc-v1.0-pggb.gfa.gz"
chr1 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr1.hprc-v1.0-pggb.gfa.gz"
chr10 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr10.hprc-v1.0-pggb.gfa.gz"
chr11 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr11.hprc-v1.0-pggb.gfa.gz"
chr12 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr12.hprc-v1.0-pggb.gfa.gz"
chr13 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr13.hprc-v1.0-pggb.gfa.gz"
chr14 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr14.hprc-v1.0-pggb.gfa.gz"
chr15 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr15.hprc-v1.0-pggb.gfa.gz"
chr16 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr16.hprc-v1.0-pggb.gfa.gz"
chr17 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr17.hprc-v1.0-pggb.gfa.gz"
chr18 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr18.hprc-v1.0-pggb.gfa.gz"
chr19 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr19.hprc-v1.0-pggb.gfa.gz"
chr2 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr2.hprc-v1.0-pggb.gfa.gz"
chr20 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr20.hprc-v1.0-pggb.gfa.gz"
chr21 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr21.hprc-v1.0-pggb.gfa.gz"
chr22 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr22.hprc-v1.0-pggb.gfa.gz"
chr3 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr3.hprc-v1.0-pggb.gfa.gz"
chr4 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr4.hprc-v1.0-pggb.gfa.gz"
chr5 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr5.hprc-v1.0-pggb.gfa.gz"
chr6 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr6.hprc-v1.0-pggb.gfa.gz"
chr7 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr7.hprc-v1.0-pggb.gfa.gz"
chr8 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr8.hprc-v1.0-pggb.gfa.gz"
chr9 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr9.hprc-v1.0-pggb.gfa.gz"
chrM = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrM.hprc-v1.0-pggb.gfa.gz"
chrX = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz"

# Small tests from odgi:
# https://github.com/pangenome/odgi/tree/master/test
[test]
k = "https://raw.githubusercontent.com/pangenome/odgi/master/test/k.gfa"
lpa = "https://raw.githubusercontent.com/pangenome/odgi/master/test/LPA.gfa"
chr6c4 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/chr6.C4.gfa"
drb1 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/DRB1-3123.gfa"
34 changes: 34 additions & 0 deletions bench/sizes.py
@@ -0,0 +1,34 @@
import tomllib
import requests
import os

GRAPHS_TOML = os.path.join(os.path.dirname(__file__), "graphs.toml")
SIZE_NAMES = {
0: "",
3: "k",
6: "M",
9: "G",
12: "T",
}


def fmt_size(count):
for scale, name in reversed(SIZE_NAMES.items()):
unit = 10**scale
if count > unit:
return "{:.0f}{}B".format(count / unit, name)


def show_sizes():
with open(GRAPHS_TOML, "rb") as f:
graphs_data = tomllib.load(f)

for category, graphs in graphs_data.items():
for name, url in graphs.items():
res = requests.head(url)
length = int(res.headers["Content-Length"])
print(category, name, fmt_size(length))


if __name__ == "__main__":
show_sizes()
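A couple of spot checks of fmt_size as written, with arbitrary byte counts:

# Sketch only: assumes sizes.py is importable as `sizes`.
from sizes import fmt_size

assert fmt_size(42_000) == "42kB"       # 42,000 / 10**3 with the "k" suffix
assert fmt_size(3_200_000) == "3MB"     # "{:.0f}" rounds 3.2 to 3
assert fmt_size(5_000_000_000) == "5GB"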
