Skip to content

Commit 16bbd12

Browse files
authored
Add comment showing change in benchmark file sizes (#7264)
1 parent bf97ba2 commit 16bbd12

File tree

4 files changed

+304
-2
lines changed

4 files changed

+304
-2
lines changed

.github/workflows/sql-benchmarks.yml

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,6 @@ jobs:
223223
${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
224224
225225
- name: Install uv
226-
if: inputs.mode == 'pr'
227226
uses: spiraldb/actions/.github/actions/setup-uv@0.18.5
228227
with:
229228
sync: false
@@ -260,6 +259,56 @@ jobs:
260259
# unique benchmark configuration must have a unique comment-tag.
261260
comment-tag: bench-pr-comment-${{ matrix.id }}
262261

262+
- name: Compare file sizes
263+
if: inputs.mode == 'pr' && matrix.remote_storage == null
264+
shell: bash
265+
run: |
266+
set -Eeu -o pipefail -x
267+
268+
# Capture HEAD file sizes (vortex formats only)
269+
uv run --no-project scripts/capture-file-sizes.py \
270+
vortex-bench/data \
271+
--benchmark ${{ matrix.subcommand }} \
272+
--commit ${{ github.event.pull_request.head.sha }} \
273+
-o head-sizes.json
274+
275+
# Get base commit SHA (same as benchmark comparison)
276+
base_commit_sha=$(\
277+
curl -L \
278+
-H "Accept: application/vnd.github+json" \
279+
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
280+
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
281+
| jq -r '.workflow_runs[].head_sha' \
282+
)
283+
284+
# Download file sizes baseline (per-benchmark file)
285+
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes-${{ matrix.id }}.json.gz file-sizes.json.gz --no-sign-request || true
286+
287+
# Generate comparison report
288+
echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md
289+
echo '' >> sizes-comment.md
290+
291+
if [ -f file-sizes.json.gz ]; then
292+
gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true
293+
if [ -s base-sizes.json ]; then
294+
uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
295+
>> sizes-comment.md
296+
else
297+
echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
298+
fi
299+
else
300+
echo '_No baseline file sizes available yet._' >> sizes-comment.md
301+
fi
302+
303+
cat sizes-comment.md >> $GITHUB_STEP_SUMMARY
304+
305+
- name: Comment PR with file sizes
306+
if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false
307+
uses: thollander/actions-comment-pull-request@v3
308+
with:
309+
file-path: sizes-comment.md
310+
comment-tag: file-sizes-${{ matrix.id }}
311+
263312
- name: Comment PR on failure
264313
if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
265314
uses: thollander/actions-comment-pull-request@v3
@@ -276,6 +325,17 @@ jobs:
276325
run: |
277326
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
278327
328+
- name: Upload File Sizes
329+
if: inputs.mode == 'develop' && matrix.remote_storage == null
330+
shell: bash
331+
run: |
332+
uv run --no-project scripts/capture-file-sizes.py \
333+
vortex-bench/data \
334+
--benchmark ${{ matrix.subcommand }} \
335+
--commit ${{ github.sha }} \
336+
-o sizes.json
337+
bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes-${{ matrix.id }}.json.gz sizes.json
338+
279339
- name: Alert incident.io
280340
if: failure() && inputs.mode == 'develop'
281341
uses: ./.github/actions/alert-incident-io

scripts/capture-file-sizes.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#!/usr/bin/env python3
2+
# /// script
3+
# requires-python = ">=3.11"
4+
# dependencies = []
5+
# ///
6+
7+
# SPDX-License-Identifier: Apache-2.0
8+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
9+
10+
"""Capture file sizes from benchmark data directories and output as JSONL."""
11+
12+
import argparse
13+
import json
14+
import sys
15+
from pathlib import Path
16+
17+
18+
def main():
    """CLI entry point: scan benchmark data dirs and emit one JSONL size record per file."""
    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
    parser.add_argument("--commit", required=True, help="Commit SHA")
    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
    args = parser.parse_args()

    root = Path(args.data_dir)
    if not root.exists():
        print(f"Data directory not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Match the benchmark dir exactly (e.g. "tpch") or any flavored variant
    # (e.g. "clickbench_partitioned" for --benchmark clickbench).
    flavored_prefix = f"{args.benchmark}_"
    benchmark_dirs = [
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry.name == args.benchmark or entry.name.startswith(flavored_prefix))
    ]

    if not benchmark_dirs:
        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
        sys.exit(1)

    # Only vortex formats are captured (not parquet/duckdb).
    # Note: the "vortex" CLI arg maps to the "vortex-file-compressed" directory name.
    formats_to_capture = {"vortex-file-compressed", "vortex-compact"}

    records = []

    for benchmark_dir in benchmark_dirs:
        # Format dirs may sit directly under the benchmark dir
        # (clickbench_partitioned/vortex-file-compressed/) or below a scale
        # factor subdir (tpch/1.0/vortex-file-compressed/).
        for format_dir in benchmark_dir.rglob("*"):
            if not (format_dir.is_dir() and format_dir.name in formats_to_capture):
                continue

            # Everything between the benchmark dir and the format dir is the
            # scale factor; "." means there is none, which defaults to "1.0".
            between = str(format_dir.relative_to(benchmark_dir).parent)
            scale_factor = "1.0" if between == "." else between

            for file_path in format_dir.rglob("*"):
                if not file_path.is_file():
                    continue

                records.append(
                    {
                        "commit_id": args.commit,
                        # NOTE(review): all flavored dirs are recorded under the
                        # bare --benchmark name, so two flavors sharing a relative
                        # file path + format would collide in downstream keyed
                        # comparisons — confirm each CI matrix entry maps to a
                        # single flavor directory.
                        "benchmark": args.benchmark,
                        "scale_factor": scale_factor,
                        "format": format_dir.name,
                        "file": str(file_path.relative_to(format_dir)),
                        "size_bytes": file_path.stat().st_size,
                    }
                )

    # Deterministic ordering so repeated runs produce byte-identical output.
    records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))

    # One JSON object per line (JSONL).
    with open(args.output, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in records)

    print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()

scripts/compare-file-sizes.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env python3
2+
# /// script
3+
# requires-python = ">=3.11"
4+
# dependencies = []
5+
# ///
6+
7+
# SPDX-License-Identifier: Apache-2.0
8+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
9+
10+
"""Compare file sizes between base and HEAD and generate markdown report."""
11+
12+
import argparse
13+
import json
14+
import sys
15+
from collections import defaultdict
16+
17+
18+
def format_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable size (1024-based B/KB/MB/GB)."""
    if size_bytes >= 1024**3:
        return f"{size_bytes / (1024**3):.2f} GB"
    elif size_bytes >= 1024**2:
        return f"{size_bytes / (1024**2):.2f} MB"
    elif size_bytes >= 1024:
        return f"{size_bytes / 1024:.2f} KB"
    else:
        return f"{size_bytes} B"


def format_change(change_bytes: int) -> str:
    """Format a byte delta with an explicit sign.

    Bug fix: the original prefixed "+" only for increases and then formatted
    the absolute value, so decreases were rendered with no "-" at all and
    looked identical to increases of the same magnitude in the report table.
    """
    if change_bytes > 0:
        sign = "+"
    elif change_bytes < 0:
        sign = "-"
    else:
        sign = ""
    return f"{sign}{format_size(abs(change_bytes))}"


def format_pct_change(pct: float) -> str:
    """Format a percentage change with sign; negative values already carry "-" from %-formatting."""
    sign = "+" if pct > 0 else ""
    return f"{sign}{pct:.1f}%"
40+
41+
42+
def main():
    """CLI entry point: diff two JSONL size dumps and print a markdown report to stdout."""
    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
    parser.add_argument("base_file", help="Base JSONL file")
    parser.add_argument("head_file", help="HEAD JSONL file")
    args = parser.parse_args()

    def load_sizes(path, missing_msg):
        """Read a JSONL dump into {(benchmark, scale, format, file): size_bytes}; exit 0 if absent."""
        sizes = {}
        try:
            with open(path) as f:
                for line in f:
                    rec = json.loads(line)
                    # Old records predate scale_factor; default matches the capture side.
                    sf = rec.get("scale_factor", "1.0")
                    sizes[(rec["benchmark"], sf, rec["format"], rec["file"])] = rec["size_bytes"]
        except FileNotFoundError:
            print(missing_msg)
            sys.exit(0)
        return sizes

    base_sizes = load_sizes(args.base_file, "_Base file sizes not found._")
    head_sizes = load_sizes(args.head_file, "_HEAD file sizes not found._")

    rows = []
    format_totals = defaultdict(lambda: {"base": 0, "head": 0})

    # Union of keys so both new and removed files are accounted for.
    for key in set(base_sizes) | set(head_sizes):
        benchmark, scale_factor, fmt, file_name = key
        old_size = base_sizes.get(key, 0)
        new_size = head_sizes.get(key, 0)

        # Per-format totals include unchanged files; the per-file table does not.
        format_totals[fmt]["base"] += old_size
        format_totals[fmt]["head"] += new_size

        delta = new_size - old_size
        if delta == 0:
            continue

        if old_size > 0:
            pct = (new_size / old_size - 1) * 100
        else:
            # Absent at base: brand-new file (both-zero was filtered by delta == 0).
            pct = float("inf") if new_size > 0 else 0

        rows.append(
            {
                "file": file_name,
                "scale_factor": scale_factor,
                "format": fmt,
                "base_size": old_size,
                "head_size": new_size,
                "change": delta,
                "pct_change": pct,
            }
        )

    if not rows:
        print("_No file size changes detected._")
        return

    # Largest relative growth first so regressions lead the table.
    rows.sort(key=lambda r: r["pct_change"], reverse=True)

    print("| File | Scale | Format | Base | HEAD | Change | % |")
    print("|------|-------|--------|------|------|--------|---|")
    for row in rows:
        pct_cell = "new" if row["pct_change"] == float("inf") else format_pct_change(row["pct_change"])
        base_cell = format_size(row["base_size"]) if row["base_size"] > 0 else "-"
        print(
            f"| {row['file']} | {row['scale_factor']} | {row['format']} | {base_cell} | "
            f"{format_size(row['head_size'])} | {format_change(row['change'])} | {pct_cell} |"
        )

    print("")
    print("**Totals:**")
    for fmt in sorted(format_totals):
        base_total = format_totals[fmt]["base"]
        head_total = format_totals[fmt]["head"]
        pct_suffix = ""
        if base_total > 0:
            pct_suffix = f" ({format_pct_change((head_total / base_total - 1) * 100)})"
        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_suffix}")


if __name__ == "__main__":
    main()

vortex-bench/src/statpopgen/statpopgen_benchmark.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ impl StatPopGenBenchmark {
5252
)
5353
})?;
5454

55-
let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
55+
let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/"));
5656

5757
let data_url =
5858
Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;

0 commit comments

Comments
 (0)