Files
lush_grading/lush_bench/export.py
Cormac Shannon 20e62f60f6 Reorganize task categories from opaque a/b to descriptive names
Replace category_a/category_b directories with algorithm, pipeline,
environment, filesystem, and process. Add separate mode field (solve/convert)
to decouple orchestration from capability grouping. Add per-category
summary and questionnaire breakdowns to both terminal report and HTML export.
2026-03-29 20:59:01 +01:00

465 lines
18 KiB
Python

from __future__ import annotations
import base64
import html
import io
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from .models import BenchmarkResult
from .report import (
LIKERT_QUESTIONS,
_get_freeform,
_get_likert_scores,
_parse_likert,
load_latest_results,
)
# Shared palette so every chart uses the same colour for the same language.
BASH_COLOR = "#4E79A7"  # blue: bash series in charts and the --bash CSS variable
LUSH_COLOR = "#E15759"  # red: lush series in charts and the --lush CSS variable
NEUTRAL_COLOR = "#999999"  # grey; not referenced in the visible part of this module
def _fig_to_base64(fig: plt.Figure) -> str:
    """Render *fig* as a PNG and return the image as a base64 string.

    The figure is saved at 150 dpi with a tight bounding box and a white
    background, which makes the result suitable for a ``data:image/png``
    URI.

    The figure is closed in all cases, including when rendering raises, so
    a failing chart does not stay registered with pyplot.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=150, bbox_inches="tight", facecolor="white")
    finally:
        # Close even on failure; pyplot keeps figures alive until closed.
        plt.close(fig)
    # base64 output is pure ASCII, so the decode can never fail.
    return base64.b64encode(buf.getvalue()).decode("ascii")
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
    """Average every Likert question over *results*, separately per language.

    Returns ``{question_key: {"bash": mean, "lush": mean}}``. ``None``
    scores are ignored, and a question/language pair with no scores at all
    averages to ``0.0``.
    """
    collected: dict[str, dict[str, list[float]]] = {
        key: {"bash": [], "lush": []} for key, _label in LIKERT_QUESTIONS
    }

    for result in results:
        scores = _get_likert_scores(result)
        for key in scores:
            per_lang = scores[key]
            for lang in ("bash", "lush"):
                value = per_lang[lang]
                if value is None:
                    continue
                collected[key][lang].append(value)

    averages: dict[str, dict[str, float]] = {}
    for key, per_lang_values in collected.items():
        averages[key] = {}
        for lang, values in per_lang_values.items():
            averages[key][lang] = (sum(values) / len(values)) if values else 0.0
    return averages
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
    """Draw overall bash-vs-lush questionnaire averages; return a base64 PNG.

    One pair of horizontal bars is drawn per entry in ``LIKERT_QUESTIONS``,
    and every bar is annotated with its value to one decimal place.
    """
    averages = _aggregate_likert(results)
    labels: list[str] = []
    bash_vals: list[float] = []
    lush_vals: list[float] = []
    for key, label in LIKERT_QUESTIONS:
        labels.append(label)
        bash_vals.append(averages[key]["bash"])
        lush_vals.append(averages[key]["lush"])

    fig, ax = plt.subplots(figsize=(8, 4.5))
    rows = list(range(len(labels)))
    bar_h = 0.35
    bars_bash = ax.barh(
        [row + bar_h / 2 for row in rows], bash_vals, bar_h, label="bash", color=BASH_COLOR
    )
    bars_lush = ax.barh(
        [row - bar_h / 2 for row in rows], lush_vals, bar_h, label="lush", color=LUSH_COLOR
    )

    ax.set_yticks(rows)
    ax.set_yticklabels(labels)
    ax.set_xlim(0, 5.5)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.set_xlabel("Score (1-5)")
    ax.set_title("Questionnaire Scores: Bash vs Lush")
    ax.legend(loc="lower right")
    # Inverted so the first question is drawn at the top of the chart.
    ax.invert_yaxis()

    # Value labels just past the end of each bar: bash bars first, then lush.
    for bar in [*bars_bash, *bars_lush]:
        width = bar.get_width()
        ax.text(
            width + 0.08,
            bar.get_y() + bar.get_height() / 2,
            f"{width:.1f}",
            va="center",
            fontsize=8,
        )

    ax.grid(axis="x", alpha=0.3)
    return _fig_to_base64(fig)
def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
    """Draw agent turns per solve-mode task for bash vs lush; return a base64 PNG.

    Only results whose ``mode`` equals ``"solve"`` are plotted. A missing
    bash or lush result is drawn as a bar of height 0.
    """
    solve_runs = [run for run in results if run.mode == "solve"]

    names = []
    bash_turns = []
    lush_turns = []
    for run in solve_runs:
        names.append(run.task_name)
        bash_turns.append(run.bash_result.agent_turns if run.bash_result else 0)
        lush_turns.append(run.lush_result.agent_turns if run.lush_result else 0)

    fig, ax = plt.subplots(figsize=(8, 4))
    slots = list(range(len(names)))
    bar_w = 0.35
    ax.bar([s - bar_w / 2 for s in slots], bash_turns, bar_w, label="bash", color=BASH_COLOR)
    ax.bar([s + bar_w / 2 for s in slots], lush_turns, bar_w, label="lush", color=LUSH_COLOR)

    ax.set_xticks(slots)
    ax.set_xticklabels(names, rotation=35, ha="right", fontsize=8)
    ax.set_ylabel("Agent Turns")
    ax.set_title("Agent Turns to Solve (Solve Mode)")
    # Turn counts are whole numbers, so keep the y ticks on integers.
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.legend()
    ax.grid(axis="y", alpha=0.3)
    return _fig_to_base64(fig)
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
    """Draw a task-by-metric heatmap of ``lush - bash`` scores; return a base64 PNG.

    Rows are tasks and columns are the ``LIKERT_QUESTIONS`` metrics. A cell
    where either score is missing is plotted as ``0.0``. The colour scale
    is fixed to the range -3..3.
    """
    metric_labels = [label for _, label in LIKERT_QUESTIONS]
    task_names = [result.task_name for result in results]

    diffs: list[list[float]] = []
    for result in results:
        scores = _get_likert_scores(result)
        row: list[float] = []
        for key, _ in LIKERT_QUESTIONS:
            bash_score = scores[key]["bash"]
            lush_score = scores[key]["lush"]
            if bash_score is None or lush_score is None:
                row.append(0.0)
            else:
                row.append(lush_score - bash_score)
        diffs.append(row)

    # Grow the figure with the number of tasks, but never below 4 inches.
    fig_height = max(4, len(task_names) * 0.45 + 1)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    im = ax.imshow(diffs, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)

    ax.set_xticks(range(len(metric_labels)))
    ax.set_xticklabels(metric_labels, rotation=35, ha="right", fontsize=8)
    ax.set_yticks(range(len(task_names)))
    ax.set_yticklabels(task_names, fontsize=8)

    for i, row in enumerate(diffs):
        for j, val in enumerate(row):
            if val > 0:
                text = f"+{val:.0f}"
            elif val < 0:
                text = f"{val:.0f}"
            else:
                text = "0"
            # White text on the strongly coloured cells, black elsewhere.
            text_color = "white" if abs(val) >= 2 else "black"
            ax.text(j, i, text, ha="center", va="center", fontsize=8, color=text_color)

    ax.set_title("Score Difference (Lush - Bash)")
    fig.colorbar(im, ax=ax, shrink=0.8, label="Lush advantage")
    return _fig_to_base64(fig)
def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
    """Draw average questionnaire score per category; return a base64 PNG.

    For each category (sorted by name), all non-``None`` Likert scores of
    all its tasks are pooled per language and averaged. A category with no
    scores for a language gets ``0.0``.
    """
    grouped: dict[str, list[BenchmarkResult]] = {}
    for result in results:
        grouped.setdefault(result.category, []).append(result)
    categories = sorted(grouped)

    bash_avgs = []
    lush_avgs = []
    for cat in categories:
        bash_pool: list[float] = []
        lush_pool: list[float] = []
        for result in grouped[cat]:
            scores = _get_likert_scores(result)
            for key in scores:
                entry = scores[key]
                if entry["bash"] is not None:
                    bash_pool.append(entry["bash"])
                if entry["lush"] is not None:
                    lush_pool.append(entry["lush"])
        bash_avgs.append(sum(bash_pool) / len(bash_pool) if bash_pool else 0.0)
        lush_avgs.append(sum(lush_pool) / len(lush_pool) if lush_pool else 0.0)

    fig, ax = plt.subplots(figsize=(8, 4))
    slots = list(range(len(categories)))
    bar_w = 0.35
    bars_b = ax.bar([s - bar_w / 2 for s in slots], bash_avgs, bar_w, label="bash", color=BASH_COLOR)
    bars_l = ax.bar([s + bar_w / 2 for s in slots], lush_avgs, bar_w, label="lush", color=LUSH_COLOR)

    ax.set_xticks(slots)
    ax.set_xticklabels(categories, fontsize=9)
    ax.set_ylim(0, 5.5)
    ax.set_ylabel("Avg Score (1-5)")
    ax.set_title("Questionnaire Scores by Category")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    # Value label centred above every bar: bash bars first, then lush.
    for bar in [*bars_b, *bars_l]:
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.08,
            f"{bar.get_height():.1f}",
            ha="center",
            va="bottom",
            fontsize=8,
        )
    return _fig_to_base64(fig)
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
    """Build one grouped horizontal bar chart per task category.

    Despite the name, no radar plot is drawn. Each chart shows the average
    bash and lush score for every entry in ``LIKERT_QUESTIONS``, restricted
    to the tasks of one category.

    Returns:
        ``(category, base64_png)`` tuples, ordered by category name.
    """
    from collections import defaultdict

    by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)

    labels = [label for _, label in LIKERT_QUESTIONS]
    charts: list[tuple[str, str]] = []
    for cat in sorted(by_cat):
        # Reuse the shared aggregation rather than repeating it per category.
        avgs = _aggregate_likert(by_cat[cat])
        bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
        lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]

        fig, ax = plt.subplots(figsize=(6, 3.5))
        rows = list(range(len(labels)))
        bar_h = 0.35
        ax.barh([i + bar_h / 2 for i in rows], bash_vals, bar_h, label="bash", color=BASH_COLOR)
        ax.barh([i - bar_h / 2 for i in rows], lush_vals, bar_h, label="lush", color=LUSH_COLOR)
        ax.set_yticks(rows)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlim(0, 5.5)
        ax.set_title(f"{cat}", fontsize=10)
        ax.legend(fontsize=8, loc="lower right")
        # Inverted so the first question is drawn at the top of the chart.
        ax.invert_yaxis()
        ax.grid(axis="x", alpha=0.3)
        charts.append((cat, _fig_to_base64(fig)))
    return charts
def _build_per_category_summary_html(results: list[BenchmarkResult]) -> str:
    """Return an HTML table with one row of bash/lush statistics per category.

    Each row (categories sorted by name) shows, for each language:

    * pass count over the number of tasks that have a result,
    * mean agent turns, counting only results with more than zero turns,
    * mean of all non-``None`` Likert scores.

    A mean with no contributing values is reported as ``0.0``.
    """

    def mean_or_zero(values: list[float]) -> float:
        return sum(values) / len(values) if values else 0.0

    grouped: dict[str, list[BenchmarkResult]] = {}
    for result in results:
        grouped.setdefault(result.category, []).append(result)

    body_rows = []
    for cat in sorted(grouped):
        members = grouped[cat]
        bash_runs = [m.bash_result for m in members if m.bash_result]
        lush_runs = [m.lush_result for m in members if m.lush_result]

        b_total = len(bash_runs)
        l_total = len(lush_runs)
        b_passed = sum(1 for run in bash_runs if run.all_passed)
        l_passed = sum(1 for run in lush_runs if run.all_passed)

        b_turns_avg = mean_or_zero([run.agent_turns for run in bash_runs if run.agent_turns > 0])
        l_turns_avg = mean_or_zero([run.agent_turns for run in lush_runs if run.agent_turns > 0])

        bash_pool: list[float] = []
        lush_pool: list[float] = []
        for member in members:
            scores = _get_likert_scores(member)
            for key in scores:
                entry = scores[key]
                if entry["bash"] is not None:
                    bash_pool.append(entry["bash"])
                if entry["lush"] is not None:
                    lush_pool.append(entry["lush"])
        b_avg = mean_or_zero(bash_pool)
        l_avg = mean_or_zero(lush_pool)

        body_rows.append(f"""<tr>
<td>{html.escape(cat)}</td>
<td>{b_passed}/{b_total}</td><td>{l_passed}/{l_total}</td>
<td>{b_turns_avg:.1f}</td><td>{l_turns_avg:.1f}</td>
<td>{b_avg:.1f}</td><td>{l_avg:.1f}</td>
</tr>""")

    table_body = "".join(body_rows)
    return f"""<table>
<thead><tr>
<th>Category</th>
<th>Bash Pass</th><th>Lush Pass</th>
<th>Bash Avg Turns</th><th>Lush Avg Turns</th>
<th>Bash Avg Score</th><th>Lush Avg Score</th>
</tr></thead>
<tbody>{table_body}</tbody>
</table>"""
def _build_summary_html(results: list[BenchmarkResult]) -> str:
    """Return the per-task summary table as an HTML string.

    Each row shows task name, category, and PASS/FAIL plus agent turns for
    bash and for lush. A missing result counts as FAIL and shows ``-`` for
    turns. The footer gives each language's pass count over the total
    number of tasks.
    """

    def describe(run) -> tuple[str, str, str]:
        """Return (css class, PASS/FAIL label, turns text) for one run."""
        passed = run and run.all_passed
        css = "pass" if passed else "fail"
        label = "PASS" if passed else "FAIL"
        turns = str(run.agent_turns) if run else "-"
        return css, label, turns

    body_rows = []
    bash_pass_count = 0
    lush_pass_count = 0
    for result in results:
        b_cls, b_pass, b_turns = describe(result.bash_result)
        l_cls, l_pass, l_turns = describe(result.lush_result)
        if b_pass == "PASS":
            bash_pass_count += 1
        if l_pass == "PASS":
            lush_pass_count += 1
        body_rows.append(f"""<tr>
<td>{html.escape(result.task_name)}</td><td>{html.escape(result.category)}</td>
<td class="{b_cls}">{b_pass}</td><td>{b_turns}</td>
<td class="{l_cls}">{l_pass}</td><td>{l_turns}</td>
</tr>""")

    total = len(results)
    table_body = "".join(body_rows)
    return f"""<table>
<thead><tr>
<th>Task</th><th>Cat</th>
<th>Bash</th><th>Turns</th>
<th>Lush</th><th>Turns</th>
</tr></thead>
<tbody>{table_body}</tbody>
<tfoot><tr>
<td><strong>Total</strong></td><td></td>
<td><strong>{bash_pass_count}/{total}</strong></td><td></td>
<td><strong>{lush_pass_count}/{total}</strong></td><td></td>
</tr></tfoot>
</table>"""
def _build_detail_html(results: list[BenchmarkResult]) -> str:
    """Return one HTML ``<div class="task-detail">`` section per result.

    Each section contains:

    * a heading with task name, ``[category/mode]`` and bash/lush PASS/FAIL
      (a missing result counts as FAIL),
    * a table of Likert scores with the ``lush - bash`` difference, where a
      missing score is shown as ``-``,
    * the free-form observations returned by ``_get_freeform``.

    All dynamic text is HTML-escaped before insertion, including the
    category, the mode and the observation language key.
    """
    sections = []
    for r in results:
        b_status = "PASS" if r.bash_result and r.bash_result.all_passed else "FAIL"
        l_status = "PASS" if r.lush_result and r.lush_result.all_passed else "FAIL"
        b_cls = "pass" if b_status == "PASS" else "fail"
        l_cls = "pass" if l_status == "PASS" else "fail"

        scores = _get_likert_scores(r)
        score_rows = []
        for key, label in LIKERT_QUESTIONS:
            b_val = scores[key]["bash"]
            l_val = scores[key]["lush"]
            b_str = f"{b_val:.0f}" if b_val is not None else "-"
            l_str = f"{l_val:.0f}" if l_val is not None else "-"
            if b_val is not None and l_val is not None:
                d = l_val - b_val
                d_str = f"+{d:.0f}" if d > 0 else f"{d:.0f}"
                d_cls = "pos" if d > 0 else "neg" if d < 0 else ""
            else:
                d_str = "-"
                d_cls = ""
            score_rows.append(
                f'<tr><td>{html.escape(label)}</td>'
                f'<td>{b_str}</td><td>{l_str}</td>'
                f'<td class="{d_cls}">{d_str}</td></tr>'
            )

        # Format first, then escape, so the text matches what was rendered
        # before while markup characters are neutralised.
        obs_html = "".join(
            f'<p><strong>{html.escape(f"{lang}")}:</strong> {html.escape(text)}</p>\n'
            for lang, text in _get_freeform(r).items()
        )
        cat_mode = html.escape(f"{r.category}/{r.mode}")

        sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{cat_mode}]</span>
<span class="{b_cls}">bash={b_status}</span>
<span class="{l_cls}">lush={l_status}</span>
</h3>
<table class="scores">
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(score_rows)}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")
    return "\n".join(sections)
def export_html(results_dir: Path, output_path: Path) -> None:
    """Write a self-contained HTML benchmark report to *output_path*.

    Results are loaded with ``load_latest_results(results_dir)``. With no
    results, a minimal "No results found." page is written instead. All
    charts are embedded as base64 PNG ``data:`` URIs, so the report is a
    single file.

    Args:
        results_dir: Directory passed to ``load_latest_results``.
        output_path: Destination file. Missing parent directories are
            created. The file is always written as UTF-8, matching the
            charset declared in the page.
    """
    results = load_latest_results(results_dir)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if not results:
        output_path.write_text(
            "<html><body><p>No results found.</p></body></html>", encoding="utf-8"
        )
        return

    chart_questionnaire = chart_questionnaire_comparison(results)
    chart_turns = chart_turns_comparison(results)
    chart_heatmap = chart_per_task_heatmap(results)
    chart_cat_quest = chart_per_category_questionnaire(results)
    cat_radar_charts = chart_per_category_radar(results)
    summary_table = _build_summary_html(results)
    cat_summary_table = _build_per_category_summary_html(results)
    detail_html = _build_detail_html(results)

    # `results` is non-empty here; fall back only when the model is blank.
    model = results[0].model or "unknown"
    timestamp = max(r.timestamp for r in results)

    # Category names go into both element text and an attribute, so escape
    # them; html.escape also escapes quotes by default.
    category_sections = "".join(
        f'<h3>{html.escape(cat)}</h3><div class="chart">'
        f'<img src="data:image/png;base64,{img}" alt="{html.escape(cat)} breakdown"></div>'
        for cat, img in cat_radar_charts
    )

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Lush vs Bash Benchmark Report</title>
<style>
:root {{ --bash: {BASH_COLOR}; --lush: {LUSH_COLOR}; }}
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
max-width: 960px; margin: 40px auto; padding: 0 20px; color: #1a1a1a; line-height: 1.5; }}
h1 {{ font-size: 1.8rem; margin-bottom: 4px; }}
h2 {{ font-size: 1.3rem; margin: 32px 0 16px; border-bottom: 2px solid #e0e0e0; padding-bottom: 6px; }}
h3 {{ font-size: 1.05rem; margin-bottom: 10px; }}
.meta {{ color: #666; font-size: 0.9rem; margin-bottom: 24px; }}
table {{ border-collapse: collapse; width: 100%; margin: 12px 0 20px; font-size: 0.9rem; }}
th, td {{ padding: 8px 12px; text-align: left; border-bottom: 1px solid #e0e0e0; }}
th {{ background: #f5f5f5; font-weight: 600; }}
td.pass {{ color: #2d8a4e; font-weight: 600; }}
td.fail {{ color: #d32f2f; font-weight: 600; }}
td.pos {{ color: #2d8a4e; }}
td.neg {{ color: #d32f2f; }}
tfoot td {{ font-weight: 600; border-top: 2px solid #333; }}
.chart {{ text-align: center; margin: 20px 0; }}
.chart img {{ max-width: 100%; height: auto; border: 1px solid #e0e0e0; border-radius: 4px; }}
.task-detail {{ margin: 20px 0 30px; padding: 16px; background: #fafafa; border-radius: 6px; border: 1px solid #e8e8e8; }}
.task-detail h3 {{ margin-bottom: 12px; }}
.task-detail .cat {{ color: #888; font-weight: normal; }}
.task-detail .pass {{ color: #2d8a4e; font-size: 0.85rem; margin-left: 8px; }}
.task-detail .fail {{ color: #d32f2f; font-size: 0.85rem; margin-left: 8px; }}
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>
<h1>Lush vs Bash Benchmark Report</h1>
<p class="meta">Model: {html.escape(model)} &middot; Latest run: {html.escape(timestamp)} &middot; Tasks: {len(results)}</p>
<h2>Summary</h2>
{summary_table}
<h2>Per-Category Summary</h2>
{cat_summary_table}
<h2>Questionnaire Scores</h2>
<div class="chart"><img src="data:image/png;base64,{chart_questionnaire}" alt="Questionnaire comparison"></div>
<h2>Questionnaire Scores by Category</h2>
<div class="chart"><img src="data:image/png;base64,{chart_cat_quest}" alt="Per-category questionnaire"></div>
<h2>Agent Turns (Solve Mode)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_turns}" alt="Turns comparison"></div>
<h2>Score Difference Heatmap (Lush - Bash)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_heatmap}" alt="Score heatmap"></div>
<h2>Per-Category Breakdown</h2>
{category_sections}
<h2>Per-Task Detail</h2>
{detail_html}
</body>
</html>"""
    # Explicit UTF-8: the platform default may differ from the declared charset.
    output_path.write_text(page, encoding="utf-8")