Replace category_a/category_b directories with algorithm, pipeline, environment, filesystem, and process. Add separate mode field (solve/convert) to decouple orchestration from capability grouping. Add per-category summary and questionnaire breakdowns to both terminal report and HTML export.
465 lines
18 KiB
Python
465 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
import base64
|
|
import html
|
|
import io
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.ticker as ticker
|
|
|
|
from .models import BenchmarkResult
|
|
from .report import (
|
|
LIKERT_QUESTIONS,
|
|
_get_freeform,
|
|
_get_likert_scores,
|
|
_parse_likert,
|
|
load_latest_results,
|
|
)
|
|
|
|
# Series color for bash in every chart; also exported to the page CSS as --bash.
BASH_COLOR = "#4E79A7"
|
|
# Series color for lush in every chart; also exported to the page CSS as --lush.
LUSH_COLOR = "#E15759"
|
|
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether another module imports it before removing.
NEUTRAL_COLOR = "#999999"
|
|
|
|
|
|
def _fig_to_base64(fig: plt.Figure) -> str:
    """Render *fig* as a PNG and return it as a base64-encoded string.

    The figure is saved at 150 dpi with a tight bounding box and a white
    background.  It is always closed through ``plt.close`` afterwards, even
    when saving raises, so a failed render does not leave the figure
    registered with pyplot.

    Args:
        fig: The matplotlib figure to serialize.  It is closed by this call
            and must not be reused by the caller.

    Returns:
        The PNG bytes encoded as base64 text, suitable for a ``data:`` URI.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=150, bbox_inches="tight", facecolor="white")
    finally:
        # Release the figure whether or not rendering succeeded.
        plt.close(fig)
    # getvalue() returns the whole buffer regardless of the stream position.
    return base64.b64encode(buf.getvalue()).decode()
|
|
|
|
|
|
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
    """Average the Likert scores of *results* per question and per shell.

    Returns a mapping ``{question_key: {"bash": mean, "lush": mean}}`` with one
    entry for every key in ``LIKERT_QUESTIONS``.  Scores that are ``None`` are
    skipped; a question/shell pair with no scores at all averages to ``0.0``.
    """
    collected: dict[str, dict[str, list[float]]] = {
        question: {"bash": [], "lush": []} for question, _label in LIKERT_QUESTIONS
    }

    for result in results:
        per_question = _get_likert_scores(result)
        for question in per_question:
            for shell in ("bash", "lush"):
                rating = per_question[question][shell]
                if rating is None:
                    continue
                collected[question][shell].append(rating)

    averages: dict[str, dict[str, float]] = {}
    for question, by_shell in collected.items():
        averages[question] = {}
        for shell, ratings in by_shell.items():
            averages[question][shell] = (sum(ratings) / len(ratings)) if ratings else 0.0
    return averages
|
|
|
|
|
|
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
    """Draw the average Likert scores of bash and lush as paired horizontal bars.

    One pair of bars is drawn for each entry of ``LIKERT_QUESTIONS`` and every
    bar is annotated with its value to one decimal place.  The chart is
    returned as a base64-encoded PNG string.
    """
    means = _aggregate_likert(results)
    question_keys = [key for key, _label in LIKERT_QUESTIONS]
    tick_labels = [label for _key, label in LIKERT_QUESTIONS]
    bash_means = [means[key]["bash"] for key in question_keys]
    lush_means = [means[key]["lush"] for key in question_keys]

    fig, ax = plt.subplots(figsize=(8, 4.5))
    rows = range(len(tick_labels))
    thickness = 0.35
    bash_bars = ax.barh(
        [row + thickness / 2 for row in rows], bash_means, thickness, label="bash", color=BASH_COLOR
    )
    lush_bars = ax.barh(
        [row - thickness / 2 for row in rows], lush_means, thickness, label="lush", color=LUSH_COLOR
    )

    ax.set_yticks(list(rows))
    ax.set_yticklabels(tick_labels)
    ax.set_xlim(0, 5.5)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.set_xlabel("Score (1-5)")
    ax.set_title("Questionnaire Scores: Bash vs Lush")
    ax.legend(loc="lower right")
    # Flip the axis so the first question is drawn at the top.
    ax.invert_yaxis()

    # Value labels just right of each bar: bash bars first, then lush.
    for patch in [*bash_bars, *lush_bars]:
        width = patch.get_width()
        middle = patch.get_y() + patch.get_height() / 2
        ax.text(width + 0.08, middle, f"{width:.1f}", va="center", fontsize=8)

    ax.grid(axis="x", alpha=0.3)
    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
    """Plot agent turns per task for bash and lush as a grouped bar chart.

    Only results whose ``mode`` equals ``"solve"`` are charted.  A task that
    has no result for a shell is drawn with a height of 0 for that shell.
    The chart is returned as a base64-encoded PNG string.
    """
    # Results in any mode other than "solve" are left out of this chart.
    solved = [res for res in results if res.mode == "solve"]

    task_names = []
    bash_counts = []
    lush_counts = []
    for res in solved:
        task_names.append(res.task_name)
        bash_counts.append(res.bash_result.agent_turns if res.bash_result else 0)
        lush_counts.append(res.lush_result.agent_turns if res.lush_result else 0)

    fig, ax = plt.subplots(figsize=(8, 4))
    slots = range(len(task_names))
    width = 0.35
    ax.bar([slot - width / 2 for slot in slots], bash_counts, width, label="bash", color=BASH_COLOR)
    ax.bar([slot + width / 2 for slot in slots], lush_counts, width, label="lush", color=LUSH_COLOR)

    ax.set_xticks(list(slots))
    ax.set_xticklabels(task_names, rotation=35, ha="right", fontsize=8)
    ax.set_ylabel("Agent Turns")
    ax.set_title("Agent Turns to Solve (Solve Mode)")
    # Turn counts are whole numbers, so keep the y ticks on integers.
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.legend()
    ax.grid(axis="y", alpha=0.3)
    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
    """Render a task-by-metric heatmap of the lush-minus-bash score difference.

    Rows are tasks (in the order of *results*) and columns follow
    ``LIKERT_QUESTIONS``.  A cell where either score is missing is plotted as
    ``0.0``.  The chart is returned as a base64-encoded PNG string.
    """
    metric_labels = [label for _key, label in LIKERT_QUESTIONS]
    task_names = [res.task_name for res in results]

    def _delta(pair):
        # Difference for one metric; 0.0 when either side has no score.
        bash_score = pair["bash"]
        lush_score = pair["lush"]
        if bash_score is None or lush_score is None:
            return 0.0
        return lush_score - bash_score

    grid: list[list[float]] = []
    for res in results:
        per_metric = _get_likert_scores(res)
        grid.append([_delta(per_metric[key]) for key, _label in LIKERT_QUESTIONS])

    # Grow the figure with the number of tasks, but never below 4 inches tall.
    fig, ax = plt.subplots(figsize=(8, max(4, len(task_names) * 0.45 + 1)))
    im = ax.imshow(grid, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)

    ax.set_xticks(range(len(metric_labels)))
    ax.set_xticklabels(metric_labels, rotation=35, ha="right", fontsize=8)
    ax.set_yticks(range(len(task_names)))
    ax.set_yticklabels(task_names, fontsize=8)

    for row_idx, row in enumerate(grid):
        for col_idx, diff in enumerate(row):
            if diff > 0:
                cell = f"+{diff:.0f}"
            elif diff < 0:
                cell = f"{diff:.0f}"
            else:
                cell = "0"
            # White text on the saturated ends of the colormap, black elsewhere.
            shade = "white" if abs(diff) >= 2 else "black"
            ax.text(col_idx, row_idx, cell, ha="center", va="center", fontsize=8,
                    color=shade)

    ax.set_title("Score Difference (Lush - Bash)")
    fig.colorbar(im, ax=ax, shrink=0.8, label="Lush advantage")
    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
    """Plot the mean Likert score of bash and lush for every category.

    All non-``None`` scores of all questions are pooled per category and per
    shell before averaging; an empty pool averages to ``0.0``.  Categories are
    drawn in sorted order.  The chart is returned as a base64-encoded PNG
    string.
    """
    from collections import defaultdict

    def _mean(values):
        return sum(values) / len(values) if values else 0.0

    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for res in results:
        grouped[res.category].append(res)

    category_names = sorted(grouped)
    bash_means = []
    lush_means = []
    for name in category_names:
        score_sets = [_get_likert_scores(res) for res in grouped[name]]
        bash_pool: list[float] = [
            s[key]["bash"] for s in score_sets for key in s if s[key]["bash"] is not None
        ]
        lush_pool: list[float] = [
            s[key]["lush"] for s in score_sets for key in s if s[key]["lush"] is not None
        ]
        bash_means.append(_mean(bash_pool))
        lush_means.append(_mean(lush_pool))

    fig, ax = plt.subplots(figsize=(8, 4))
    slots = range(len(category_names))
    width = 0.35
    bash_bars = ax.bar([slot - width / 2 for slot in slots], bash_means, width, label="bash", color=BASH_COLOR)
    lush_bars = ax.bar([slot + width / 2 for slot in slots], lush_means, width, label="lush", color=LUSH_COLOR)

    ax.set_xticks(list(slots))
    ax.set_xticklabels(category_names, fontsize=9)
    ax.set_ylim(0, 5.5)
    ax.set_ylabel("Avg Score (1-5)")
    ax.set_title("Questionnaire Scores by Category")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    # Value labels centered above each bar: bash bars first, then lush.
    for patch in [*bash_bars, *lush_bars]:
        height = patch.get_height()
        center = patch.get_x() + patch.get_width() / 2
        ax.text(center, height + 0.08, f"{height:.1f}",
                ha="center", va="bottom", fontsize=8)

    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
    """Build one small bash-vs-lush chart per category.

    Despite the name, each chart is a grouped horizontal bar chart (not a
    radar plot) with one pair of bars per entry in ``LIKERT_QUESTIONS``.

    Args:
        results: Benchmark results to group by their ``category`` attribute.

    Returns:
        A list of ``(category, base64_png)`` tuples, ordered by category name.
    """
    from collections import defaultdict

    by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)

    charts: list[tuple[str, str]] = []
    labels = [label for _, label in LIKERT_QUESTIONS]

    for cat in sorted(by_cat):
        # Reuse the module-wide aggregation so that per-category averages are
        # computed exactly like the overall ones (None skipped, empty -> 0.0).
        avgs = _aggregate_likert(by_cat[cat])
        bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
        lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]

        fig, ax = plt.subplots(figsize=(6, 3.5))
        y = range(len(labels))
        bar_h = 0.35
        ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
        ax.barh([i - bar_h / 2 for i in y], lush_vals, bar_h, label="lush", color=LUSH_COLOR)
        ax.set_yticks(list(y))
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlim(0, 5.5)
        ax.set_title(f"{cat}", fontsize=10)
        ax.legend(fontsize=8, loc="lower right")
        # Flip the axis so the first question is drawn at the top.
        ax.invert_yaxis()
        ax.grid(axis="x", alpha=0.3)

        charts.append((cat, _fig_to_base64(fig)))

    return charts
|
|
|
|
|
|
def _build_per_category_summary_html(results: list[BenchmarkResult]) -> str:
    """Return an HTML table with one row of bash/lush statistics per category.

    Each row shows, for both shells, passed/total runs, the mean agent turns
    (counting only runs with more than zero turns) and the mean Likert score
    pooled over all questions.  Means over an empty set are rendered as 0.0.
    """
    from collections import defaultdict

    def _mean(values):
        return sum(values) / len(values) if values else 0.0

    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for res in results:
        grouped[res.category].append(res)

    body_rows = []
    for name in sorted(grouped):
        members = grouped[name]

        # Only results that actually carry a run for the shell are counted.
        bash_runs = [m.bash_result for m in members if m.bash_result]
        lush_runs = [m.lush_result for m in members if m.lush_result]
        bash_total = len(bash_runs)
        lush_total = len(lush_runs)
        bash_ok = len([run for run in bash_runs if run.all_passed])
        lush_ok = len([run for run in lush_runs if run.all_passed])

        bash_turns_mean = _mean([run.agent_turns for run in bash_runs if run.agent_turns > 0])
        lush_turns_mean = _mean([run.agent_turns for run in lush_runs if run.agent_turns > 0])

        score_sets = [_get_likert_scores(m) for m in members]
        bash_pool: list[float] = [
            s[key]["bash"] for s in score_sets for key in s if s[key]["bash"] is not None
        ]
        lush_pool: list[float] = [
            s[key]["lush"] for s in score_sets for key in s if s[key]["lush"] is not None
        ]
        bash_score_mean = _mean(bash_pool)
        lush_score_mean = _mean(lush_pool)

        body_rows.append(f"""<tr>
<td>{html.escape(name)}</td>
<td>{bash_ok}/{bash_total}</td><td>{lush_ok}/{lush_total}</td>
<td>{bash_turns_mean:.1f}</td><td>{lush_turns_mean:.1f}</td>
<td>{bash_score_mean:.1f}</td><td>{lush_score_mean:.1f}</td>
</tr>""")

    table_body = "".join(body_rows)
    return f"""<table>
<thead><tr>
<th>Category</th>
<th>Bash Pass</th><th>Lush Pass</th>
<th>Bash Avg Turns</th><th>Lush Avg Turns</th>
<th>Bash Avg Score</th><th>Lush Avg Score</th>
</tr></thead>
<tbody>{table_body}</tbody>
</table>"""
|
|
|
|
|
|
def _build_summary_html(results: list[BenchmarkResult]) -> str:
    """Return the per-task summary table as an HTML string.

    Every result contributes one row with the task name, category and, for
    each shell, a PASS/FAIL cell plus the agent turn count ("-" when the shell
    has no run).  The footer shows passed/total per shell, where total is the
    number of results.
    """

    def _cells(run):
        # (css class, verdict text, turns text) for one shell's run.
        ok = run and run.all_passed
        css = "pass" if ok else "fail"
        verdict = "PASS" if ok else "FAIL"
        turns = str(run.agent_turns) if run else "-"
        return css, verdict, turns

    body_rows = []
    for res in results:
        bash_css, bash_verdict, bash_turns = _cells(res.bash_result)
        lush_css, lush_verdict, lush_turns = _cells(res.lush_result)
        body_rows.append(f"""<tr>
<td>{html.escape(res.task_name)}</td><td>{html.escape(res.category)}</td>
<td class="{bash_css}">{bash_verdict}</td><td>{bash_turns}</td>
<td class="{lush_css}">{lush_verdict}</td><td>{lush_turns}</td>
</tr>""")

    bash_ok = len([res for res in results if res.bash_result and res.bash_result.all_passed])
    lush_ok = len([res for res in results if res.lush_result and res.lush_result.all_passed])
    task_count = len(results)
    table_body = "".join(body_rows)

    return f"""<table>
<thead><tr>
<th>Task</th><th>Cat</th>
<th>Bash</th><th>Turns</th>
<th>Lush</th><th>Turns</th>
</tr></thead>
<tbody>{table_body}</tbody>
<tfoot><tr>
<td><strong>Total</strong></td><td></td>
<td><strong>{bash_ok}/{task_count}</strong></td><td></td>
<td><strong>{lush_ok}/{task_count}</strong></td><td></td>
</tr></tfoot>
</table>"""
|
|
|
|
|
|
def _build_detail_html(results: list[BenchmarkResult]) -> str:
    """Return the per-task detail sections as one HTML string.

    For every result a ``<div class="task-detail">`` is produced containing a
    heading (task name, category/mode, pass/fail per shell), a table of the
    Likert scores with the lush-minus-bash difference, and the free-form
    observations returned by ``_get_freeform``.

    All text taken from the result (task name, category, mode, observation
    language and text) is HTML-escaped before being interpolated.

    Args:
        results: Benchmark results to render, in output order.

    Returns:
        The sections joined with newlines.
    """
    sections = []
    for r in results:
        b_status = "PASS" if r.bash_result and r.bash_result.all_passed else "FAIL"
        l_status = "PASS" if r.lush_result and r.lush_result.all_passed else "FAIL"
        b_cls = "pass" if b_status == "PASS" else "fail"
        l_cls = "pass" if l_status == "PASS" else "fail"

        scores = _get_likert_scores(r)
        score_rows = []
        for key, label in LIKERT_QUESTIONS:
            b_val = scores[key]["bash"]
            l_val = scores[key]["lush"]
            b_str = f"{b_val:.0f}" if b_val is not None else "-"
            l_str = f"{l_val:.0f}" if l_val is not None else "-"
            if b_val is not None and l_val is not None:
                d = l_val - b_val
                d_str = f"+{d:.0f}" if d > 0 else f"{d:.0f}"
                d_cls = "pos" if d > 0 else "neg" if d < 0 else ""
            else:
                # No difference can be shown unless both shells were scored.
                d_str = "-"
                d_cls = ""
            score_rows.append(f'<tr><td>{html.escape(label)}</td>'
                              f'<td>{b_str}</td><td>{l_str}</td>'
                              f'<td class="{d_cls}">{d_str}</td></tr>')

        obs_parts = []
        for lang, text in _get_freeform(r).items():
            # f"{lang}" keeps the original formatting of the key while
            # guaranteeing html.escape receives a str.
            obs_parts.append(
                f'<p><strong>{html.escape(f"{lang}")}:</strong> {html.escape(text)}</p>\n'
            )
        obs_html = "".join(obs_parts)

        # Escaped like in the summary tables; formatted first so a non-str
        # mode renders exactly as it did when interpolated directly.
        category = html.escape(f"{r.category}")
        mode = html.escape(f"{r.mode}")
        score_body = "".join(score_rows)

        sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{category}/{mode}]</span>
<span class="{b_cls}">bash={b_status}</span>
<span class="{l_cls}">lush={l_status}</span>
</h3>
<table class="scores">
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{score_body}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")

    return "\n".join(sections)
|
|
|
|
|
|
def export_html(results_dir: Path, output_path: Path) -> None:
    """Write a self-contained HTML benchmark report to *output_path*.

    Results are loaded with ``load_latest_results(results_dir)``.  When none
    are found a minimal placeholder page is written instead.  Otherwise the
    page embeds all charts as base64 PNG ``data:`` URIs together with the
    summary, per-category and per-task tables.

    The parent directory of *output_path* is created if needed, and the file
    is always written as UTF-8 to match the ``<meta charset="utf-8">``
    declaration in the page.

    Args:
        results_dir: Directory passed to ``load_latest_results``.
        output_path: Destination file; overwritten if it already exists.
    """
    results = load_latest_results(results_dir)
    if not results:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(
            "<html><body><p>No results found.</p></body></html>", encoding="utf-8"
        )
        return

    chart_questionnaire = chart_questionnaire_comparison(results)
    chart_turns = chart_turns_comparison(results)
    chart_heatmap = chart_per_task_heatmap(results)
    chart_cat_quest = chart_per_category_questionnaire(results)
    cat_radar_charts = chart_per_category_radar(results)
    summary_table = _build_summary_html(results)
    cat_summary_table = _build_per_category_summary_html(results)
    detail_html = _build_detail_html(results)

    # results is non-empty here; fall back only when the model name is missing.
    model = results[0].model or "unknown"
    timestamp = max(r.timestamp for r in results)

    # Category names end up in element text and in an attribute value, so
    # escape them (html.escape also escapes quotes by default).
    category_sections = "".join(
        f'<h3>{html.escape(cat)}</h3><div class="chart">'
        f'<img src="data:image/png;base64,{img}" alt="{html.escape(cat)} breakdown"></div>'
        for cat, img in cat_radar_charts
    )

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Lush vs Bash Benchmark Report</title>
<style>
:root {{ --bash: {BASH_COLOR}; --lush: {LUSH_COLOR}; }}
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
max-width: 960px; margin: 40px auto; padding: 0 20px; color: #1a1a1a; line-height: 1.5; }}
h1 {{ font-size: 1.8rem; margin-bottom: 4px; }}
h2 {{ font-size: 1.3rem; margin: 32px 0 16px; border-bottom: 2px solid #e0e0e0; padding-bottom: 6px; }}
h3 {{ font-size: 1.05rem; margin-bottom: 10px; }}
.meta {{ color: #666; font-size: 0.9rem; margin-bottom: 24px; }}
table {{ border-collapse: collapse; width: 100%; margin: 12px 0 20px; font-size: 0.9rem; }}
th, td {{ padding: 8px 12px; text-align: left; border-bottom: 1px solid #e0e0e0; }}
th {{ background: #f5f5f5; font-weight: 600; }}
td.pass {{ color: #2d8a4e; font-weight: 600; }}
td.fail {{ color: #d32f2f; font-weight: 600; }}
td.pos {{ color: #2d8a4e; }}
td.neg {{ color: #d32f2f; }}
tfoot td {{ font-weight: 600; border-top: 2px solid #333; }}
.chart {{ text-align: center; margin: 20px 0; }}
.chart img {{ max-width: 100%; height: auto; border: 1px solid #e0e0e0; border-radius: 4px; }}
.task-detail {{ margin: 20px 0 30px; padding: 16px; background: #fafafa; border-radius: 6px; border: 1px solid #e8e8e8; }}
.task-detail h3 {{ margin-bottom: 12px; }}
.task-detail .cat {{ color: #888; font-weight: normal; }}
.task-detail .pass {{ color: #2d8a4e; font-size: 0.85rem; margin-left: 8px; }}
.task-detail .fail {{ color: #d32f2f; font-size: 0.85rem; margin-left: 8px; }}
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>

<h1>Lush vs Bash Benchmark Report</h1>
<p class="meta">Model: {html.escape(model)} · Latest run: {html.escape(timestamp)} · Tasks: {len(results)}</p>

<h2>Summary</h2>
{summary_table}

<h2>Per-Category Summary</h2>
{cat_summary_table}

<h2>Questionnaire Scores</h2>
<div class="chart"><img src="data:image/png;base64,{chart_questionnaire}" alt="Questionnaire comparison"></div>

<h2>Questionnaire Scores by Category</h2>
<div class="chart"><img src="data:image/png;base64,{chart_cat_quest}" alt="Per-category questionnaire"></div>

<h2>Agent Turns (Solve Mode)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_turns}" alt="Turns comparison"></div>

<h2>Score Difference Heatmap (Lush - Bash)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_heatmap}" alt="Score heatmap"></div>

<h2>Per-Category Breakdown</h2>
{category_sections}

<h2>Per-Task Detail</h2>
{detail_html}

</body>
</html>"""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the page declares that charset and contains non-ASCII
    # characters, so the platform default encoding must not be used.
    output_path.write_text(page, encoding="utf-8")
|