Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality. - CLI with run, run-all, list-tasks, report, and export commands - Agent loop with retry support via Anthropic Claude provider - Test harness executing solutions in sandboxed subprocesses - LLM-driven questionnaire for subjective code quality evaluation - HTML report export with charts (matplotlib) - 8 Category A tasks (write-from-scratch in both languages) - 4 Category B tasks (verify provided Bash, convert to Lush) - Lush language reference for agent context
308 lines · 12 KiB · Python
from __future__ import annotations
|
|
|
|
import base64
|
|
import html
|
|
import io
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.ticker as ticker
|
|
|
|
from .models import BenchmarkResult
|
|
from .report import (
|
|
LIKERT_QUESTIONS,
|
|
_get_freeform,
|
|
_get_likert_scores,
|
|
_parse_likert,
|
|
load_latest_results,
|
|
)
|
|
|
|
# Fixed hex palette shared by every chart and the HTML page's CSS theme, so
# each language is rendered with the same color throughout the report.
BASH_COLOR = "#4E79A7"
LUSH_COLOR = "#E15759"
# NOTE(review): NEUTRAL_COLOR is not referenced in this module's visible code —
# confirm it is used elsewhere before removing.
NEUTRAL_COLOR = "#999999"
|
|
|
|
|
|
def _fig_to_base64(fig: plt.Figure) -> str:
    """Render *fig* as a PNG and return it base64-encoded for inline HTML.

    The figure is closed after rendering so repeated chart generation does
    not accumulate open matplotlib figures.
    """
    with io.BytesIO() as buffer:
        fig.savefig(buffer, format="png", dpi=150, bbox_inches="tight", facecolor="white")
        plt.close(fig)
        png_bytes = buffer.getvalue()
    return base64.b64encode(png_bytes).decode()
|
|
|
|
|
|
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
    """Return {question_key: {bash: avg, lush: avg}}."""
    # Bucket every available score by question key and language.
    buckets: dict[str, dict[str, list[float]]] = {
        key: {"bash": [], "lush": []} for key, _ in LIKERT_QUESTIONS
    }
    for result in results:
        scores = _get_likert_scores(result)
        for key, per_lang in scores.items():
            for lang in ("bash", "lush"):
                value = per_lang[lang]
                if value is not None:
                    buckets[key][lang].append(value)

    # Average each bucket; an empty bucket averages to 0.0 rather than NaN.
    averaged: dict[str, dict[str, float]] = {}
    for key, per_lang in buckets.items():
        averaged[key] = {
            lang: sum(vals) / len(vals) if vals else 0.0
            for lang, vals in per_lang.items()
        }
    return averaged
|
|
|
|
|
|
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
    """Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
    averages = _aggregate_likert(results)
    labels = [label for _, label in LIKERT_QUESTIONS]
    bash_vals = [averages[key]["bash"] for key, _ in LIKERT_QUESTIONS]
    lush_vals = [averages[key]["lush"] for key, _ in LIKERT_QUESTIONS]

    fig, ax = plt.subplots(figsize=(8, 4.5))
    positions = range(len(labels))
    bar_h = 0.35
    # Offset the two series vertically so each metric shows a bash/lush pair.
    bars_bash = ax.barh([p + bar_h / 2 for p in positions], bash_vals, bar_h,
                        label="bash", color=BASH_COLOR)
    bars_lush = ax.barh([p - bar_h / 2 for p in positions], lush_vals, bar_h,
                        label="lush", color=LUSH_COLOR)

    ax.set_yticks(list(positions))
    ax.set_yticklabels(labels)
    ax.set_xlim(0, 5.5)  # headroom past 5 so the value labels fit
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.set_xlabel("Score (1-5)")
    ax.set_title("Questionnaire Scores: Bash vs Lush")
    ax.legend(loc="lower right")
    ax.invert_yaxis()  # first question at the top

    # Annotate every bar (both series) with its numeric value.
    for bar in [*bars_bash, *bars_lush]:
        width = bar.get_width()
        ax.text(width + 0.08, bar.get_y() + bar.get_height() / 2,
                f"{width:.1f}", va="center", fontsize=8)

    ax.grid(axis="x", alpha=0.3)
    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
    """Bar chart of agent turns per task for bash vs lush."""
    # Turn counts are only charted for Category A tasks; a missing
    # per-language result is plotted as 0 turns.
    category_a = [r for r in results if r.category == "a"]
    task_names = [r.task_name for r in category_a]
    bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in category_a]
    lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in category_a]

    fig, ax = plt.subplots(figsize=(8, 4))
    positions = range(len(task_names))
    bar_w = 0.35
    ax.bar([p - bar_w / 2 for p in positions], bash_turns, bar_w,
           label="bash", color=BASH_COLOR)
    ax.bar([p + bar_w / 2 for p in positions], lush_turns, bar_w,
           label="lush", color=LUSH_COLOR)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(task_names, rotation=35, ha="right", fontsize=8)
    ax.set_ylabel("Agent Turns")
    ax.set_title("Agent Turns to Solve (Category A)")
    # Turn counts are integers; keep the y-axis ticks whole numbers.
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.legend()
    ax.grid(axis="y", alpha=0.3)
    return _fig_to_base64(fig)
|
|
|
|
|
|
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
    """Heatmap showing lush-minus-bash score diff per task and metric."""
    labels = [label for _, label in LIKERT_QUESTIONS]
    tasks = [r.task_name for r in results]

    # One row per task, one column per metric; a pair with a missing score
    # on either side contributes 0.0 (neutral).
    data: list[list[float]] = []
    for result in results:
        scores = _get_likert_scores(result)
        row: list[float] = []
        for key, _ in LIKERT_QUESTIONS:
            bash_score = scores[key]["bash"]
            lush_score = scores[key]["lush"]
            if bash_score is None or lush_score is None:
                row.append(0.0)
            else:
                row.append(lush_score - bash_score)
        data.append(row)

    fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
    heat = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)

    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
    ax.set_yticks(range(len(tasks)))
    ax.set_yticklabels(tasks, fontsize=8)

    # Overlay the signed diff in each cell; use white text on strongly
    # colored cells (|diff| >= 2) for contrast.
    for row_idx in range(len(tasks)):
        for col_idx in range(len(labels)):
            value = data[row_idx][col_idx]
            if value > 0:
                cell_text = f"+{value:.0f}"
            elif value < 0:
                cell_text = f"{value:.0f}"
            else:
                cell_text = "0"
            ax.text(col_idx, row_idx, cell_text, ha="center", va="center",
                    fontsize=8, color="white" if abs(value) >= 2 else "black")

    ax.set_title("Score Difference (Lush - Bash)")
    fig.colorbar(heat, ax=ax, shrink=0.8, label="Lush advantage")
    return _fig_to_base64(fig)
|
|
|
|
|
|
def _build_summary_html(results: list[BenchmarkResult]) -> str:
|
|
rows = []
|
|
for r in results:
|
|
b = r.bash_result
|
|
l = r.lush_result
|
|
b_cls = "pass" if b and b.all_passed else "fail"
|
|
l_cls = "pass" if l and l.all_passed else "fail"
|
|
b_pass = "PASS" if b and b.all_passed else "FAIL"
|
|
l_pass = "PASS" if l and l.all_passed else "FAIL"
|
|
b_turns = str(b.agent_turns) if b else "-"
|
|
l_turns = str(l.agent_turns) if l else "-"
|
|
rows.append(f"""<tr>
|
|
<td>{html.escape(r.task_name)}</td><td>{r.category.upper()}</td>
|
|
<td class="{b_cls}">{b_pass}</td><td>{b_turns}</td>
|
|
<td class="{l_cls}">{l_pass}</td><td>{l_turns}</td>
|
|
</tr>""")
|
|
|
|
b_passed = sum(1 for r in results if r.bash_result and r.bash_result.all_passed)
|
|
l_passed = sum(1 for r in results if r.lush_result and r.lush_result.all_passed)
|
|
total = len(results)
|
|
|
|
return f"""<table>
|
|
<thead><tr>
|
|
<th>Task</th><th>Cat</th>
|
|
<th>Bash</th><th>Turns</th>
|
|
<th>Lush</th><th>Turns</th>
|
|
</tr></thead>
|
|
<tbody>{"".join(rows)}</tbody>
|
|
<tfoot><tr>
|
|
<td><strong>Total</strong></td><td></td>
|
|
<td><strong>{b_passed}/{total}</strong></td><td></td>
|
|
<td><strong>{l_passed}/{total}</strong></td><td></td>
|
|
</tr></tfoot>
|
|
</table>"""
|
|
|
|
|
|
def _build_detail_html(results: list[BenchmarkResult]) -> str:
    """Render one detail section per task: pass/fail, score table, observations."""
    sections: list[str] = []
    for result in results:
        bash_status = "PASS" if result.bash_result and result.bash_result.all_passed else "FAIL"
        lush_status = "PASS" if result.lush_result and result.lush_result.all_passed else "FAIL"

        scores = _get_likert_scores(result)
        metric_rows: list[str] = []
        for key, label in LIKERT_QUESTIONS:
            bash_val = scores[key]["bash"]
            lush_val = scores[key]["lush"]
            bash_cell = "-" if bash_val is None else f"{bash_val:.0f}"
            lush_cell = "-" if lush_val is None else f"{lush_val:.0f}"
            if bash_val is None or lush_val is None:
                diff_cell, diff_cls = "-", ""
            else:
                diff = lush_val - bash_val
                diff_cell = f"+{diff:.0f}" if diff > 0 else f"{diff:.0f}"
                diff_cls = "pos" if diff > 0 else ("neg" if diff < 0 else "")
            metric_rows.append(f'<tr><td>{html.escape(label)}</td>'
                               f'<td>{bash_cell}</td><td>{lush_cell}</td>'
                               f'<td class="{diff_cls}">{diff_cell}</td></tr>')

        # Free-form reviewer observations, one paragraph per language.
        observations = "".join(
            f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
            for lang, text in _get_freeform(result).items()
        )

        sections.append(f"""
<div class="task-detail">
<h3>{html.escape(result.task_name)} <span class="cat">[{result.category}]</span>
<span class="{"pass" if bash_status == "PASS" else "fail"}">bash={bash_status}</span>
<span class="{"pass" if lush_status == "PASS" else "fail"}">lush={lush_status}</span>
</h3>
<table class="scores">
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(metric_rows)}</tbody>
</table>
<div class="observations">{observations}</div>
</div>""")

    return "\n".join(sections)
|
|
|
|
|
|
def export_html(results_dir: Path, output_path: Path) -> None:
    """Generate a self-contained HTML benchmark report at *output_path*.

    Loads the latest results from *results_dir*, renders the three charts as
    inline base64 PNG images, and writes a single standalone page. When no
    results are found, writes a small placeholder page instead.

    Fixes vs. previous version: both writes now use an explicit UTF-8
    encoding (the page declares ``charset=utf-8``, so relying on the locale
    default could mis-encode on some platforms), and the output directory is
    created before *either* write path, not just the main one.
    """
    # Ensure the destination directory exists for both write paths below.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    results = load_latest_results(results_dir)
    if not results:
        output_path.write_text(
            "<html><body><p>No results found.</p></body></html>",
            encoding="utf-8",
        )
        return

    chart_questionnaire = chart_questionnaire_comparison(results)
    chart_turns = chart_turns_comparison(results)
    chart_heatmap = chart_per_task_heatmap(results)
    summary_table = _build_summary_html(results)
    detail_html = _build_detail_html(results)

    # results is guaranteed non-empty here (early return above).
    model = results[0].model
    timestamp = max(r.timestamp for r in results)

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Lush vs Bash Benchmark Report</title>
<style>
:root {{ --bash: {BASH_COLOR}; --lush: {LUSH_COLOR}; }}
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
max-width: 960px; margin: 40px auto; padding: 0 20px; color: #1a1a1a; line-height: 1.5; }}
h1 {{ font-size: 1.8rem; margin-bottom: 4px; }}
h2 {{ font-size: 1.3rem; margin: 32px 0 16px; border-bottom: 2px solid #e0e0e0; padding-bottom: 6px; }}
h3 {{ font-size: 1.05rem; margin-bottom: 10px; }}
.meta {{ color: #666; font-size: 0.9rem; margin-bottom: 24px; }}
table {{ border-collapse: collapse; width: 100%; margin: 12px 0 20px; font-size: 0.9rem; }}
th, td {{ padding: 8px 12px; text-align: left; border-bottom: 1px solid #e0e0e0; }}
th {{ background: #f5f5f5; font-weight: 600; }}
td.pass {{ color: #2d8a4e; font-weight: 600; }}
td.fail {{ color: #d32f2f; font-weight: 600; }}
td.pos {{ color: #2d8a4e; }}
td.neg {{ color: #d32f2f; }}
tfoot td {{ font-weight: 600; border-top: 2px solid #333; }}
.chart {{ text-align: center; margin: 20px 0; }}
.chart img {{ max-width: 100%; height: auto; border: 1px solid #e0e0e0; border-radius: 4px; }}
.task-detail {{ margin: 20px 0 30px; padding: 16px; background: #fafafa; border-radius: 6px; border: 1px solid #e8e8e8; }}
.task-detail h3 {{ margin-bottom: 12px; }}
.task-detail .cat {{ color: #888; font-weight: normal; }}
.task-detail .pass {{ color: #2d8a4e; font-size: 0.85rem; margin-left: 8px; }}
.task-detail .fail {{ color: #d32f2f; font-size: 0.85rem; margin-left: 8px; }}
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>

<h1>Lush vs Bash Benchmark Report</h1>
<p class="meta">Model: {html.escape(model)} · Latest run: {html.escape(timestamp)} · Tasks: {len(results)}</p>

<h2>Summary</h2>
{summary_table}

<h2>Questionnaire Scores</h2>
<div class="chart"><img src="data:image/png;base64,{chart_questionnaire}" alt="Questionnaire comparison"></div>

<h2>Agent Turns (Category A)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_turns}" alt="Turns comparison"></div>

<h2>Score Difference Heatmap (Lush - Bash)</h2>
<div class="chart"><img src="data:image/png;base64,{chart_heatmap}" alt="Score heatmap"></div>

<h2>Per-Task Detail</h2>
{detail_html}

</body>
</html>"""

    output_path.write_text(page, encoding="utf-8")
|