Files
lush_grading/lush_bench/report.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00

229 lines
8.5 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
from .models import BenchmarkResult
# Likert questions in order (must match questionnaire.py QUESTIONS).
# Each entry is a (match_key, display_label) pair; the key is compared
# against question text with startswith, the label is used in rendering.
# Key and label are currently identical.
LIKERT_QUESTIONS = [
    (question, question)
    for question in (
        "Readability",
        "Expressiveness",
        "Conciseness",
        "Error handling",
        "Overall preference",
        "Learning curve",
    )
]
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
    """Load results, keeping only the latest run per task name.

    Run directories are visited in lexicographically sorted order, so a
    later run's result overwrites an earlier one for the same task.
    Returns results sorted by (category, task_name).
    """
    by_task: dict[str, BenchmarkResult] = {}
    for run_dir in sorted(results_dir.iterdir()):
        candidate = run_dir / "result.json"
        if not candidate.exists():
            continue
        payload = json.loads(candidate.read_text())
        result = BenchmarkResult.from_dict(payload)
        by_task[result.task_name] = result
    return sorted(by_task.values(), key=lambda res: (res.category, res.task_name))
def _parse_likert(selected: str | int) -> int | None:
"""Extract numeric value from a likert response like '4 - Agree'."""
if isinstance(selected, int):
return selected
s = str(selected).strip()
if s and s[0].isdigit():
return int(s[0])
return None
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
    """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
    scores: dict[str, dict[str, float | None]] = {
        key: {"bash": None, "lush": None} for key, _ in LIKERT_QUESTIONS
    }
    for language, lang_result in (("bash", result.bash_result), ("lush", result.lush_result)):
        if not lang_result:
            continue
        for answer in lang_result.questionnaire:
            # First question key whose text prefix matches, if any.
            matched = next(
                (key for key, _ in LIKERT_QUESTIONS if answer.question.startswith(key)),
                None,
            )
            if matched is None:
                continue
            numeric = _parse_likert(answer.selected)
            if numeric is not None:
                scores[matched][language] = float(numeric)
    return scores
def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
"""Render a small horizontal bar."""
filled = int(round(value / max_val * width))
return "\u2588" * filled + "\u2591" * (width - filled)
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
"""Extract free-form observations per language."""
obs: dict[str, str] = {}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
if q.question.startswith("Free-form"):
obs[lang_name] = str(q.selected)
break
return obs
def render_summary_table(results: list[BenchmarkResult]) -> str:
    """Render the pass/fail + turns overview table."""
    out: list[str] = [
        "",
        "=" * 78,
        " BENCHMARK RESULTS SUMMARY",
        "=" * 78,
        "",
        f" {'Task':<22s} {'Cat':>3s} {'Bash':^14s} {'Lush':^14s}",
        f" {'':<22s} {'':>3s} {'pass turns':^14s} {'pass turns':^14s}",
        " " + "-" * 60,
    ]
    for item in results:
        bash = item.bash_result
        lush = item.lush_result
        # "-" marks a language that was never attempted for this task.
        bash_status = "PASS" if bash and bash.all_passed else "FAIL" if bash else "-"
        lush_status = "PASS" if lush and lush.all_passed else "FAIL" if lush else "-"
        bash_turns = str(bash.agent_turns) if bash else "-"
        lush_turns = str(lush.agent_turns) if lush else "-"
        out.append(f" {item.task_name:<22s} [{item.category}] {bash_status:>4s} {bash_turns:>5s} {lush_status:>4s} {lush_turns:>5s}")
    # Totals: pass counts over attempted runs; average turns over runs with > 0 turns.
    bash_runs = [r.bash_result for r in results if r.bash_result]
    lush_runs = [r.lush_result for r in results if r.lush_result]
    bash_passed = sum(1 for run in bash_runs if run.all_passed)
    lush_passed = sum(1 for run in lush_runs if run.all_passed)
    bash_turn_counts = [run.agent_turns for run in bash_runs if run.agent_turns > 0]
    lush_turn_counts = [run.agent_turns for run in lush_runs if run.agent_turns > 0]
    bash_avg = sum(bash_turn_counts) / len(bash_turn_counts) if bash_turn_counts else 0.0
    lush_avg = sum(lush_turn_counts) / len(lush_turn_counts) if lush_turn_counts else 0.0
    out.append(" " + "-" * 60)
    out.append(f" {'TOTAL':<22s} {bash_passed}/{len(bash_runs):>2d} {bash_avg:>5.1f} {lush_passed}/{len(lush_runs):>2d} {lush_avg:>5.1f}")
    out.append(f" {'':27s}{'pass avg turns':^14s} {'pass avg turns':^14s}")
    out.append("")
    return "\n".join(out)
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
    """Render aggregated questionnaire scores with bar charts."""

    def fmt_diff(delta: float) -> str:
        # Signed one-decimal delta; an exact zero gets a leading space to align.
        if delta > 0:
            return f"+{delta:.1f}"
        if delta < 0:
            return f"{delta:.1f}"
        return " 0.0"

    def mean(values: list[float]) -> float:
        return sum(values) / len(values) if values else 0.0

    out: list[str] = [
        "=" * 78,
        " QUESTIONNAIRE SCORES (1-5 Likert, higher = better)",
        "=" * 78,
        "",
    ]
    # Aggregate scores across all tasks
    agg: dict[str, dict[str, list[float]]] = {
        key: {"bash": [], "lush": []} for key, _ in LIKERT_QUESTIONS
    }
    for result in results:
        for key, per_lang in _get_likert_scores(result).items():
            for lang in ("bash", "lush"):
                score = per_lang[lang]
                if score is not None:
                    agg[key][lang].append(score)
    for key, label in LIKERT_QUESTIONS:
        bash_avg = mean(agg[key]["bash"])
        lush_avg = mean(agg[key]["lush"])
        out.append(f" {label}")
        out.append(f" bash {_bar(bash_avg)} {bash_avg:.1f}")
        out.append(f" lush {_bar(lush_avg)} {lush_avg:.1f} ({fmt_diff(lush_avg - bash_avg)})")
        out.append("")
    # Overall average
    bash_all = [v for per_lang in agg.values() for v in per_lang["bash"]]
    lush_all = [v for per_lang in agg.values() for v in per_lang["lush"]]
    bash_overall = mean(bash_all)
    lush_overall = mean(lush_all)
    out.append(" " + "-" * 50)
    out.append(" Overall average")
    out.append(f" bash {_bar(bash_overall)} {bash_overall:.1f}")
    out.append(f" lush {_bar(lush_overall)} {lush_overall:.1f} ({fmt_diff(lush_overall - bash_overall)})")
    out.append("")
    return "\n".join(out)
def render_per_task_detail(results: list[BenchmarkResult]) -> str:
    """Render per-task questionnaire breakdown."""
    out: list[str] = ["=" * 78, " PER-TASK DETAIL", "=" * 78]
    for result in results:
        # Note: a missing language result renders as FAIL here (unlike the
        # summary table, which shows "-").
        bash_status = "PASS" if result.bash_result and result.bash_result.all_passed else "FAIL"
        lush_status = "PASS" if result.lush_result and result.lush_result.all_passed else "FAIL"
        out.append("")
        out.append(f" {result.task_name} [{result.category}] bash={bash_status} lush={lush_status}")
        out.append("")
        scores = _get_likert_scores(result)
        out.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
        out.append(" " + "-" * 40)
        for key, label in LIKERT_QUESTIONS:
            bash_val = scores[key]["bash"]
            lush_val = scores[key]["lush"]
            bash_cell = f"{bash_val:.0f}" if bash_val is not None else "-"
            lush_cell = f"{lush_val:.0f}" if lush_val is not None else "-"
            if bash_val is None or lush_val is None:
                diff_cell = "-"
            else:
                delta = lush_val - bash_val
                diff_cell = f"+{delta:.0f}" if delta > 0 else f"{delta:.0f}" if delta < 0 else "0"
            out.append(f" {label:<22s} {bash_cell:>4s} {lush_cell:>4s} {diff_cell:>5s}")
        # Free-form observations
        observations = _get_freeform(result)
        if observations:
            out.append("")
            for lang, text in observations.items():
                # Truncate long text to 120 chars with an ellipsis.
                wrapped = text[:120] + ("..." if len(text) > 120 else "")
                out.append(f" {lang}: {wrapped}")
    out.append("")
    return "\n".join(out)
def render_report(results_dir: Path) -> str:
    """Generate full report (summary table, questionnaire comparison, per-task detail)."""
    results = load_latest_results(results_dir)
    if not results:
        return "No results found."
    sections = (
        render_summary_table(results),
        render_questionnaire_comparison(results),
        render_per_task_detail(results),
    )
    return "\n".join(sections)