from __future__ import annotations import base64 import html import io from pathlib import Path import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.ticker as ticker from .models import BenchmarkResult from .report import ( LIKERT_QUESTIONS, _get_freeform, _get_likert_scores, _parse_likert, load_latest_results, ) BASH_COLOR = "#4E79A7" LUSH_COLOR = "#E15759" NEUTRAL_COLOR = "#999999" def _fig_to_base64(fig: plt.Figure) -> str: buf = io.BytesIO() fig.savefig(buf, format="png", dpi=150, bbox_inches="tight", facecolor="white") plt.close(fig) buf.seek(0) return base64.b64encode(buf.read()).decode() def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]: """Return {question_key: {bash: avg, lush: avg}}.""" agg: dict[str, dict[str, list[float]]] = {} for key, _ in LIKERT_QUESTIONS: agg[key] = {"bash": [], "lush": []} for r in results: scores = _get_likert_scores(r) for key in scores: for lang in ("bash", "lush"): val = scores[key][lang] if val is not None: agg[key][lang].append(val) return { key: { lang: (sum(vals) / len(vals)) if vals else 0.0 for lang, vals in agg[key].items() } for key in agg } def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str: """Grouped horizontal bar chart comparing bash vs lush on each Likert metric.""" avgs = _aggregate_likert(results) labels = [label for _, label in LIKERT_QUESTIONS] bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS] lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS] fig, ax = plt.subplots(figsize=(8, 4.5)) y = range(len(labels)) bar_h = 0.35 bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR) bars_lush = ax.barh([i - bar_h / 2 for i in y], lush_vals, bar_h, label="lush", color=LUSH_COLOR) ax.set_yticks(list(y)) ax.set_yticklabels(labels) ax.set_xlim(0, 5.5) ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) ax.set_xlabel("Score (1-5)") ax.set_title("Questionnaire Scores: Bash vs Lush") ax.legend(loc="lower right") ax.invert_yaxis() for bar in bars_bash: w = bar.get_width() ax.text(w + 0.08, bar.get_y() + bar.get_height() / 2, f"{w:.1f}", va="center", fontsize=8) for bar in bars_lush: w = bar.get_width() ax.text(w + 0.08, bar.get_y() + bar.get_height() / 2, f"{w:.1f}", va="center", fontsize=8) ax.grid(axis="x", alpha=0.3) return _fig_to_base64(fig) def chart_turns_comparison(results: list[BenchmarkResult]) -> str: """Bar chart of agent turns per task for bash vs lush.""" # Only include tasks where the agent actually solved (turns > 0) cat_a = [r for r in results if r.category == "a"] names = [r.task_name for r in cat_a] bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in cat_a] lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in cat_a] fig, ax = plt.subplots(figsize=(8, 4)) x = range(len(names)) bar_w = 0.35 ax.bar([i - bar_w / 2 for i in x], bash_turns, bar_w, label="bash", color=BASH_COLOR) ax.bar([i + bar_w / 2 for i in x], lush_turns, bar_w, label="lush", color=LUSH_COLOR) ax.set_xticks(list(x)) ax.set_xticklabels(names, rotation=35, ha="right", fontsize=8) ax.set_ylabel("Agent Turns") ax.set_title("Agent Turns to Solve (Category A)") ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True)) ax.legend() ax.grid(axis="y", alpha=0.3) return _fig_to_base64(fig) def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str: """Heatmap showing lush-minus-bash score diff per task and metric.""" labels = [label for _, label in LIKERT_QUESTIONS] tasks = [r.task_name for r in results] data: list[list[float]] = [] for r in results: scores = _get_likert_scores(r) row = [] for key, _ in LIKERT_QUESTIONS: b = scores[key]["bash"] l = scores[key]["lush"] if b is not None and l is not None: row.append(l - b) else: row.append(0.0) data.append(row) fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1))) im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3) ax.set_xticks(range(len(labels))) ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8) ax.set_yticks(range(len(tasks))) ax.set_yticklabels(tasks, fontsize=8) for i in range(len(tasks)): for j in range(len(labels)): val = data[i][j] text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0" ax.text(j, i, text, ha="center", va="center", fontsize=8, color="white" if abs(val) >= 2 else "black") ax.set_title("Score Difference (Lush - Bash)") fig.colorbar(im, ax=ax, shrink=0.8, label="Lush advantage") return _fig_to_base64(fig) def _build_summary_html(results: list[BenchmarkResult]) -> str: rows = [] for r in results: b = r.bash_result l = r.lush_result b_cls = "pass" if b and b.all_passed else "fail" l_cls = "pass" if l and l.all_passed else "fail" b_pass = "PASS" if b and b.all_passed else "FAIL" l_pass = "PASS" if l and l.all_passed else "FAIL" b_turns = str(b.agent_turns) if b else "-" l_turns = str(l.agent_turns) if l else "-" rows.append(f"""
| Task | Cat | Bash | Turns | Lush | Turns |
|---|---|---|---|---|---|
| Total | {b_passed}/{total} | {l_passed}/{total} |
{lang}: {html.escape(text)}
\n' sections.append(f"""| Metric | Bash | Lush | Diff |
|---|
No results found.
") return chart_questionnaire = chart_questionnaire_comparison(results) chart_turns = chart_turns_comparison(results) chart_heatmap = chart_per_task_heatmap(results) summary_table = _build_summary_html(results) detail_html = _build_detail_html(results) model = results[0].model if results else "unknown" timestamp = max(r.timestamp for r in results) page = f"""