Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop the free-form question. Responses are now stored as ints, not strings.
- Back-compat layer maps legacy keys to the new dimensions so existing results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions
--- a/lush_bench/export.py
+++ b/lush_bench/export.py
@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
 from .models import BenchmarkResult
 from .report import (
    LIKERT_QUESTIONS,
-    _get_freeform,
    _get_likert_scores,
    _parse_likert,
    load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
 def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
    """Return {question_key: {bash: avg, lush: avg}}."""
    agg: dict[str, dict[str, list[float]]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
        agg[key] = {"bash": [], "lush": []}
    for r in results:
        scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
 def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
    """Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
    avgs = _aggregate_likert(results)
-    labels = [label for _, label in LIKERT_QUESTIONS]
-    bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
-    lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]
+    bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
+    lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]

-    fig, ax = plt.subplots(figsize=(8, 4.5))
+    fig, ax = plt.subplots(figsize=(8, 7))
    y = range(len(labels))
    bar_h = 0.35
    bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:

 def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
    """Heatmap showing lush-minus-bash score diff per task and metric."""
-    labels = [label for _, label in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]
    tasks = [r.task_name for r in results]

    data: list[list[float]] = []
    for r in results:
        scores = _get_likert_scores(r)
        row = []
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
            b = scores[key]["bash"]
            l = scores[key]["lush"]
            if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
                row.append(0.0)
        data.append(row)

-    fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
+    fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
    im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)

    ax.set_xticks(range(len(labels)))
-    ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
+    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
    ax.set_yticks(range(len(tasks)))
    ax.set_yticklabels(tasks, fontsize=8)

@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
        for j in range(len(labels)):
            val = data[i][j]
            text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
-            ax.text(j, i, text, ha="center", va="center", fontsize=8,
+            ax.text(j, i, text, ha="center", va="center", fontsize=7,
                    color="white" if abs(val) >= 2 else "black")

    ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:


 def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
-    """Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
+    """Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
    import numpy as np
    from collections import defaultdict

@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
        by_cat[r.category].append(r)

    charts: list[tuple[str, str]] = []
-    labels = [label for _, label in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]

    for cat in sorted(by_cat):
        cat_results = by_cat[cat]
        agg: dict[str, dict[str, list[float]]] = {}
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
            agg[key] = {"bash": [], "lush": []}
        for r in cat_results:
            scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
                    if val is not None:
                        agg[key][lang].append(val)

-        bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
-        lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
+        bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
+        lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]

-        fig, ax = plt.subplots(figsize=(6, 3.5))
+        fig, ax = plt.subplots(figsize=(7, 5))
        y = range(len(labels))
        bar_h = 0.35
        ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:

        scores = _get_likert_scores(r)
        score_rows = []
-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
            b_val = scores[key]["bash"]
            l_val = scores[key]["lush"]
            b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
                              f'<td>{b_str}</td><td>{l_str}</td>'
                              f'<td class="{d_cls}">{d_str}</td></tr>')

-        obs = _get_freeform(r)
-        obs_html = ""
-        for lang, text in obs.items():
-            obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
-
        sections.append(f"""
        <div class="task-detail">
            <h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
                <thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
                <tbody>{"".join(score_rows)}</tbody>
            </table>
-            <div class="observations">{obs_html}</div>
        </div>""")

    return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
    .scores {{ width: auto; }}
    .scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
    .scores th:nth-child(n+2) {{ text-align: center; }}
-    .observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
-    .observations p {{ margin-bottom: 6px; }}
 </style>
 </head>
 <body>