Revamp questionnaire, parallelize run-all, add new tasks

- Replace the 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop the
  free-form question. Responses are now stored as ints, not strings.
- Back-compat layer maps the legacy question keys onto the new keys so
  existing results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable worker count)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider (see the sketch after this list).
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
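The parallel runner and the rate limiter live in files whose diffs are not reproduced below. A minimal sketch of the shape described above, assuming a `run_task(task)` callable, a `workers` count, and a per-provider minimum interval (all three names are assumptions, not the actual identifiers):

    import threading
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    class MinIntervalLimiter:
        """Thread-safe limiter: successive calls are spaced >= `interval` seconds apart."""

        def __init__(self, interval: float) -> None:
            self._interval = interval
            self._lock = threading.Lock()
            self._next_allowed = 0.0

        def wait(self) -> None:
            # Reserve the next slot under the lock, then sleep outside it so
            # other workers can queue up behind this one.
            with self._lock:
                now = time.monotonic()
                delay = self._next_allowed - now
                self._next_allowed = max(now, self._next_allowed) + self._interval
            if delay > 0:
                time.sleep(delay)

    def run_all(tasks, run_task, workers: int = 4) -> list:
        """Run every task on a thread pool, collecting results as they complete."""
        results = []
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(run_task, task): task for task in tasks}
            for fut in as_completed(futures):
                results.append(fut.result())
        return results

Under this sketch, the Anthropic provider would call `limiter.wait()` immediately before each request, so worker threads share one schedule instead of each sleeping independently.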
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions


@@ -5,16 +5,32 @@ from pathlib import Path
 from .models import BenchmarkResult

-# Likert questions in order (must match questionnaire.py QUESTIONS)
+# New 12-item question list: (key, label, dimension)
 LIKERT_QUESTIONS = [
-    ("Readability", "Readability"),
-    ("Expressiveness", "Expressiveness"),
-    ("Conciseness", "Conciseness"),
-    ("Error handling", "Error handling"),
-    ("Overall preference", "Overall preference"),
-    ("Learning curve", "Learning curve"),
+    ("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
+    ("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
+    ("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
+    ("builtin_ops", "Built-in operations", "Expressiveness"),
+    ("string_ops", "String operations", "Expressiveness"),
+    ("composition", "Composition", "Expressiveness"),
+    ("io_ergonomics", "I/O ergonomics", "Data & I/O"),
+    ("data_structures", "Data structures", "Data & I/O"),
+    ("error_model", "Error model", "Error Handling"),
+    ("edge_case_support", "Edge case support", "Error Handling"),
+    ("learnability", "Learnability", "Overall"),
+    ("fitness", "Fitness for task", "Overall"),
 ]

+# Map old 6 legacy keys to new keys for back-compat with existing results
+LEGACY_KEY_MAP = {
+    "Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
+    "Expressiveness": ["builtin_ops", "string_ops", "composition"],
+    "Conciseness": ["signal_to_noise"],
+    "Error handling": ["error_model", "edge_case_support"],
+    "Overall preference": ["fitness"],
+    "Learning curve": ["learnability"],
+}

 def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
     """Load results, keeping only the latest run per task name."""
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
 def _parse_likert(selected: str | int) -> int | None:
-    """Extract numeric value from a likert response like '4 - Agree'."""
+    """Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
     if isinstance(selected, int):
         return selected
     s = str(selected).strip()
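With that change, integer responses from the new questionnaire and the old '4 - Agree' strings resolve to the same value. A quick illustration (the string-parsing branch is truncated by this hunk, so the second case relies on the docstring):

    assert _parse_likert(4) == 4             # new results store the int directly
    assert _parse_likert("4 - Agree") == 4   # legacy string form, per the docstring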
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
 def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
-    """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
+    """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
+    Handles both new-format results (exact id match) and legacy results (startswith match
+    mapped to new keys).
+    """
     scores: dict[str, dict[str, float | None]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
         scores[key] = {"bash": None, "lush": None}

     for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
         if not lang_result:
             continue
         for q in lang_result.questionnaire:
-            for key, _ in LIKERT_QUESTIONS:
-                if q.question.startswith(key):
+            # Try exact match on new question ids
+            if q.question in scores:
+                val = _parse_likert(q.selected)
+                if val is not None:
+                    scores[q.question][lang_name] = float(val)
+                continue
+            # Legacy: map old key to new keys (spread the score)
+            for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
+                if q.question.startswith(legacy_prefix):
                     val = _parse_likert(q.selected)
                     if val is not None:
-                        scores[key][lang_name] = float(val)
+                        for nk in new_keys:
+                            if scores[nk][lang_name] is None:
+                                scores[nk][lang_name] = float(val)
                     break
     return scores
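The legacy branch spreads a single compound answer across every mapped key that is still unset, so old runs keep contributing to the new dimensions without overwriting new-format answers. A standalone illustration of that spread using the constants above (the question key and score are invented):

    scores = {key: {"bash": None, "lush": None} for key, _, _ in LIKERT_QUESTIONS}
    # A legacy result stored the compound "Readability" question with a score of 4.
    for nk in LEGACY_KEY_MAP["Readability"]:
        if scores[nk]["bash"] is None:
            scores[nk]["bash"] = 4.0
    # syntax_clarity, signal_to_noise and familiar_conventions now all read 4.0 for bash.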
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
     return "\u2588" * filled + "\u2591" * (width - filled)

-def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
-    """Extract free-form observations per language."""
-    obs: dict[str, str] = {}
-    for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
-        if not lang_result:
-            continue
-        for q in lang_result.questionnaire:
-            if q.question.startswith("Free-form"):
-                obs[lang_name] = str(q.selected)
-                break
-    return obs

 def render_summary_table(results: list[BenchmarkResult]) -> str:
     """Render the pass/fail + turns overview table."""
     lines: list[str] = []
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
 def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
-    """Render aggregated questionnaire scores with bar charts."""
+    """Render aggregated questionnaire scores with bar charts, grouped by dimension."""
     lines: list[str] = []
     lines.append("=" * 78)
     lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
     # Aggregate scores across all tasks
     agg: dict[str, dict[str, list[float]]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
         agg[key] = {"bash": [], "lush": []}

     for r in results:
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
             if val is not None:
                 agg[key][lang].append(val)

-    for key, label in LIKERT_QUESTIONS:
+    # Group by dimension
+    current_dim = None
+    for key, label, dimension in LIKERT_QUESTIONS:
+        if dimension != current_dim:
+            if current_dim is not None:
+                lines.append("")
+            lines.append(f" [{dimension}]")
+            current_dim = dimension
         b_vals = agg[key]["bash"]
         l_vals = agg[key]["lush"]
         b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
         diff = l_avg - b_avg
         diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
-        lines.append(f" {label}")
-        lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
-        lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
-        lines.append("")
+        lines.append(f" {label}")
+        lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
+        lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")

     # Overall average
     all_bash = [v for key in agg for v in agg[key]["bash"]]
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
     diff = l_overall - b_overall
     diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
     lines.append("")
+    lines.append(" " + "-" * 50)
     lines.append(f" Overall average")
     lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
         lines.append(f" {cat}")
         agg: dict[str, dict[str, list[float]]] = {}
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
             agg[key] = {"bash": [], "lush": []}

         for r in cat_results:
             scores = _get_likert_scores(r)
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
                 if val is not None:
                     agg[key][lang].append(val)

-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
             b_vals = agg[key]["bash"]
             l_vals = agg[key]["lush"]
             b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
         scores = _get_likert_scores(r)
         lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
         lines.append(" " + "-" * 40)
-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
             b_val = scores[key]["bash"]
             l_val = scores[key]["lush"]
             b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
                 d_str = "-"
             lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")

-        # Free-form observations
-        obs = _get_freeform(r)
-        if obs:
-            lines.append("")
-            for lang, text in obs.items():
-                # Wrap long text
-                wrapped = text[:120] + ("..." if len(text) > 120 else "")
-                lines.append(f" {lang}: {wrapped}")

         lines.append("")

     return "\n".join(lines)