Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop the
  free-form question. Responses are now stored as ints, not strings.
- A back-compat layer maps the legacy question keys to the new question
  ids so existing results still render.
- Parallelize run-all with ThreadPoolExecutor (worker count set by the new
  max_workers config field) and add a thread-safe min-request-interval rate
  limiter to the Anthropic provider; see the sketch after this list.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
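
The run-all changes themselves are not among the hunks shown below. A minimal sketch of the intended fan-out, assuming a per-task callable — run_all_parallel and run_one are illustrative names; only ThreadPoolExecutor and the max_workers config field are confirmed by this commit:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_all_parallel(tasks, run_one, max_workers=4):
    """Run each task via run_one on a bounded thread pool, collecting results as they finish."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit every task up front; the pool caps concurrency at max_workers.
        futures = {pool.submit(run_one, task): task for task in tasks}
        for fut in as_completed(futures):
            task = futures[fut]
            try:
                results.append(fut.result())
            except Exception as exc:  # one failed task should not abort the rest
                print(f"{task} failed: {exc}")
    return results

The thread-safe min-request-interval gate added to the Anthropic provider (in the diff below) spaces out concurrent send() calls so parallel workers don't burst past the API rate limit.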
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions

View File

@@ -13,6 +13,7 @@ class Config:
timeout_seconds: float = 10.0
normalize_whitespace: bool = True
output_dir: Path = Path("results")
max_workers: int = 4
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
@classmethod
@@ -35,5 +36,6 @@ class Config:
timeout_seconds=agent.get("timeout_seconds", 10.0),
normalize_whitespace=agent.get("normalize_whitespace", True),
output_dir=Path(results.get("output_dir", "results")),
max_workers=agent.get("max_workers", 4),
provider_configs=provider_configs,
)

View File

@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
from .models import BenchmarkResult
from .report import (
LIKERT_QUESTIONS,
_get_freeform,
_get_likert_scores,
_parse_likert,
load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
"""Return {question_key: {bash: avg, lush: avg}}."""
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
avgs = _aggregate_likert(results)
labels = [label for _, label in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(8, 4.5))
fig, ax = plt.subplots(figsize=(8, 7))
y = range(len(labels))
bar_h = 0.35
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
"""Heatmap showing lush-minus-bash score diff per task and metric."""
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
tasks = [r.task_name for r in results]
data: list[list[float]] = []
for r in results:
scores = _get_likert_scores(r)
row = []
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
b = scores[key]["bash"]
l = scores[key]["lush"]
if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
row.append(0.0)
data.append(row)
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels(tasks, fontsize=8)
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
for j in range(len(labels)):
val = data[i][j]
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
ax.text(j, i, text, ha="center", va="center", fontsize=8,
ax.text(j, i, text, ha="center", va="center", fontsize=7,
color="white" if abs(val) >= 2 else "black")
ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
import numpy as np
from collections import defaultdict
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
by_cat[r.category].append(r)
charts: list[tuple[str, str]] = []
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
for cat in sorted(by_cat):
cat_results = by_cat[cat]
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
if val is not None:
agg[key][lang].append(val)
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(6, 3.5))
fig, ax = plt.subplots(figsize=(7, 5))
y = range(len(labels))
bar_h = 0.35
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
score_rows = []
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
f'<td>{b_str}</td><td>{l_str}</td>'
f'<td class="{d_cls}">{d_str}</td></tr>')
obs = _get_freeform(r)
obs_html = ""
for lang, text in obs.items():
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(score_rows)}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")
return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import os
import threading
import time
from typing import Any
import anthropic
@@ -17,8 +19,17 @@ class AnthropicProvider:
self._client = anthropic.Anthropic(api_key=api_key)
self._model = config.get("model", "claude-sonnet-4-20250514")
self._max_tokens = config.get("max_tokens", 4096)
self._min_request_interval = config.get("min_request_interval", 0.1)
self._last_request_time = 0.0
self._lock = threading.Lock()
def send(self, messages: list[Message], system: str = "") -> str:
with self._lock:
elapsed = time.monotonic() - self._last_request_time
if elapsed < self._min_request_interval:
time.sleep(self._min_request_interval - elapsed)
self._last_request_time = time.monotonic()
api_messages = [{"role": m.role, "content": m.content} for m in messages]
kwargs: dict[str, Any] = {
"model": self._model,

View File

@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
QUESTIONS = [
{
"question": "Readability: The solution is easy to read and understand",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Conciseness: The solution required minimal boilerplate",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Error handling: Error handling was straightforward",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Overall preference: I would prefer this language for similar tasks",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Learning curve: An unfamiliar developer could understand the solution quickly",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
# Syntax & Readability
{"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
{"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
{"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
# Expressiveness
{"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
{"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
{"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
# Data & I/O
{"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
{"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
# Error Handling
{"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
{"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
# Overall
{"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
{"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
]
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
def build_questionnaire_prompt(
task_name: str,
language: str,
solution_code: str,
) -> str:
choices_str = ", ".join(f'"{c}"' for c in CHOICES)
questions_text = ""
for i, q in enumerate(QUESTIONS, 1):
choices_str = ", ".join(f'"{c}"' for c in q["choices"])
questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
for q in QUESTIONS:
questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
{solution_code}
```
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}
[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
{questions_text}]"""
def _extract_int(value: str) -> int | None:
"""Extract leading digit from a response like '4 - Agree'."""
s = value.strip()
if s and s[0].isdigit():
return int(s[0])
return None
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
results = []
for item in data:
question_id = item.get("id", item.get("question", ""))
raw_selected = item.get("selected", "")
# Normalize to int
if isinstance(raw_selected, int):
selected: int | str = raw_selected
else:
parsed = _extract_int(str(raw_selected))
selected = parsed if parsed is not None else raw_selected
results.append(
QuestionnaireResponse(
question=item.get("question", ""),
selected=item.get("selected", ""),
choices=item.get("choices"),
question=question_id,
selected=selected,
)
)
return results

View File

@@ -5,16 +5,32 @@ from pathlib import Path
from .models import BenchmarkResult
# Likert questions in order (must match questionnaire.py QUESTIONS)
# New 12-item question list: (key, label, dimension)
LIKERT_QUESTIONS = [
("Readability", "Readability"),
("Expressiveness", "Expressiveness"),
("Conciseness", "Conciseness"),
("Error handling", "Error handling"),
("Overall preference", "Overall preference"),
("Learning curve", "Learning curve"),
("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
("builtin_ops", "Built-in operations", "Expressiveness"),
("string_ops", "String operations", "Expressiveness"),
("composition", "Composition", "Expressiveness"),
("io_ergonomics", "I/O ergonomics", "Data & I/O"),
("data_structures", "Data structures", "Data & I/O"),
("error_model", "Error model", "Error Handling"),
("edge_case_support", "Edge case support", "Error Handling"),
("learnability", "Learnability", "Overall"),
("fitness", "Fitness for task", "Overall"),
]
# Map old 6 legacy keys to new keys for back-compat with existing results
LEGACY_KEY_MAP = {
"Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
"Expressiveness": ["builtin_ops", "string_ops", "composition"],
"Conciseness": ["signal_to_noise"],
"Error handling": ["error_model", "edge_case_support"],
"Overall preference": ["fitness"],
"Learning curve": ["learnability"],
}
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
"""Load results, keeping only the latest run per task name."""
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
def _parse_likert(selected: str | int) -> int | None:
"""Extract numeric value from a likert response like '4 - Agree'."""
"""Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
if isinstance(selected, int):
return selected
s = str(selected).strip()
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
Handles both new-format results (exact id match) and legacy results (startswith match
mapped to new keys).
"""
scores: dict[str, dict[str, float | None]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
scores[key] = {"bash": None, "lush": None}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
for key, _ in LIKERT_QUESTIONS:
if q.question.startswith(key):
# Try exact match on new question ids
if q.question in scores:
val = _parse_likert(q.selected)
if val is not None:
scores[q.question][lang_name] = float(val)
continue
# Legacy: map old key to new keys (spread the score)
for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
if q.question.startswith(legacy_prefix):
val = _parse_likert(q.selected)
if val is not None:
scores[key][lang_name] = float(val)
for nk in new_keys:
if scores[nk][lang_name] is None:
scores[nk][lang_name] = float(val)
break
return scores
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
return "\u2588" * filled + "\u2591" * (width - filled)
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
"""Extract free-form observations per language."""
obs: dict[str, str] = {}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
if q.question.startswith("Free-form"):
obs[lang_name] = str(q.selected)
break
return obs
def render_summary_table(results: list[BenchmarkResult]) -> str:
"""Render the pass/fail + turns overview table."""
lines: list[str] = []
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Render aggregated questionnaire scores with bar charts."""
"""Render aggregated questionnaire scores with bar charts, grouped by dimension."""
lines: list[str] = []
lines.append("=" * 78)
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
# Aggregate scores across all tasks
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
# Group by dimension
current_dim = None
for key, label, dimension in LIKERT_QUESTIONS:
if dimension != current_dim:
if current_dim is not None:
lines.append("")
lines.append(f" [{dimension}]")
current_dim = dimension
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
diff = l_avg - b_avg
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
lines.append(f" {label}")
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
lines.append("")
lines.append(f" {label}")
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
# Overall average
all_bash = [v for key in agg for v in agg[key]["bash"]]
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
diff = l_overall - b_overall
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
lines.append("")
lines.append(" " + "-" * 50)
lines.append(f" Overall average")
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
lines.append(f" {cat}")
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
lines.append(" " + "-" * 40)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
d_str = "-"
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
# Free-form observations
obs = _get_freeform(r)
if obs:
lines.append("")
for lang, text in obs.items():
# Wrap long text
wrapped = text[:120] + ("..." if len(text) > 120 else "")
lines.append(f" {lang}: {wrapped}")
lines.append("")
return "\n".join(lines)