Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions

View File

@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
from .models import BenchmarkResult
from .report import (
LIKERT_QUESTIONS,
_get_freeform,
_get_likert_scores,
_parse_likert,
load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
"""Return {question_key: {bash: avg, lush: avg}}."""
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
avgs = _aggregate_likert(results)
labels = [label for _, label in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(8, 4.5))
fig, ax = plt.subplots(figsize=(8, 7))
y = range(len(labels))
bar_h = 0.35
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
"""Heatmap showing lush-minus-bash score diff per task and metric."""
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
tasks = [r.task_name for r in results]
data: list[list[float]] = []
for r in results:
scores = _get_likert_scores(r)
row = []
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
b = scores[key]["bash"]
l = scores[key]["lush"]
if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
row.append(0.0)
data.append(row)
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels(tasks, fontsize=8)
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
for j in range(len(labels)):
val = data[i][j]
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
ax.text(j, i, text, ha="center", va="center", fontsize=8,
ax.text(j, i, text, ha="center", va="center", fontsize=7,
color="white" if abs(val) >= 2 else "black")
ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
import numpy as np
from collections import defaultdict
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
by_cat[r.category].append(r)
charts: list[tuple[str, str]] = []
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
for cat in sorted(by_cat):
cat_results = by_cat[cat]
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
if val is not None:
agg[key][lang].append(val)
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(6, 3.5))
fig, ax = plt.subplots(figsize=(7, 5))
y = range(len(labels))
bar_h = 0.35
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
score_rows = []
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
f'<td>{b_str}</td><td>{l_str}</td>'
f'<td class="{d_cls}">{d_str}</td></tr>')
obs = _get_freeform(r)
obs_html = ""
for lang, text in obs.items():
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(score_rows)}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")
return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>