Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions

View File

@@ -13,6 +13,7 @@ class Config:
timeout_seconds: float = 10.0
normalize_whitespace: bool = True
output_dir: Path = Path("results")
max_workers: int = 4
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
@classmethod
@@ -35,5 +36,6 @@ class Config:
timeout_seconds=agent.get("timeout_seconds", 10.0),
normalize_whitespace=agent.get("normalize_whitespace", True),
output_dir=Path(results.get("output_dir", "results")),
max_workers=agent.get("max_workers", 4),
provider_configs=provider_configs,
)

View File

@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
from .models import BenchmarkResult
from .report import (
LIKERT_QUESTIONS,
_get_freeform,
_get_likert_scores,
_parse_likert,
load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
"""Return {question_key: {bash: avg, lush: avg}}."""
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
avgs = _aggregate_likert(results)
labels = [label for _, label in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(8, 4.5))
fig, ax = plt.subplots(figsize=(8, 7))
y = range(len(labels))
bar_h = 0.35
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
"""Heatmap showing lush-minus-bash score diff per task and metric."""
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
tasks = [r.task_name for r in results]
data: list[list[float]] = []
for r in results:
scores = _get_likert_scores(r)
row = []
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
b = scores[key]["bash"]
l = scores[key]["lush"]
if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
row.append(0.0)
data.append(row)
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels(tasks, fontsize=8)
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
for j in range(len(labels)):
val = data[i][j]
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
ax.text(j, i, text, ha="center", va="center", fontsize=8,
ax.text(j, i, text, ha="center", va="center", fontsize=7,
color="white" if abs(val) >= 2 else "black")
ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
import numpy as np
from collections import defaultdict
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
by_cat[r.category].append(r)
charts: list[tuple[str, str]] = []
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
for cat in sorted(by_cat):
cat_results = by_cat[cat]
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
if val is not None:
agg[key][lang].append(val)
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(6, 3.5))
fig, ax = plt.subplots(figsize=(7, 5))
y = range(len(labels))
bar_h = 0.35
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
score_rows = []
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
f'<td>{b_str}</td><td>{l_str}</td>'
f'<td class="{d_cls}">{d_str}</td></tr>')
obs = _get_freeform(r)
obs_html = ""
for lang, text in obs.items():
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(score_rows)}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")
return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import os
import threading
import time
from typing import Any
import anthropic
@@ -17,8 +19,17 @@ class AnthropicProvider:
self._client = anthropic.Anthropic(api_key=api_key)
self._model = config.get("model", "claude-sonnet-4-20250514")
self._max_tokens = config.get("max_tokens", 4096)
self._min_request_interval = config.get("min_request_interval", 0.1)
self._last_request_time = 0.0
self._lock = threading.Lock()
def send(self, messages: list[Message], system: str = "") -> str:
with self._lock:
elapsed = time.monotonic() - self._last_request_time
if elapsed < self._min_request_interval:
time.sleep(self._min_request_interval - elapsed)
self._last_request_time = time.monotonic()
api_messages = [{"role": m.role, "content": m.content} for m in messages]
kwargs: dict[str, Any] = {
"model": self._model,

View File

@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
QUESTIONS = [
{
"question": "Readability: The solution is easy to read and understand",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Conciseness: The solution required minimal boilerplate",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Error handling: Error handling was straightforward",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Overall preference: I would prefer this language for similar tasks",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Learning curve: An unfamiliar developer could understand the solution quickly",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
# Syntax & Readability
{"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
{"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
{"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
# Expressiveness
{"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
{"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
{"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
# Data & I/O
{"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
{"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
# Error Handling
{"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
{"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
# Overall
{"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
{"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
]
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
def build_questionnaire_prompt(
task_name: str,
language: str,
solution_code: str,
) -> str:
choices_str = ", ".join(f'"{c}"' for c in CHOICES)
questions_text = ""
for i, q in enumerate(QUESTIONS, 1):
choices_str = ", ".join(f'"{c}"' for c in q["choices"])
questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
for q in QUESTIONS:
questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
{solution_code}
```
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}
[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
{questions_text}]"""
def _extract_int(value: str) -> int | None:
"""Extract leading digit from a response like '4 - Agree'."""
s = value.strip()
if s and s[0].isdigit():
return int(s[0])
return None
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
results = []
for item in data:
question_id = item.get("id", item.get("question", ""))
raw_selected = item.get("selected", "")
# Normalize to int
if isinstance(raw_selected, int):
selected: int | str = raw_selected
else:
parsed = _extract_int(str(raw_selected))
selected = parsed if parsed is not None else raw_selected
results.append(
QuestionnaireResponse(
question=item.get("question", ""),
selected=item.get("selected", ""),
choices=item.get("choices"),
question=question_id,
selected=selected,
)
)
return results

View File

@@ -5,16 +5,32 @@ from pathlib import Path
from .models import BenchmarkResult
# Likert questions in order (must match questionnaire.py QUESTIONS)
# New 12-item question list: (key, label, dimension)
LIKERT_QUESTIONS = [
("Readability", "Readability"),
("Expressiveness", "Expressiveness"),
("Conciseness", "Conciseness"),
("Error handling", "Error handling"),
("Overall preference", "Overall preference"),
("Learning curve", "Learning curve"),
("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
("builtin_ops", "Built-in operations", "Expressiveness"),
("string_ops", "String operations", "Expressiveness"),
("composition", "Composition", "Expressiveness"),
("io_ergonomics", "I/O ergonomics", "Data & I/O"),
("data_structures", "Data structures", "Data & I/O"),
("error_model", "Error model", "Error Handling"),
("edge_case_support", "Edge case support", "Error Handling"),
("learnability", "Learnability", "Overall"),
("fitness", "Fitness for task", "Overall"),
]
# Back-compat: map the 6 legacy question keys (matched as question-text
# prefixes in old results) to the new question ids, so existing saved
# results still render under the 12-item scheme.
# NOTE(review): "signal_to_noise" appears under both "Readability" and
# "Conciseness"; consumers only fill a new key while it is still None,
# so whichever legacy answer is encountered first wins — confirm that
# precedence is intended.
LEGACY_KEY_MAP = {
    "Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
    "Expressiveness": ["builtin_ops", "string_ops", "composition"],
    "Conciseness": ["signal_to_noise"],
    "Error handling": ["error_model", "edge_case_support"],
    "Overall preference": ["fitness"],
    "Learning curve": ["learnability"],
}
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
"""Load results, keeping only the latest run per task name."""
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
def _parse_likert(selected: str | int) -> int | None:
"""Extract numeric value from a likert response like '4 - Agree'."""
"""Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
if isinstance(selected, int):
return selected
s = str(selected).strip()
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
Handles both new-format results (exact id match) and legacy results (startswith match
mapped to new keys).
"""
scores: dict[str, dict[str, float | None]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
scores[key] = {"bash": None, "lush": None}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
for key, _ in LIKERT_QUESTIONS:
if q.question.startswith(key):
# Try exact match on new question ids
if q.question in scores:
val = _parse_likert(q.selected)
if val is not None:
scores[key][lang_name] = float(val)
scores[q.question][lang_name] = float(val)
continue
# Legacy: map old key to new keys (spread the score)
for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
if q.question.startswith(legacy_prefix):
val = _parse_likert(q.selected)
if val is not None:
for nk in new_keys:
if scores[nk][lang_name] is None:
scores[nk][lang_name] = float(val)
break
return scores
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
return "\u2588" * filled + "\u2591" * (width - filled)
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
"""Extract free-form observations per language."""
obs: dict[str, str] = {}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
if q.question.startswith("Free-form"):
obs[lang_name] = str(q.selected)
break
return obs
def render_summary_table(results: list[BenchmarkResult]) -> str:
"""Render the pass/fail + turns overview table."""
lines: list[str] = []
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Render aggregated questionnaire scores with bar charts."""
"""Render aggregated questionnaire scores with bar charts, grouped by dimension."""
lines: list[str] = []
lines.append("=" * 78)
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
# Aggregate scores across all tasks
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
# Group by dimension
current_dim = None
for key, label, dimension in LIKERT_QUESTIONS:
if dimension != current_dim:
if current_dim is not None:
lines.append("")
lines.append(f" [{dimension}]")
current_dim = dimension
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -154,7 +179,6 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
lines.append(f" {label}")
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
lines.append("")
# Overall average
all_bash = [v for key in agg for v in agg[key]["bash"]]
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
diff = l_overall - b_overall
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
lines.append("")
lines.append(" " + "-" * 50)
lines.append(f" Overall average")
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
lines.append(f" {cat}")
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
lines.append(" " + "-" * 40)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
d_str = "-"
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
# Free-form observations
obs = _get_freeform(r)
if obs:
lines.append("")
for lang, text in obs.items():
# Wrap long text
wrapped = text[:120] + ("..." if len(text) > 120 else "")
lines.append(f" {lang}: {wrapped}")
lines.append("")
return "\n".join(lines)

102
main.py
View File

@@ -2,7 +2,9 @@ from __future__ import annotations
import argparse
import sys
import threading
import tomllib
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
@@ -16,6 +18,8 @@ from lush_bench.export import export_html
from lush_bench.report import render_report
from lush_bench.results import save_result
_print_lock = threading.Lock()
PROVIDERS = {
"anthropic": AnthropicProvider,
@@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None:
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
def cmd_run(args: argparse.Namespace) -> None:
config = Config.load()
task_path = Path(args.task)
def _log(msg: str) -> None:
    """Serialize console output across worker threads via the print lock."""
    _print_lock.acquire()
    try:
        print(msg)
    finally:
        _print_lock.release()
def _run_task(
task_path: Path,
provider_name: str,
config: Config,
provider: AnthropicProvider | None = None,
) -> BenchmarkResult:
"""Core task runner. Thread-safe — usable from cmd_run or a thread pool."""
task = load_task(task_path)
provider_name = args.provider
if provider_name not in PROVIDERS:
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
sys.exit(1)
if provider is None:
provider_config = config.provider_configs.get(provider_name, {})
provider = PROVIDERS[provider_name](provider_config)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
_log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
bash_result = None
lush_result = None
if task.mode == "solve":
# Solve mode: agent writes code in both languages
print(" Solving in bash...")
_log(f" [{task.name}] Solving in bash...")
bash_result = solve_task(provider, task, "bash", config)
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
_log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
print(" Solving in lush...")
_log(f" [{task.name}] Solving in lush...")
lush_result = solve_task(provider, task, "lush", config)
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
elif task.mode == "convert":
# Convert mode: verify provided bash source directly, then convert to lush
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
print(" Verifying provided bash source...")
_log(f" [{task.name}] Verifying provided bash source...")
test_results = evaluate(task, task.bash_source, "bash", config)
all_passed = all(tr.passed for tr in test_results)
bash_result = LanguageResult(
@@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None:
all_passed=all_passed,
agent_turns=0,
)
print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
_log(f" [{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}")
print(" Converting to lush...")
_log(f" [{task.name}] Converting to lush...")
lush_result = solve_task(provider, task, "lush", config)
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
# Run questionnaire for each completed language
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
if result and result.solution_code:
print(f" Questionnaire for {lang}...")
_log(f" [{task.name}] Questionnaire for {lang}...")
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
benchmark = BenchmarkResult(
@@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None:
)
result_dir = save_result(benchmark, config.output_dir)
print(f" Results saved to {result_dir}")
_log(f" [{task.name}] Results saved to {result_dir}")
return benchmark
def cmd_run(args: argparse.Namespace) -> None:
    """CLI entry point: run a single benchmark task with the chosen provider."""
    config = Config.load()
    name = args.provider
    if name not in PROVIDERS:
        print(f"Unknown provider: {name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)
    # Build the provider here so _run_task can reuse a shared instance elsewhere.
    provider = PROVIDERS[name](config.provider_configs.get(name, {}))
    _run_task(Path(args.task), name, config, provider)
def cmd_run_all(args: argparse.Namespace) -> None:
config = Config.load()
paths = find_tasks(args.category, getattr(args, "mode", None))
if not paths:
print("No tasks found.")
return
for p in paths:
# Reuse cmd_run by constructing a namespace
run_args = argparse.Namespace(task=str(p), provider=args.provider)
cmd_run(run_args)
print()
provider_name = args.provider
if provider_name not in PROVIDERS:
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
sys.exit(1)
# Share one provider instance across threads (its rate limiter is thread-safe)
provider_config = config.provider_configs.get(provider_name, {})
provider = PROVIDERS[provider_name](provider_config)
max_workers = args.workers if args.workers is not None else config.max_workers
print(f"Running {len(paths)} tasks with {max_workers} workers")
failed: list[str] = []
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = {
pool.submit(_run_task, p, provider_name, config, provider): p
for p in paths
}
for future in as_completed(futures):
task_path = futures[future]
try:
future.result()
except Exception as exc:
task_name = task_path.stem
failed.append(task_name)
_log(f" [{task_name}] FAILED: {exc}")
print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.")
if failed:
print(f"Failed: {', '.join(failed)}")
def cmd_report(args: argparse.Namespace) -> None:
@@ -183,6 +232,7 @@ def main() -> None:
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
ra.add_argument("--provider", default="anthropic", help="LLM provider")
ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)")
ra.set_defaults(func=cmd_run_all)
# report

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,97 @@
# Benchmark task: path normalizer (convert mode).
# In convert mode the bash_source below is verified directly against the
# [[test_cases]], then the agent is asked to convert it to lush.
name = "path_normalizer"
category = "environment"
mode = "convert"
description = """
Read file paths from stdin, one per line. Normalize each path:
1. Replace a leading "~" with the value of $HOME.
2. Remove trailing slashes (except for root "/").
3. Collapse consecutive slashes into one.
4. Resolve "." components (remove them).
5. Resolve ".." components (go up one directory level).
Output the cleaned path, one per line.
Skip empty lines.
"""
# Reference implementation. NOTE(review): this is a TOML literal string
# (''' ... '''), so backslashes are not escape-processed by the TOML
# parser — confirm the doubled backslash in the sed pattern below is what
# the shell is meant to receive.
bash_source = '''
#!/bin/bash
while IFS= read -r line || [[ -n "$line" ]]; do
[[ -z "$line" ]] && continue
# Expand tilde
path=$(echo "$line" | sed "s:^~:$HOME:")
# Collapse multiple slashes
path=$(echo "$path" | sed 's:/\\+:/:g')
# Remove trailing slash (but keep root)
path=$(echo "$path" | sed 's:/$::')
[[ -z "$path" ]] && path="/"
# Resolve . and .. components
IFS='/' read -ra parts <<< "$path"
result=()
absolute=""
if [[ "$path" == /* ]]; then
absolute="/"
fi
for part in "${parts[@]}"; do
if [[ "$part" == "." || "$part" == "" ]]; then
continue
elif [[ "$part" == ".." ]]; then
if [[ ${#result[@]} -gt 0 && "${result[-1]}" != ".." ]]; then
unset 'result[${#result[@]}-1]'
elif [[ -z "$absolute" ]]; then
result+=("..")
fi
else
result+=("$part")
fi
done
if [[ -n "$absolute" ]]; then
final="/"
IFS='/'; final+="${result[*]}"
else
IFS='/'; final="${result[*]}"
fi
[[ -z "$final" ]] && final="/"
echo "$final"
done
'''
[[test_cases]]
description = "Tilde expansion and trailing slashes"
stdin = """~/Documents/
~/projects/code
/var/log/"""
expected_stdout = """/Users/testuser/Documents
/Users/testuser/projects/code
/var/log"""
# HOME is pinned so tilde expansion is deterministic across machines.
env = { "HOME" = "/Users/testuser" }
[[test_cases]]
description = "Resolving . and .. components"
stdin = """/usr/local/./bin
/home/user/../shared/docs
/a/b/c/../../d"""
expected_stdout = """/usr/local/bin
/home/shared/docs
/a/d"""
[[test_cases]]
description = "Collapsing multiple slashes"
stdin = """/usr//local///bin
/var/log//syslog"""
expected_stdout = """/usr/local/bin
/var/log/syslog"""
[[test_cases]]
# Covers the root-stays-root rule and relative inputs that must not be
# anchored to "/".
description = "Root and relative paths"
stdin = """/
./config/settings
../parent/child"""
expected_stdout = """/
config/settings
../parent/child"""

View File

@@ -0,0 +1,150 @@
# Benchmark task: todo list manager (convert mode).
# In convert mode the bash_source below is verified directly against the
# [[test_cases]], then the agent is asked to convert it to lush.
# Fix: "Sucessfully" -> "Successfully" in the removal message; the
# matching expected_stdout fixture below is updated in lockstep so the
# reference bash still passes its own tests.
name = "todo_manager"
category = "filesystem"
mode = "convert"
description = """
A simple todo list manager. Read commands from stdin, one per line:
add <task text> — append the task to the todo file
list — print all tasks numbered as "NN). <task>"
remove <N> — remove task number N (1-based)
clear — remove all tasks
The todo file is "todo.txt" in the working directory.
When listing, pad task numbers to two digits (01, 02, …).
After "add" or "remove", automatically list the remaining tasks.
If the list is empty, print "No tasks found".
"""
# Reference implementation. NOTE(review): this is a TOML literal string
# (''' ... '''), so the TOML parser does not process escapes — confirm
# IFS=$'\\n' reaches bash exactly as intended in the shipped file.
bash_source = '''
#!/bin/bash
TODOFILE="./todo.txt"
list_tasks() {
if [ -f "$TODOFILE" ] && [ -s "$TODOFILE" ]; then
count=1
IFS=$'\\n'
while read -r task; do
num=$count
if [ $count -lt 10 ]; then num="0$count"; fi
echo "$num). $task"
count=$(( count + 1 ))
done < "$TODOFILE"
else
echo "No tasks found"
fi
}
add_task() {
echo "$1" >> "$TODOFILE"
}
remove_task() {
taskNum=$1
totalLines=$(wc -l < "$TODOFILE" | tr -d ' ')
if [ "$taskNum" -gt "$totalLines" ] 2>/dev/null; then
echo "Error: task number $taskNum does not exist!"
return 1
fi
tmpfile="./todo_tmp.txt"
count=1
IFS=$'\\n'
while read -r task; do
if [ "$count" -ne "$taskNum" ]; then
echo "$task" >> "$tmpfile"
fi
count=$(( count + 1 ))
done < "$TODOFILE"
if [ -f "$tmpfile" ]; then
mv "$tmpfile" "$TODOFILE"
else
> "$TODOFILE"
fi
echo "Successfully removed task number $taskNum"
}
clear_tasks() {
> "$TODOFILE"
echo "Tasks cleared."
}
if [ ! -f "$TODOFILE" ]; then
touch "$TODOFILE"
fi
while IFS= read -r line || [[ -n "$line" ]]; do
cmd=$(echo "$line" | cut -d' ' -f1)
arg=$(echo "$line" | cut -d' ' -f2-)
case "$cmd" in
add)
add_task "$arg"
list_tasks
;;
list)
list_tasks
;;
remove)
remove_task "$arg"
list_tasks
;;
clear)
clear_tasks
;;
esac
done
'''
# Each "add"/"remove" auto-lists afterwards, so expected output repeats
# the growing list after every mutating command.
[[test_cases]]
description = "Add tasks then list"
stdin = """add Buy groceries
add Walk the dog
list"""
expected_stdout = """01). Buy groceries
01). Buy groceries
02). Walk the dog
01). Buy groceries
02). Walk the dog"""
[[test_cases]]
description = "Add, remove, list"
stdin = """add First task
add Second task
add Third task
remove 2
list"""
expected_stdout = """01). First task
01). First task
02). Second task
01). First task
02). Second task
03). Third task
Successfully removed task number 2
01). First task
02). Third task
01). First task
02). Third task"""
[[test_cases]]
description = "Empty list and clear"
stdin = """list
add Something
clear
list"""
expected_stdout = """No tasks found
01). Something
Tasks cleared.
No tasks found"""
[[test_cases]]
description = "Works with pre-existing todo file"
stdin = """list
add Third item
list"""
setup_files = { "todo.txt" = "Existing item one\nExisting item two\n" }
expected_stdout = """01). Existing item one
02). Existing item two
01). Existing item one
02). Existing item two
03). Third item
01). Existing item one
02). Existing item two
03). Third item"""
expected_files = { "todo.txt" = "Existing item one\nExisting item two\nThird item\n" }

View File

@@ -0,0 +1,113 @@
name = "currency_converter"
category = "pipeline"
mode = "convert"
description = """
A currency converter that reads conversion requests from stdin.
Each line has the format: AMOUNT FROM TO RATE
- AMOUNT: a decimal number (e.g., 12.35)
- FROM: 3-letter currency code
- TO: 3-letter currency code
- RATE: the exchange rate from FROM's base to TO's base
Some currencies are pegged to others at fixed rates:
BAM is pegged to EUR at 1.95583
BMD is pegged to USD at 1.0
BND is pegged to SGD at 1.0
DJF is pegged to USD at 177.721
PAB is pegged to USD at 1.0
When a pegged currency is involved, the conversion must account for the
peg coefficient. The formula is: result = amount * (rate / coef_from) * coef_to
where coef is the peg ratio (1 if not pegged).
Output one line per input: "AMOUNT FROM = RESULT TO" with RESULT
rounded to two decimal places.
For invalid lines (wrong field count, non-numeric amount, or non-numeric rate), output "ERROR: <original line>".
"""
bash_source = '''
#!/bin/bash
# Look up the peg anchor for a currency code.
# Prints "ANCHOR:RATIO" for pegged currencies, "NONE:1" for everything else.
pegged_to() {
local code="$1"
if [ "$code" = "BAM" ]; then
echo "EUR:1.95583"
elif [ "$code" = "BMD" ]; then
echo "USD:1.0"
elif [ "$code" = "BND" ]; then
echo "SGD:1.0"
elif [ "$code" = "DJF" ]; then
echo "USD:177.721"
elif [ "$code" = "PAB" ]; then
echo "USD:1.0"
else
echo "NONE:1"
fi
}
# Main loop: one conversion request per stdin line, format "AMOUNT FROM TO RATE".
while IFS= read -r line || [[ -n "$line" ]]; do
# Skip empty lines
[[ -z "$line" ]] && continue
# Split the line into positional parameters (relies on unquoted word splitting)
set -- $line
# Exactly four whitespace-separated fields are required
if [[ $# -ne 4 ]]; then
echo "ERROR: $line"
continue
fi
amount=$1
from=$2
to=$3
rate=$4
# Validate amount is a non-negative decimal number
if [[ ! "$amount" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
echo "ERROR: $line"
continue
fi
# Validate rate the same way
if [[ ! "$rate" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
echo "ERROR: $line"
continue
fi
# Look up peg coefficients; pegged_to prints "ANCHOR:COEF" ("NONE:1" if unpegged)
peg_from=$(pegged_to "$from")
coef_from=$(echo "$peg_from" | cut -d: -f2)
peg_to=$(pegged_to "$to")
coef_to=$(echo "$peg_to" | cut -d: -f2)
# Calculate: result = amount * (rate / coef_from) * coef_to
# bc runs at scale=8 for precision; printf then rounds to 2 decimal places
result=$(echo "scale=8; $amount * ($rate / $coef_from) * $coef_to" | bc)
result=$(printf "%.2f" "$result")
echo "$amount $from = $result $to"
done
'''
[[test_cases]]
description = "Standard conversion with direct rate"
stdin = """100 USD EUR 0.92
50 GBP JPY 188.50"""
expected_stdout = """100 USD = 92.00 EUR
50 GBP = 9425.00 JPY"""
[[test_cases]]
description = "Pegged currency conversions"
stdin = """100 BAM USD 1.08
200 BMD EUR 0.92
50 USD DJF 1.0"""
expected_stdout = """100 BAM = 55.22 USD
200 BMD = 184.00 EUR
50 USD = 8886.05 DJF"""
[[test_cases]]
description = "Invalid input lines"
stdin = """abc EUR USD 0.92
100 USD
100 EUR USD 0.85"""
expected_stdout = """ERROR: abc EUR USD 0.92
ERROR: 100 USD
100 EUR = 85.00 USD"""
[[test_cases]]
description = "Pegged-to-pegged conversion"
stdin = "100 BAM BMD 1.08"
expected_stdout = "100 BAM = 55.22 BMD"

View File

@@ -0,0 +1,76 @@
name = "locale_weather_url"
category = "pipeline"
mode = "convert"
description = """
Construct weather API URLs from locale and location information.
Read lines from stdin in the format: LANG_CODE LOCATION
where LANG_CODE is a 2-letter locale (e.g., "en", "fr", "de")
and LOCATION is a city/place name (may contain spaces).
For each line, construct a URL in the format:
https://LANG.wttr.in/LOCATION
Where spaces in the location are replaced with "+" characters.
If LANG_CODE is empty or invalid (not exactly 2 lowercase letters),
default to "en".
Skip empty lines. Output one URL per input line.
"""
bash_source = '''
#!/bin/bash
# Build wttr.in weather URLs from "LANG LOCATION" lines on stdin.
while IFS= read -r line || [[ -n "$line" ]]; do
[[ -z "$line" ]] && continue
# Extract lang code (first field)
lang=$(echo "$line" | awk '{print $1}')
# Extract location (everything after the first field)
location=$(echo "$line" | sed 's/^[^ ]* *//')
# Validate lang code: must be exactly 2 lowercase letters, else default to "en"
if [[ ! "$lang" =~ ^[a-z]{2}$ ]]; then
lang="en"
fi
# NOTE(review): for a single-word line the sed above already leaves $location
# empty, so the == comparison only fires for lines like "en en"; looks like a
# defensive guard against echoing the lang code as a location — confirm intent.
if [[ "$location" == "$lang" || -z "$location" ]]; then
location=""
fi
# Replace spaces with + so the URL stays a single token
location=$(echo "$location" | tr ' ' '+')
echo "https://$lang.wttr.in/$location"
done
'''
[[test_cases]]
description = "Various locales and locations"
stdin = """en New York
fr Paris
de Berlin
ja Tokyo"""
expected_stdout = """https://en.wttr.in/New+York
https://fr.wttr.in/Paris
https://de.wttr.in/Berlin
https://ja.wttr.in/Tokyo"""
[[test_cases]]
description = "Multi-word locations"
stdin = """en San Francisco
es Buenos Aires
pt Rio de Janeiro"""
expected_stdout = """https://en.wttr.in/San+Francisco
https://es.wttr.in/Buenos+Aires
https://pt.wttr.in/Rio+de+Janeiro"""
[[test_cases]]
description = "Invalid or missing locale defaults to en"
stdin = """ENG London
123 Moscow
x Rome"""
expected_stdout = """https://en.wttr.in/London
https://en.wttr.in/Moscow
https://en.wttr.in/Rome"""

View File

@@ -0,0 +1,86 @@
name = "network_info_parser"
category = "pipeline"
mode = "convert"
description = """
Parse network interface configuration from stdin (in "ip addr show" format)
and extract a summary of each interface.
For each interface block, output a line:
IFACE: <name> IP: <ipv4_addr> MASK: /<prefix_len>
An interface block starts with a line like:
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 ...
and contains inet lines like:
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
If an interface has no inet line, output:
IFACE: <name> IP: none MASK: none
Skip the loopback interface (lo).
"""
bash_source = '''
#!/bin/bash
# Summarize "ip addr show"-style input: one line per interface with its
# IPv4 address and prefix length. The loopback interface (lo) is skipped.
current_iface=""
found_ip=""
found_mask=""
# Emit the summary line for the interface gathered so far (if any).
flush_iface() {
if [[ -n "$current_iface" && "$current_iface" != "lo" ]]; then
if [[ -n "$found_ip" ]]; then
echo "IFACE: $current_iface IP: $found_ip MASK: /$found_mask"
else
echo "IFACE: $current_iface IP: none MASK: none"
fi
fi
}
while IFS= read -r line || [[ -n "$line" ]]; do
# Detect interface header line: starts with a number followed by a colon
if echo "$line" | grep -qE '^[0-9]+:'; then
# New block begins: report the previous interface, then reset state
flush_iface
current_iface=$(echo "$line" | awk -F: '{print $2}' | sed 's/^[[:space:]]*//' | awk '{print $1}')
found_ip=""
found_mask=""
fi
# Detect inet line (IPv4 only, not inet6); a later inet line overwrites an earlier one
if echo "$line" | grep -qE '^[[:space:]]+inet [0-9]'; then
ip_cidr=$(echo "$line" | awk '{print $2}')
found_ip=$(echo "$ip_cidr" | cut -d/ -f1)
found_mask=$(echo "$ip_cidr" | cut -d/ -f2)
fi
done
# Report the final interface block (the loop only flushes on the NEXT header)
flush_iface
'''
[[test_cases]]
description = "Two interfaces with IPs"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
3: wlan0: <BROADCAST,MULTICAST,UP> mtu 1500
inet 10.0.0.42/16 brd 10.0.255.255 scope global wlan0"""
expected_stdout = """IFACE: eth0 IP: 192.168.1.100 MASK: /24
IFACE: wlan0 IP: 10.0.0.42 MASK: /16"""
[[test_cases]]
description = "Interface with no IP"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
3: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0"""
expected_stdout = """IFACE: eth0 IP: none MASK: none
IFACE: docker0 IP: 172.17.0.1 MASK: /16"""
[[test_cases]]
description = "Single interface"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: enp3s0: <BROADCAST,MULTICAST,UP> mtu 9000
inet 10.10.10.5/8 brd 10.255.255.255 scope global enp3s0"""
expected_stdout = "IFACE: enp3s0 IP: 10.10.10.5 MASK: /8"

View File

@@ -0,0 +1,79 @@
name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
it must have a protocol (http:// or https://), followed by at least
one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.
Skip empty lines silently.
"""
bash_source = '''
#!/bin/bash
# Read URLs from stdin, one per line. Ensure each has a protocol prefix,
# then validate the basic "proto://host.tld" shape. Valid URLs are echoed
# normalized; invalid entries are flagged with their trimmed original text.
while IFS= read -r raw || [[ -n "$raw" ]]; do
# Ignore blank input lines entirely.
[[ -z "$raw" ]] && continue
# Strip leading/trailing whitespace before inspecting the URL.
trimmed=$(echo "$raw" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
[[ -z "$trimmed" ]] && continue
# Keep an existing http:// or https:// prefix; otherwise assume http://.
case "$trimmed" in
https://*) candidate="$trimmed" ;;
http://*) candidate="$trimmed" ;;
*) candidate="http://$trimmed" ;;
esac
# Require protocol plus "something.something" before any path slash.
if echo "$candidate" | grep -qE '^https?://[^/]+\.[^/]+'; then
echo "$candidate"
else
echo "INVALID: $trimmed"
fi
done
'''
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path
api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""