diff --git a/lush_bench/config.py b/lush_bench/config.py index 68cd8f0..5b39075 100644 --- a/lush_bench/config.py +++ b/lush_bench/config.py @@ -13,6 +13,7 @@ class Config: timeout_seconds: float = 10.0 normalize_whitespace: bool = True output_dir: Path = Path("results") + max_workers: int = 4 provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict) @classmethod @@ -35,5 +36,6 @@ class Config: timeout_seconds=agent.get("timeout_seconds", 10.0), normalize_whitespace=agent.get("normalize_whitespace", True), output_dir=Path(results.get("output_dir", "results")), + max_workers=agent.get("max_workers", 4), provider_configs=provider_configs, ) diff --git a/lush_bench/export.py b/lush_bench/export.py index 87ab917..28f3ec3 100644 --- a/lush_bench/export.py +++ b/lush_bench/export.py @@ -13,7 +13,6 @@ import matplotlib.ticker as ticker from .models import BenchmarkResult from .report import ( LIKERT_QUESTIONS, - _get_freeform, _get_likert_scores, _parse_likert, load_latest_results, @@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str: def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]: """Return {question_key: {bash: avg, lush: avg}}.""" agg: dict[str, dict[str, list[float]]] = {} - for key, _ in LIKERT_QUESTIONS: + for key, _, _ in LIKERT_QUESTIONS: agg[key] = {"bash": [], "lush": []} for r in results: scores = _get_likert_scores(r) @@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str: """Grouped horizontal bar chart comparing bash vs lush on each Likert metric.""" avgs = _aggregate_likert(results) - labels = [label for _, label in LIKERT_QUESTIONS] - bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS] - lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS] + labels = [label for _, label, _ in LIKERT_QUESTIONS] + bash_vals = [avgs[key]["bash"] for key, _, _ in 
LIKERT_QUESTIONS] + lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS] - fig, ax = plt.subplots(figsize=(8, 4.5)) + fig, ax = plt.subplots(figsize=(8, 7)) y = range(len(labels)) bar_h = 0.35 bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR) @@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str: def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str: """Heatmap showing lush-minus-bash score diff per task and metric.""" - labels = [label for _, label in LIKERT_QUESTIONS] + labels = [label for _, label, _ in LIKERT_QUESTIONS] tasks = [r.task_name for r in results] data: list[list[float]] = [] for r in results: scores = _get_likert_scores(r) row = [] - for key, _ in LIKERT_QUESTIONS: + for key, _, _ in LIKERT_QUESTIONS: b = scores[key]["bash"] l = scores[key]["lush"] if b is not None and l is not None: @@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str: row.append(0.0) data.append(row) - fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1))) + fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1))) im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3) ax.set_xticks(range(len(labels))) - ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8) + ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7) ax.set_yticks(range(len(tasks))) ax.set_yticklabels(tasks, fontsize=8) @@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str: for j in range(len(labels)): val = data[i][j] text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0" - ax.text(j, i, text, ha="center", va="center", fontsize=8, + ax.text(j, i, text, ha="center", va="center", fontsize=7, color="white" if abs(val) >= 2 else "black") ax.set_title("Score Difference (Lush - Bash)") @@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str: def 
chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]: - """Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush.""" + """Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush.""" import numpy as np from collections import defaultdict @@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, by_cat[r.category].append(r) charts: list[tuple[str, str]] = [] - labels = [label for _, label in LIKERT_QUESTIONS] + labels = [label for _, label, _ in LIKERT_QUESTIONS] for cat in sorted(by_cat): cat_results = by_cat[cat] agg: dict[str, dict[str, list[float]]] = {} - for key, _ in LIKERT_QUESTIONS: + for key, _, _ in LIKERT_QUESTIONS: agg[key] = {"bash": [], "lush": []} for r in cat_results: scores = _get_likert_scores(r) @@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, if val is not None: agg[key][lang].append(val) - bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS] - lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS] + bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS] + lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS] - fig, ax = plt.subplots(figsize=(6, 3.5)) + fig, ax = plt.subplots(figsize=(7, 5)) y = range(len(labels)) bar_h = 0.35 ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR) @@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str: scores = _get_likert_scores(r) score_rows = [] - for key, label in LIKERT_QUESTIONS: + for key, label, _ in LIKERT_QUESTIONS: b_val = scores[key]["bash"] l_val = scores[key]["lush"] b_str = f"{b_val:.0f}" if b_val is not None else 
"-" @@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str: f'{b_str}{l_str}' f'{d_str}') - obs = _get_freeform(r) - obs_html = "" - for lang, text in obs.items(): - obs_html += f'

{lang}: {html.escape(text)}

\n' - sections.append(f"""

{html.escape(r.task_name)} [{r.category}/{r.mode}] @@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str: MetricBashLushDiff {"".join(score_rows)} -
{obs_html}

""") return "\n".join(sections) @@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None: .scores {{ width: auto; }} .scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }} .scores th:nth-child(n+2) {{ text-align: center; }} - .observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }} - .observations p {{ margin-bottom: 6px; }} diff --git a/lush_bench/providers/anthropic.py b/lush_bench/providers/anthropic.py index 0b52673..77acea2 100644 --- a/lush_bench/providers/anthropic.py +++ b/lush_bench/providers/anthropic.py @@ -1,6 +1,8 @@ from __future__ import annotations import os +import threading +import time from typing import Any import anthropic @@ -17,8 +19,17 @@ class AnthropicProvider: self._client = anthropic.Anthropic(api_key=api_key) self._model = config.get("model", "claude-sonnet-4-20250514") self._max_tokens = config.get("max_tokens", 4096) + self._min_request_interval = config.get("min_request_interval", 0.1) + self._last_request_time = 0.0 + self._lock = threading.Lock() def send(self, messages: list[Message], system: str = "") -> str: + with self._lock: + elapsed = time.monotonic() - self._last_request_time + if elapsed < self._min_request_interval: + time.sleep(self._min_request_interval - elapsed) + self._last_request_time = time.monotonic() + api_messages = [{"role": m.role, "content": m.content} for m in messages] kwargs: dict[str, Any] = { "model": self._model, diff --git a/lush_bench/questionnaire.py b/lush_bench/questionnaire.py index 81ba02e..fe4fd28 100644 --- a/lush_bench/questionnaire.py +++ b/lush_bench/questionnaire.py @@ -7,42 +7,38 @@ from .models import QuestionnaireResponse from .providers.base import LLMProvider, Message QUESTIONS = [ - { - "question": "Readability: The solution is easy to read and understand", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, - { - "question": "Expressiveness: The language provided 
sufficient constructs to solve the problem naturally", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, - { - "question": "Conciseness: The solution required minimal boilerplate", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, - { - "question": "Error handling: Error handling was straightforward", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, - { - "question": "Overall preference: I would prefer this language for similar tasks", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, - { - "question": "Learning curve: An unfamiliar developer could understand the solution quickly", - "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"], - }, + # Syntax & Readability + {"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"}, + {"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"}, + {"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"}, + # Expressiveness + {"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"}, + {"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"}, + {"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"}, + # Data & I/O + {"id": "io_ergonomics", "dimension": 
"Data & I/O", "question": "Reading input and producing output is straightforward in this language"}, + {"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"}, + # Error Handling + {"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"}, + {"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"}, + # Overall + {"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"}, + {"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"}, ] +CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"] + def build_questionnaire_prompt( task_name: str, language: str, solution_code: str, ) -> str: + choices_str = ", ".join(f'"{c}"' for c in CHOICES) + questions_text = "" - for i, q in enumerate(QUESTIONS, 1): - choices_str = ", ".join(f'"{c}"' for c in q["choices"]) - questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": }},\n' + for q in QUESTIONS: + questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": }},\n' return f"""You just solved the task "{task_name}" in {language}. Here is your solution: @@ -50,11 +46,20 @@ def build_questionnaire_prompt( {solution_code} ``` -Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text. +Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task. + +Respond with ONLY a JSON array — no other text. 
For "selected", use one of: {choices_str} [ -{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": ""}} -]""" +{questions_text}]""" + + +def _extract_int(value: str) -> int | None: + """Extract leading digit from a response like '4 - Agree'.""" + s = value.strip() + if s and s[0].isdigit(): + return int(s[0]) + return None def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]: @@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]: results = [] for item in data: + question_id = item.get("id", item.get("question", "")) + raw_selected = item.get("selected", "") + + # Normalize to int + if isinstance(raw_selected, int): + selected: int | str = raw_selected + else: + parsed = _extract_int(str(raw_selected)) + selected = parsed if parsed is not None else raw_selected + results.append( QuestionnaireResponse( - question=item.get("question", ""), - selected=item.get("selected", ""), - choices=item.get("choices"), + question=question_id, + selected=selected, ) ) return results diff --git a/lush_bench/report.py b/lush_bench/report.py index e696ce3..f1baeee 100644 --- a/lush_bench/report.py +++ b/lush_bench/report.py @@ -5,16 +5,32 @@ from pathlib import Path from .models import BenchmarkResult -# Likert questions in order (must match questionnaire.py QUESTIONS) +# New 12-item question list: (key, label, dimension) LIKERT_QUESTIONS = [ - ("Readability", "Readability"), - ("Expressiveness", "Expressiveness"), - ("Conciseness", "Conciseness"), - ("Error handling", "Error handling"), - ("Overall preference", "Overall preference"), - ("Learning curve", "Learning curve"), + ("syntax_clarity", "Syntax clarity", "Syntax & Readability"), + ("signal_to_noise", "Signal-to-noise", "Syntax & Readability"), + ("familiar_conventions", "Familiar conventions", "Syntax & Readability"), + ("builtin_ops", "Built-in operations", "Expressiveness"), + ("string_ops", "String 
operations", "Expressiveness"), + ("composition", "Composition", "Expressiveness"), + ("io_ergonomics", "I/O ergonomics", "Data & I/O"), + ("data_structures", "Data structures", "Data & I/O"), + ("error_model", "Error model", "Error Handling"), + ("edge_case_support", "Edge case support", "Error Handling"), + ("learnability", "Learnability", "Overall"), + ("fitness", "Fitness for task", "Overall"), ] +# Map old 6 legacy keys to new keys for back-compat with existing results +LEGACY_KEY_MAP = { + "Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"], + "Expressiveness": ["builtin_ops", "string_ops", "composition"], + "Conciseness": ["signal_to_noise"], + "Error handling": ["error_model", "edge_case_support"], + "Overall preference": ["fitness"], + "Learning curve": ["learnability"], +} + def load_latest_results(results_dir: Path) -> list[BenchmarkResult]: """Load results, keeping only the latest run per task name.""" @@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]: def _parse_likert(selected: str | int) -> int | None: - """Extract numeric value from a likert response like '4 - Agree'.""" + """Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'.""" if isinstance(selected, int): return selected s = str(selected).strip() @@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None: def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]: - """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.""" + """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}. + + Handles both new-format results (exact id match) and legacy results (startswith match + mapped to new keys). 
+ """ scores: dict[str, dict[str, float | None]] = {} - for key, _ in LIKERT_QUESTIONS: + for key, _, _ in LIKERT_QUESTIONS: scores[key] = {"bash": None, "lush": None} for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]: if not lang_result: continue for q in lang_result.questionnaire: - for key, _ in LIKERT_QUESTIONS: - if q.question.startswith(key): + # Try exact match on new question ids + if q.question in scores: + val = _parse_likert(q.selected) + if val is not None: + scores[q.question][lang_name] = float(val) + continue + + # Legacy: map old key to new keys (spread the score) + for legacy_prefix, new_keys in LEGACY_KEY_MAP.items(): + if q.question.startswith(legacy_prefix): val = _parse_likert(q.selected) if val is not None: - scores[key][lang_name] = float(val) + for nk in new_keys: + if scores[nk][lang_name] is None: + scores[nk][lang_name] = float(val) break return scores @@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str: return "\u2588" * filled + "\u2591" * (width - filled) -def _get_freeform(result: BenchmarkResult) -> dict[str, str]: - """Extract free-form observations per language.""" - obs: dict[str, str] = {} - for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]: - if not lang_result: - continue - for q in lang_result.questionnaire: - if q.question.startswith("Free-form"): - obs[lang_name] = str(q.selected) - break - return obs - - def render_summary_table(results: list[BenchmarkResult]) -> str: """Render the pass/fail + turns overview table.""" lines: list[str] = [] @@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str: def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str: - """Render aggregated questionnaire scores with bar charts.""" + """Render aggregated questionnaire scores with bar charts, grouped by dimension.""" lines: list[str] = [] lines.append("=" * 78) lines.append(" 
QUESTIONNAIRE SCORES (1-5 Likert, higher = better)") @@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str: # Aggregate scores across all tasks agg: dict[str, dict[str, list[float]]] = {} - for key, _ in LIKERT_QUESTIONS: + for key, _, _ in LIKERT_QUESTIONS: agg[key] = {"bash": [], "lush": []} for r in results: @@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str: if val is not None: agg[key][lang].append(val) - for key, label in LIKERT_QUESTIONS: + # Group by dimension + current_dim = None + for key, label, dimension in LIKERT_QUESTIONS: + if dimension != current_dim: + if current_dim is not None: + lines.append("") + lines.append(f" [{dimension}]") + current_dim = dimension + b_vals = agg[key]["bash"] l_vals = agg[key]["lush"] b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0 @@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str: diff = l_avg - b_avg diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0" - lines.append(f" {label}") - lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}") - lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})") - lines.append("") + lines.append(f" {label}") + lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}") + lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})") # Overall average all_bash = [v for key in agg for v in agg[key]["bash"]] @@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str: diff = l_overall - b_overall diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0" + lines.append("") lines.append(" " + "-" * 50) lines.append(f" Overall average") lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}") @@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str: lines.append(f" {cat}") agg: dict[str, dict[str, list[float]]] = {} - for key, _ in LIKERT_QUESTIONS: 
+ for key, _, _ in LIKERT_QUESTIONS: agg[key] = {"bash": [], "lush": []} for r in cat_results: scores = _get_likert_scores(r) @@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str: if val is not None: agg[key][lang].append(val) - for key, label in LIKERT_QUESTIONS: + for key, label, _ in LIKERT_QUESTIONS: b_vals = agg[key]["bash"] l_vals = agg[key]["lush"] b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0 @@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str: scores = _get_likert_scores(r) lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}") lines.append(" " + "-" * 40) - for key, label in LIKERT_QUESTIONS: + for key, label, _ in LIKERT_QUESTIONS: b_val = scores[key]["bash"] l_val = scores[key]["lush"] b_str = f"{b_val:.0f}" if b_val is not None else "-" @@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str: d_str = "-" lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}") - # Free-form observations - obs = _get_freeform(r) - if obs: - lines.append("") - for lang, text in obs.items(): - # Wrap long text - wrapped = text[:120] + ("..." 
if len(text) > 120 else "") - lines.append(f" {lang}: {wrapped}") - lines.append("") return "\n".join(lines) diff --git a/main.py b/main.py index 77a2db2..daae578 100644 --- a/main.py +++ b/main.py @@ -2,7 +2,9 @@ from __future__ import annotations import argparse import sys +import threading import tomllib +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone from pathlib import Path @@ -16,6 +18,8 @@ from lush_bench.export import export_html from lush_bench.report import render_report from lush_bench.results import save_result +_print_lock = threading.Lock() + PROVIDERS = { "anthropic": AnthropicProvider, @@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None: print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}") -def cmd_run(args: argparse.Namespace) -> None: - config = Config.load() - task_path = Path(args.task) +def _log(msg: str) -> None: + """Thread-safe print.""" + with _print_lock: + print(msg) + + +def _run_task( + task_path: Path, + provider_name: str, + config: Config, + provider: AnthropicProvider | None = None, +) -> BenchmarkResult: + """Core task runner. Thread-safe — usable from cmd_run or a thread pool.""" task = load_task(task_path) - provider_name = args.provider - if provider_name not in PROVIDERS: - print(f"Unknown provider: {provider_name}. 
Available: {', '.join(PROVIDERS)}") - sys.exit(1) + if provider is None: + provider_config = config.provider_configs.get(provider_name, {}) + provider = PROVIDERS[provider_name](provider_config) - provider_config = config.provider_configs.get(provider_name, {}) - provider = PROVIDERS[provider_name](provider_config) timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") - print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}") + _log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}") bash_result = None lush_result = None if task.mode == "solve": - # Solve mode: agent writes code in both languages - print(" Solving in bash...") + _log(f" [{task.name}] Solving in bash...") bash_result = solve_task(provider, task, "bash", config) - print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)") + _log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)") - print(" Solving in lush...") + _log(f" [{task.name}] Solving in lush...") lush_result = solve_task(provider, task, "lush", config) - print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)") + _log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)") elif task.mode == "convert": - # Convert mode: verify provided bash source directly, then convert to lush assert task.bash_source, f"Convert-mode task {task.name} missing bash_source" - print(" Verifying provided bash source...") + _log(f" [{task.name}] Verifying provided bash source...") test_results = evaluate(task, task.bash_source, "bash", config) all_passed = all(tr.passed for tr in test_results) bash_result = LanguageResult( @@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None: all_passed=all_passed, agent_turns=0, ) - print(f" Bash: {'PASS' if all_passed else 'FAIL'}") + _log(f" 
[{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}") - print(" Converting to lush...") + _log(f" [{task.name}] Converting to lush...") lush_result = solve_task(provider, task, "lush", config) - print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)") + _log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)") # Run questionnaire for each completed language for lang, result in [("bash", bash_result), ("lush", lush_result)]: if result and result.solution_code: - print(f" Questionnaire for {lang}...") + _log(f" [{task.name}] Questionnaire for {lang}...") result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code) benchmark = BenchmarkResult( @@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None: ) result_dir = save_result(benchmark, config.output_dir) - print(f" Results saved to {result_dir}") + _log(f" [{task.name}] Results saved to {result_dir}") + return benchmark + + +def cmd_run(args: argparse.Namespace) -> None: + config = Config.load() + + provider_name = args.provider + if provider_name not in PROVIDERS: + print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}") + sys.exit(1) + + provider_config = config.provider_configs.get(provider_name, {}) + provider = PROVIDERS[provider_name](provider_config) + _run_task(Path(args.task), provider_name, config, provider) def cmd_run_all(args: argparse.Namespace) -> None: + config = Config.load() paths = find_tasks(args.category, getattr(args, "mode", None)) if not paths: print("No tasks found.") return - for p in paths: - # Reuse cmd_run by constructing a namespace - run_args = argparse.Namespace(task=str(p), provider=args.provider) - cmd_run(run_args) - print() + provider_name = args.provider + if provider_name not in PROVIDERS: + print(f"Unknown provider: {provider_name}. 
Available: {', '.join(PROVIDERS)}") + sys.exit(1) + + # Share one provider instance across threads (its rate limiter is thread-safe) + provider_config = config.provider_configs.get(provider_name, {}) + provider = PROVIDERS[provider_name](provider_config) + + max_workers = args.workers if args.workers is not None else config.max_workers + print(f"Running {len(paths)} tasks with {max_workers} workers") + + failed: list[str] = [] + with ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = { + pool.submit(_run_task, p, provider_name, config, provider): p + for p in paths + } + for future in as_completed(futures): + task_path = futures[future] + try: + future.result() + except Exception as exc: + task_name = task_path.stem + failed.append(task_name) + _log(f" [{task_name}] FAILED: {exc}") + + print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.") + if failed: + print(f"Failed: {', '.join(failed)}") def cmd_report(args: argparse.Namespace) -> None: @@ -183,6 +232,7 @@ def main() -> None: ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category") ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode") ra.add_argument("--provider", default="anthropic", help="LLM provider") + ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)") ra.set_defaults(func=cmd_run_all) # report diff --git a/report.html b/report.html index 79acd1a..a13361b 100644 --- a/report.html +++ b/report.html @@ -30,14 +30,12 @@ .scores { width: auto; } .scores td:nth-child(n+2) { text-align: center; min-width: 50px; } .scores th:nth-child(n+2) { text-align: center; } - .observations { margin-top: 12px; font-size: 0.85rem; color: #444; } - .observations p { margin-bottom: 6px; }

Lush vs Bash Benchmark Report

-

Model: claude-sonnet-4-20250514 · Latest run: 20260327T224550Z · Tasks: 12

+

Model: claude-sonnet-4-20250514 · Latest run: 20260401T183152Z · Tasks: 18

Summary

@@ -60,12 +58,16 @@ - + + + + + @@ -73,32 +75,52 @@ - + + + + + + + + + + + + + + + + + - + + + + + - + - - + +
PASS1
env_configenvironmentPASS3FAIL4 PASS2
env_path_builderenvironment PASS0 PASS1
path_normalizerenvironmentPASS0PASS1
file_organizerfilesystem FAIL4
multi_file_searchfilesystem PASS1PASS3PASS2
todo_managerfilesystemPASS0PASS1
csv_transformpipeline PASS0 PASS1
currency_converterpipelinePASS0PASS1
locale_weather_urlpipelinePASS0PASS1
log_parserpipeline PASS0 PASS1
network_info_parserpipelinePASS0PASS1
pipeline_transformpipeline PASS1PASS2PASS1
pipeline_word_freqpipeline PASS0 PASS1
url_normalizerpipelinePASS0PASS1
process_exit_codesprocessPASS3PASS4 PASS1
Total11/1212/1216/1818/18
@@ -114,44 +136,44 @@ algorithm 3/33/3 1.01.0 - 3.64.1 + 3.53.9 environment - 2/22/2 - 3.01.5 - 3.13.7 + 2/33/3 + 4.01.3 + 2.83.9 filesystem - 1/22/2 - 2.52.0 - 3.23.9 + 2/33/3 + 2.51.3 + 3.13.8 pipeline - 4/44/4 - 1.01.2 - 3.64.0 + 8/88/8 + 1.01.0 + 3.03.9 process 1/11/1 - 3.01.0 - 3.84.3 + 4.01.0 + 3.24.0

Questionnaire Scores

-
Questionnaire comparison
+
Questionnaire comparison

Questionnaire Scores by Category

-
Per-category questionnaire
+
Per-category questionnaire

Agent Turns (Solve Mode)

-
Turns comparison
+
Turns comparison

Score Difference Heatmap (Lush - Bash)

-
Score heatmap
+
Score heatmap

Per-Category Breakdown

-

algorithm

algorithm breakdown

environment

environment breakdown

filesystem

filesystem breakdown

pipeline

pipeline breakdown

process

process breakdown
+

algorithm

algorithm breakdown

environment

environment breakdown

filesystem

filesystem breakdown

pipeline

pipeline breakdown

process

process breakdown

Per-Task Detail

@@ -162,11 +184,8 @@ - +
MetricBashLushDiff
Readability45+1
Expressiveness45+1
Conciseness45+1
Error handling24+2
Overall preference34+1
Learning curve35+2
Syntax clarity35+2
Signal-to-noise34+1
Familiar conventions45+1
Built-in operations45+1
String operations440
Composition53-2
I/O ergonomics54-1
Data structures440
Error model23+1
Edge case support23+1
Learnability35+2
Fitness for task440
-

bash: Bash works well for FizzBuzz with clean arithmetic operations and conditionals. The C-style for loop syntax is familiar, but bash's quirky syntax like double parentheses for arithmetic might confuse newcomers. Error handling for invalid input would require additional validation that bash doesn't make particularly elegant.

-

lush: Lush's Lua-like syntax made the FizzBuzz implementation very straightforward and clean. The language offers excellent readability with familiar control structures and operators. The modulo operator, conditional statements, and print function work exactly as expected, requiring no special syntax or workarounds.

-
@@ -176,11 +195,8 @@ - +
MetricBashLushDiff
Readability550
Expressiveness550
Conciseness550
Error handling330
Overall preference440
Learning curve440
Syntax clarity440
Signal-to-noise54-1
Familiar conventions25+3
Built-in operations550
String operations45+1
Composition54-1
I/O ergonomics54-1
Data structures440
Error model330
Edge case support330
Learnability45+1
Fitness for task440
-

bash: Bash excels at text manipulation tasks like this due to its built-in utilities like 'rev'. The solution is extremely concise and leverages the shell's natural pipeline approach. However, error handling would require additional checks for edge cases like empty input or command failures.

-

lush: Lush appears to be Lua-based with very familiar syntax and built-in string manipulation functions. The solution was extremely straightforward - just read input and use the built-in reverse function. The language feels mature and well-designed for simple scripting tasks like this.

-
@@ -190,25 +206,19 @@ - +
MetricBashLushDiff
Readability440
Expressiveness34+1
Conciseness43-1
Error handling220
Overall preference23+1
Learning curve34+1
Syntax clarity24+2
Signal-to-noise23+1
Familiar conventions34+1
Built-in operations440
String operations34+1
Composition43-1
I/O ergonomics440
Data structures34+1
Error model23+1
Edge case support220
Learnability34+1
Fitness for task24+2
-

bash: Bash works for simple algorithmic problems but lacks data structures and type safety that would make this more robust. The C-style for loop syntax feels awkward compared to bash's typical idioms, and array handling is somewhat clunky. Better suited for text processing and system tasks than algorithmic problems.

-

lush: Lua's syntax is clean and straightforward for this algorithmic problem. The hash table implementation using tables works well, and string parsing with gmatch is reasonably elegant. However, the input parsing required some manual work, and there's limited built-in error handling. The 1-based indexing required careful adjustment for 0-based output. Overall functional but not particularly specialized for competitive programming tasks.

-

env_config [environment/solve] - bash=PASS + bash=FAIL lush=PASS

- +
MetricBashLushDiff
Readability34+1
Expressiveness34+1
Conciseness23+1
Error handling23+1
Overall preference34+1
Learning curve24+2
Syntax clarity24+2
Signal-to-noise23+1
Familiar conventions24+2
Built-in operations34+1
String operations34+1
Composition45+1
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support23+1
Learnability24+2
Fitness for task35+2
-

bash: Bash's regex matching and array handling work well for this task, but the syntax is arcane (${!key} for indirect variable access, BASH_REMATCH array). The printf-pipe-sort pattern for array sorting is a common bash idiom but not intuitive. Error handling would require additional validation that bash doesn't make easy. Good for system administration contexts where bash is expected, but the syntax barriers make it less accessible than higher-level languages.

-

lush: Lush combines Lua's familiarity with shell-like conveniences effectively. The backtick syntax for command execution and built-in functions like lush.envset feel natural. However, the solution required more manual string processing than ideal - having to parse env output and match keys could be more elegant with better built-in environment variable introspection.

-
@@ -218,11 +228,19 @@ - + +
MetricBashLushDiff
Readability440
Expressiveness440
Conciseness440
Error handling330
Overall preference440
Learning curve330
Syntax clarity24+2
Signal-to-noise24+2
Familiar conventions34+1
Built-in operations330
String operations330
Composition43-1
I/O ergonomics45+1
Data structures440
Error model23+1
Edge case support330
Learnability24+2
Fitness for task440
+
+ +
+

path_normalizer [environment/convert] + bash=PASS + lush=PASS +

+ + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise24+2
Familiar conventions25+3
Built-in operations24+2
String operations35+2
Composition43-1
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support330
Learnability24+2
Fitness for task34+1
-

bash: Bash is well-suited for PATH manipulation tasks since it's designed for system administration. The string manipulation features work naturally for this problem, though the syntax quirks (like the pattern matching in conditionals) might confuse newcomers. The solution handles edge cases like empty input and duplicate detection efficiently.

-

lush: Lush's combination of Lua scripting with shell-like environment variable access ($MYPATH) made this task quite natural. The string manipulation and pattern matching capabilities were well-suited for path building. However, the mix of Lua and shell syntax might confuse developers not familiar with both paradigms.

-
@@ -232,11 +250,8 @@ - +
MetricBashLushDiff
Readability34+1
Expressiveness25+3
Conciseness24+2
Error handling24+2
Overall preference24+2
Learning curve24+2
Syntax clarity24+2
Signal-to-noise34+1
Familiar conventions25+3
Built-in operations45+1
String operations34+1
Composition43-1
I/O ergonomics45+1
Data structures35+2
Error model24+2
Edge case support23+1
Learnability24+2
Fitness for task45+1
-

bash: Bash required several workarounds and awkward constructs: dynamic variable creation with eval (security risk), manual string building for output sorting, complex parameter expansion syntax, and careful escaping. While bash excels at file operations, the lack of proper data structures made this solution verbose and harder to maintain compared to higher-level languages.

-

lush: Lush's shell command interpolation with backticks and the seamless integration with Lua made this file organization task very natural to implement. The ability to execute shell commands inline while maintaining full Lua data structures for processing was particularly elegant for this type of system administration task.

-
@@ -246,11 +261,19 @@ - + +
MetricBashLushDiff
Readability440
Expressiveness54-1
Conciseness53-2
Error handling34+1
Overall preference54-1
Learning curve330
Syntax clarity440
Signal-to-noise43-1
Familiar conventions34+1
Built-in operations52-3
String operations440
Composition53-2
I/O ergonomics54-1
Data structures34+1
Error model23+1
Edge case support23+1
Learnability24+2
Fitness for task53-2
+
+ +
+

todo_manager [filesystem/convert] + bash=PASS + lush=PASS +

+ + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise24+2
Familiar conventions34+1
Built-in operations23+1
String operations24+2
Composition43-1
I/O ergonomics440
Data structures24+2
Error model220
Edge case support23+1
Learnability34+1
Fitness for task34+1
-

bash: Bash excels at text processing and file operations like this. The pipeline approach with grep, sed, and sort feels very natural. However, the syntax can be cryptic for newcomers (especially the sort flags), and proper error handling would require additional complexity that might hurt readability.

-

lush: Lush combines Lua's simplicity with shell command integration nicely. The backtick syntax for command execution is intuitive, and the Lua-based file I/O and string processing felt natural. However, the file globbing required some manual parsing and filtering that felt a bit verbose compared to dedicated shell scripting. Overall, it strikes a good balance between scripting convenience and programming language features.

-
@@ -260,11 +283,30 @@ - + +
MetricBashLushDiff
Readability440
Expressiveness440
Conciseness550
Error handling23+1
Overall preference440
Learning curve34+1
Syntax clarity24+2
Signal-to-noise34+1
Familiar conventions24+2
Built-in operations440
String operations440
Composition53-2
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support23+1
Learnability24+2
Fitness for task440
+
+ +
+

currency_converter [pipeline/convert] + bash=PASS + lush=PASS +

+ + + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise23+1
Familiar conventions24+2
Built-in operations12+1
String operations34+1
Composition440
I/O ergonomics440
Data structures24+2
Error model23+1
Edge case support330
Learnability24+2
Fitness for task24+2
+
+ +
+

locale_weather_url [pipeline/convert] + bash=PASS + lush=PASS +

+ + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise34+1
Familiar conventions24+2
Built-in operations25+3
String operations35+2
Composition440
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support34+1
Learnability24+2
Fitness for task45+1
-

bash: Bash excels at simple text processing tasks like this CSV transformation. The IFS mechanism and read command make parsing straightforward, and the solution is very concise. However, bash's quirky syntax (like the `|| [[ -n "$name" ]]` construct for handling files without trailing newlines) can be confusing for newcomers. For more complex CSV handling with edge cases, a dedicated tool or language might be better.

-

lush: Lush's Lua-based syntax made the CSV parsing task quite natural. The pattern matching with match() worked well for simple CSV parsing, and the io functions provided clean file handling. The solution was concise and readable, though proper CSV parsing (handling quoted fields, escapes) would require more complex regex or a dedicated library.

-
@@ -274,11 +316,19 @@ - + +
MetricBashLushDiff
Readability35+2
Expressiveness440
Conciseness54-1
Error handling23+1
Overall preference440
Learning curve24+2
Syntax clarity24+2
Signal-to-noise34+1
Familiar conventions24+2
Built-in operations440
String operations45+1
Composition53-2
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support330
Learnability24+2
Fitness for task440
+
+ +
+

network_info_parser [pipeline/convert] + bash=PASS + lush=PASS +

+ + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise24+2
Familiar conventions35+2
Built-in operations25+3
String operations25+3
Composition440
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support23+1
Learnability24+2
Fitness for task45+1
-

bash: Bash excels at text processing pipelines like this, making it very concise. However, the syntax for robust input handling and parameter expansion can be cryptic. The pipeline approach is elegant but debugging can be challenging since errors may not propagate clearly through the chain.

-

lush: Lua's syntax is clean and readable for text processing tasks. The pattern matching with string.match() worked well for parsing log lines. However, the manual table sorting approach feels slightly verbose compared to languages with built-in sorted collections. The 'or 0' idiom for default values is elegant. Overall, Lua provides a good balance of simplicity and power for this type of scripting task.

-
@@ -288,11 +338,8 @@ - +
MetricBashLushDiff
Readability440
Expressiveness54-1
Conciseness53-2
Error handling34+1
Overall preference54-1
Learning curve34+1
Syntax clarity34+1
Signal-to-noise53-2
Familiar conventions24+2
Built-in operations53-2
String operations440
Composition52-3
I/O ergonomics54-1
Data structures440
Error model23+1
Edge case support23+1
Learnability34+1
Fitness for task53-2
-

bash: Bash excels at pipeline transformations like this - the pipe operator creates a natural flow that mirrors the logical steps. The regex in sed might be intimidating to newcomers, but the overall structure is intuitive. Error handling is implicit (pipes fail if any command fails) but lacks granular control.

-

lush: Lush felt very much like Lua, which made the text processing task straightforward. The pattern matching and string manipulation functions were intuitive. However, the manual deduplication logic required more verbose code than some other languages might need. Overall, it provided good control and clarity for this pipeline transformation task.

-
@@ -302,11 +349,19 @@ - + +
MetricBashLushDiff
Readability24+2
Expressiveness440
Conciseness54-1
Error handling34+1
Overall preference440
Learning curve24+2
Syntax clarity24+2
Signal-to-noise440
Familiar conventions25+3
Built-in operations54-1
String operations45+1
Composition53-2
I/O ergonomics54-1
Data structures34+1
Error model23+1
Edge case support330
Learnability24+2
Fitness for task54-1
+
+ +
+

url_normalizer [pipeline/convert] + bash=PASS + lush=PASS +

+ + +
MetricBashLushDiff
Syntax clarity24+2
Signal-to-noise24+2
Familiar conventions34+1
Built-in operations24+2
String operations35+2
Composition440
I/O ergonomics45+1
Data structures34+1
Error model23+1
Edge case support34+1
Learnability24+2
Fitness for task34+1
-

bash: Bash excels at text processing pipelines with its built-in tools like tr, grep, sort, and uniq. The solution is extremely concise but requires knowledge of Unix command-line tools and their options. The pipe-based approach is natural for this problem, but the syntax can be cryptic with options like 'tr -cs' and 'sort -k1,1rn'. The final while loop with the complex condition is necessary but makes the code less readable.

-

lush: Lush feels very much like Lua with clean syntax for text processing. The gmatch function for pattern matching and the flexible table structure made the word frequency counting natural. The sorting with custom comparison functions worked smoothly. Overall, it struck a good balance between simplicity and power for this text processing task.

-
@@ -316,11 +371,8 @@ - +
MetricBashLushDiff
Readability45+1
Expressiveness45+1
Conciseness550
Error handling330
Overall preference440
Learning curve34+1
Syntax clarity24+2
Signal-to-noise34+1
Familiar conventions24+2
Built-in operations45+1
String operations440
Composition53-2
I/O ergonomics45+1
Data structures34+1
Error model330
Edge case support23+1
Learnability24+2
Fitness for task550
-

bash: Bash is well-suited for this task since it naturally handles command execution and exit codes. The main challenges are bash-specific syntax quirks like the read loop condition and variable arithmetic. The use of eval introduces potential security concerns but is necessary for dynamic command execution. Overall, bash feels like the right tool for this system administration type task.

-

lush: Lush combines the simplicity of shell scripting with Lua's readability. The command execution syntax `${command}` is intuitive and the ability to access exit codes via .code is clean. The Lua-based control structures and string handling make it much more readable than pure bash while maintaining the shell's natural command execution feel.

-
diff --git a/tasks/environment/path_normalizer.toml b/tasks/environment/path_normalizer.toml new file mode 100644 index 0000000..abd120d --- /dev/null +++ b/tasks/environment/path_normalizer.toml @@ -0,0 +1,97 @@ +name = "path_normalizer" +category = "environment" +mode = "convert" +description = """ +Read file paths from stdin, one per line. Normalize each path: +1. Replace a leading "~" with the value of $HOME. +2. Remove trailing slashes (except for root "/"). +3. Collapse consecutive slashes into one. +4. Resolve "." components (remove them). +5. Resolve ".." components (go up one directory level). +Output the cleaned path, one per line. +Skip empty lines. +""" + +bash_source = ''' +#!/bin/bash +while IFS= read -r line || [[ -n "$line" ]]; do + [[ -z "$line" ]] && continue + + # Expand tilde + path=$(echo "$line" | sed "s:^~:$HOME:") + + # Collapse multiple slashes + path=$(echo "$path" | sed 's:/\\+:/:g') + + # Remove trailing slash (but keep root) + path=$(echo "$path" | sed 's:/$::') + [[ -z "$path" ]] && path="/" + + # Resolve . and .. components + IFS='/' read -ra parts <<< "$path" + result=() + absolute="" + if [[ "$path" == /* ]]; then + absolute="/" + fi + + for part in "${parts[@]}"; do + if [[ "$part" == "." || "$part" == "" ]]; then + continue + elif [[ "$part" == ".." ]]; then + if [[ ${#result[@]} -gt 0 && "${result[-1]}" != ".." ]]; then + unset 'result[${#result[@]}-1]' + elif [[ -z "$absolute" ]]; then + result+=("..") + fi + else + result+=("$part") + fi + done + + if [[ -n "$absolute" ]]; then + final="/" + IFS='/'; final+="${result[*]}" + else + IFS='/'; final="${result[*]}" + fi + [[ -z "$final" ]] && final="/" + + echo "$final" +done +''' + +[[test_cases]] +description = "Tilde expansion and trailing slashes" +stdin = """~/Documents/ +~/projects/code +/var/log/""" +expected_stdout = """/Users/testuser/Documents +/Users/testuser/projects/code +/var/log""" +env = { "HOME" = "/Users/testuser" } + +[[test_cases]] +description = "Resolving . 
and .. components" +stdin = """/usr/local/./bin +/home/user/../shared/docs +/a/b/c/../../d""" +expected_stdout = """/usr/local/bin +/home/shared/docs +/a/d""" + +[[test_cases]] +description = "Collapsing multiple slashes" +stdin = """/usr//local///bin +/var/log//syslog""" +expected_stdout = """/usr/local/bin +/var/log/syslog""" + +[[test_cases]] +description = "Root and relative paths" +stdin = """/ +./config/settings +../parent/child""" +expected_stdout = """/ +config/settings +../parent/child""" diff --git a/tasks/filesystem/todo_manager.toml b/tasks/filesystem/todo_manager.toml new file mode 100644 index 0000000..330da44 --- /dev/null +++ b/tasks/filesystem/todo_manager.toml @@ -0,0 +1,150 @@ +name = "todo_manager" +category = "filesystem" +mode = "convert" +description = """ +A simple todo list manager. Read commands from stdin, one per line: + add — append the task to the todo file + list — print all tasks numbered as "NN). " + remove — remove task number N (1-based) + clear — remove all tasks + +The todo file is "todo.txt" in the working directory. +When listing, pad task numbers to two digits (01, 02, …). +After "add" or "remove", automatically list the remaining tasks. +If the list is empty, print "No tasks found". +""" + +bash_source = ''' +#!/bin/bash +TODOFILE="./todo.txt" + +list_tasks() { + if [ -f "$TODOFILE" ] && [ -s "$TODOFILE" ]; then + count=1 + IFS=$'\\n' + while read -r task; do + num=$count + if [ $count -lt 10 ]; then num="0$count"; fi + echo "$num). $task" + count=$(( count + 1 )) + done < "$TODOFILE" + else + echo "No tasks found" + fi +} + +add_task() { + echo "$1" >> "$TODOFILE" +} + +remove_task() { + taskNum=$1 + totalLines=$(wc -l < "$TODOFILE" | tr -d ' ') + if [ "$taskNum" -gt "$totalLines" ] 2>/dev/null; then + echo "Error: task number $taskNum does not exist!" 
+ return 1 + fi + tmpfile="./todo_tmp.txt" + count=1 + IFS=$'\\n' + while read -r task; do + if [ "$count" -ne "$taskNum" ]; then + echo "$task" >> "$tmpfile" + fi + count=$(( count + 1 )) + done < "$TODOFILE" + if [ -f "$tmpfile" ]; then + mv "$tmpfile" "$TODOFILE" + else + > "$TODOFILE" + fi + echo "Successfully removed task number $taskNum" +} + +clear_tasks() { + > "$TODOFILE" + echo "Tasks cleared." +} + +if [ ! -f "$TODOFILE" ]; then + touch "$TODOFILE" +fi + +while IFS= read -r line || [[ -n "$line" ]]; do + cmd=$(echo "$line" | cut -d' ' -f1) + arg=$(echo "$line" | cut -d' ' -f2-) + case "$cmd" in + add) + add_task "$arg" + list_tasks + ;; + list) + list_tasks + ;; + remove) + remove_task "$arg" + list_tasks + ;; + clear) + clear_tasks + ;; + esac +done +''' + +[[test_cases]] +description = "Add tasks then list" +stdin = """add Buy groceries +add Walk the dog +list""" +expected_stdout = """01). Buy groceries +01). Buy groceries +02). Walk the dog +01). Buy groceries +02). Walk the dog""" + +[[test_cases]] +description = "Add, remove, list" +stdin = """add First task +add Second task +add Third task +remove 2 +list""" +expected_stdout = """01). First task +01). First task +02). Second task +01). First task +02). Second task +03). Third task +Successfully removed task number 2 +01). First task +02). Third task +01). First task +02). Third task""" + +[[test_cases]] +description = "Empty list and clear" +stdin = """list +add Something +clear +list""" +expected_stdout = """No tasks found +01). Something +Tasks cleared. +No tasks found""" + +[[test_cases]] +description = "Works with pre-existing todo file" +stdin = """list +add Third item +list""" +setup_files = { "todo.txt" = "Existing item one\nExisting item two\n" } +expected_stdout = """01). Existing item one +02). Existing item two +01). Existing item one +02). Existing item two +03). Third item +01). Existing item one +02). Existing item two +03). 
Third item""" +expected_files = { "todo.txt" = "Existing item one\nExisting item two\nThird item\n" } diff --git a/tasks/pipeline/currency_converter.toml b/tasks/pipeline/currency_converter.toml new file mode 100644 index 0000000..6b0d7aa --- /dev/null +++ b/tasks/pipeline/currency_converter.toml @@ -0,0 +1,113 @@ +name = "currency_converter" +category = "pipeline" +mode = "convert" +description = """ +A currency converter that reads conversion requests from stdin. +Each line has the format: AMOUNT FROM TO RATE + - AMOUNT: a decimal number (e.g., 12.35) + - FROM: 3-letter currency code + - TO: 3-letter currency code + - RATE: the exchange rate from FROM's base to TO's base + +Some currencies are pegged to others at fixed rates: + BAM is pegged to EUR at 1.95583 + BMD is pegged to USD at 1.0 + BND is pegged to SGD at 1.0 + DJF is pegged to USD at 177.721 + PAB is pegged to USD at 1.0 + +When a pegged currency is involved, the conversion must account for the +peg coefficient. The formula is: result = amount * (rate / coef_from) * coef_to +where coef is the peg ratio (1 if not pegged). + +Output one line per input: "AMOUNT FROM = RESULT TO" with RESULT +computed using bc with scale=2. +For invalid lines (wrong field count or non-numeric amount), output "ERROR: ". +""" + +bash_source = ''' +#!/bin/bash + +pegged_to() { + case "$1" in + BAM) echo "EUR:1.95583" ;; + BMD) echo "USD:1.0" ;; + BND) echo "SGD:1.0" ;; + DJF) echo "USD:177.721" ;; + PAB) echo "USD:1.0" ;; + *) echo "NONE:1" ;; + esac +} + +while IFS= read -r line || [[ -n "$line" ]]; do + # Skip empty lines + [[ -z "$line" ]] && continue + + # Parse fields + set -- $line + if [[ $# -ne 4 ]]; then + echo "ERROR: $line" + continue + fi + + amount=$1 + from=$2 + to=$3 + rate=$4 + + # Validate amount is numeric + if [[ ! "$amount" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + echo "ERROR: $line" + continue + fi + + # Validate rate is numeric + if [[ ! 
"$rate" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + echo "ERROR: $line" + continue + fi + + # Get peg info + peg_from=$(pegged_to "$from") + coef_from=$(echo "$peg_from" | cut -d: -f2) + + peg_to=$(pegged_to "$to") + coef_to=$(echo "$peg_to" | cut -d: -f2) + + # Calculate: result = amount * (rate / coef_from) * coef_to + result=$(echo "scale=8; $amount * ($rate / $coef_from) * $coef_to" | bc) + result=$(printf "%.2f" "$result") + + echo "$amount $from = $result $to" +done +''' + +[[test_cases]] +description = "Standard conversion with direct rate" +stdin = """100 USD EUR 0.92 +50 GBP JPY 188.50""" +expected_stdout = """100 USD = 92.00 EUR +50 GBP = 9425.00 JPY""" + +[[test_cases]] +description = "Pegged currency conversions" +stdin = """100 BAM USD 1.08 +200 BMD EUR 0.92 +50 USD DJF 1.0""" +expected_stdout = """100 BAM = 55.22 USD +200 BMD = 184.00 EUR +50 USD = 8886.05 DJF""" + +[[test_cases]] +description = "Invalid input lines" +stdin = """abc EUR USD 0.92 +100 USD +100 EUR USD 0.85""" +expected_stdout = """ERROR: abc EUR USD 0.92 +ERROR: 100 USD +100 EUR = 85.00 USD""" + +[[test_cases]] +description = "Pegged-to-pegged conversion" +stdin = "100 BAM BMD 1.08" +expected_stdout = "100 BAM = 55.22 BMD" diff --git a/tasks/pipeline/locale_weather_url.toml b/tasks/pipeline/locale_weather_url.toml new file mode 100644 index 0000000..5327743 --- /dev/null +++ b/tasks/pipeline/locale_weather_url.toml @@ -0,0 +1,76 @@ +name = "locale_weather_url" +category = "pipeline" +mode = "convert" +description = """ +Construct weather API URLs from locale and location information. +Read lines from stdin in the format: LANG_CODE LOCATION +where LANG_CODE is a 2-letter locale (e.g., "en", "fr", "de") +and LOCATION is a city/place name (may contain spaces). + +For each line, construct a URL in the format: + https://LANG.wttr.in/LOCATION + +Where spaces in the location are replaced with "+" characters. + +If LANG_CODE is empty or invalid (not exactly 2 lowercase letters), +default to "en". 
+ +Skip empty lines. Output one URL per input line. +""" + +bash_source = ''' +#!/bin/bash +while IFS= read -r line || [[ -n "$line" ]]; do + [[ -z "$line" ]] && continue + + # Extract lang code (first field) + lang=$(echo "$line" | awk '{print $1}') + + # Extract location (everything after first field) + location=$(echo "$line" | sed 's/^[^ ]* *//') + + # Validate lang code: must be exactly 2 lowercase letters + if [[ ! "$lang" =~ ^[a-z]{2}$ ]]; then + lang="en" + fi + + # If location is same as lang (single-word line), skip + if [[ "$location" == "$lang" || -z "$location" ]]; then + location="" + fi + + # Replace spaces with + + location=$(echo "$location" | tr ' ' '+') + + echo "https://$lang.wttr.in/$location" +done +''' + +[[test_cases]] +description = "Various locales and locations" +stdin = """en New York +fr Paris +de Berlin +ja Tokyo""" +expected_stdout = """https://en.wttr.in/New+York +https://fr.wttr.in/Paris +https://de.wttr.in/Berlin +https://ja.wttr.in/Tokyo""" + +[[test_cases]] +description = "Multi-word locations" +stdin = """en San Francisco +es Buenos Aires +pt Rio de Janeiro""" +expected_stdout = """https://en.wttr.in/San+Francisco +https://es.wttr.in/Buenos+Aires +https://pt.wttr.in/Rio+de+Janeiro""" + +[[test_cases]] +description = "Invalid or missing locale defaults to en" +stdin = """ENG London +123 Moscow +x Rome""" +expected_stdout = """https://en.wttr.in/London +https://en.wttr.in/Moscow +https://en.wttr.in/Rome""" diff --git a/tasks/pipeline/network_info_parser.toml b/tasks/pipeline/network_info_parser.toml new file mode 100644 index 0000000..55d2eae --- /dev/null +++ b/tasks/pipeline/network_info_parser.toml @@ -0,0 +1,86 @@ +name = "network_info_parser" +category = "pipeline" +mode = "convert" +description = """ +Parse network interface configuration from stdin (in "ip addr show" format) +and extract a summary of each interface. 
+ +For each interface block, output a line: + IFACE: IP: MASK: / + +An interface block starts with a line like: + 2: eth0: mtu 1500 ... +and contains inet lines like: + inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0 + +If an interface has no inet line, output: + IFACE: IP: none MASK: none + +Skip the loopback interface (lo). +""" + +bash_source = ''' +#!/bin/bash + +current_iface="" +found_ip="" +found_mask="" + +flush_iface() { + if [[ -n "$current_iface" && "$current_iface" != "lo" ]]; then + if [[ -n "$found_ip" ]]; then + echo "IFACE: $current_iface IP: $found_ip MASK: /$found_mask" + else + echo "IFACE: $current_iface IP: none MASK: none" + fi + fi +} + +while IFS= read -r line || [[ -n "$line" ]]; do + # Detect interface line: starts with a number followed by colon + if echo "$line" | grep -qE '^[0-9]+:'; then + flush_iface + current_iface=$(echo "$line" | awk -F: '{print $2}' | sed 's/^[[:space:]]*//' | awk '{print $1}') + found_ip="" + found_mask="" + fi + + # Detect inet line (IPv4 only, not inet6) + if echo "$line" | grep -qE '^[[:space:]]+inet [0-9]'; then + ip_cidr=$(echo "$line" | awk '{print $2}') + found_ip=$(echo "$ip_cidr" | cut -d/ -f1) + found_mask=$(echo "$ip_cidr" | cut -d/ -f2) + fi +done + +flush_iface +''' + +[[test_cases]] +description = "Two interfaces with IPs" +stdin = """1: lo: mtu 65536 + inet 127.0.0.1/8 scope host lo +2: eth0: mtu 1500 + inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0 +3: wlan0: mtu 1500 + inet 10.0.0.42/16 brd 10.0.255.255 scope global wlan0""" +expected_stdout = """IFACE: eth0 IP: 192.168.1.100 MASK: /24 +IFACE: wlan0 IP: 10.0.0.42 MASK: /16""" + +[[test_cases]] +description = "Interface with no IP" +stdin = """1: lo: mtu 65536 + inet 127.0.0.1/8 scope host lo +2: eth0: mtu 1500 +3: docker0: mtu 1500 + inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0""" +expected_stdout = """IFACE: eth0 IP: none MASK: none +IFACE: docker0 IP: 172.17.0.1 MASK: /16""" + +[[test_cases]] +description = 
"Single interface" +stdin = """1: lo: mtu 65536 + inet 127.0.0.1/8 scope host lo +2: enp3s0: mtu 9000 + inet 10.10.10.5/8 brd 10.255.255.255 scope global enp3s0""" +expected_stdout = "IFACE: enp3s0 IP: 10.10.10.5 MASK: /8" diff --git a/tasks/pipeline/url_normalizer.toml b/tasks/pipeline/url_normalizer.toml new file mode 100644 index 0000000..1e56bc0 --- /dev/null +++ b/tasks/pipeline/url_normalizer.toml @@ -0,0 +1,79 @@ +name = "url_normalizer" +category = "pipeline" +mode = "convert" +description = """ +Read URLs from stdin, one per line. Normalize each URL: +1. If the URL already starts with "https://", keep it as-is. +2. If it starts with "http://", keep it as-is. +3. Otherwise, prepend "http://" to it. +4. After normalization, validate that the URL matches a basic pattern: + it must have a protocol (http:// or https://), followed by at least + one character, a dot, and at least one more character for the domain. +5. Output the normalized URL, or "INVALID: " for invalid entries. + +Skip empty lines silently. 
+""" + +bash_source = ''' +#!/bin/bash +while IFS= read -r line || [[ -n "$line" ]]; do + # Skip empty lines + [[ -z "$line" ]] && continue + + # Trim whitespace + url=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + [[ -z "$url" ]] && continue + + original="$url" + + # Check if it already has https:// + prefix8=$(echo "$url" | cut -c1-8) + if [[ "$prefix8" == "https://" ]]; then + normalized="$url" + else + prefix7=$(echo "$url" | cut -c1-7) + if [[ "$prefix7" == "http://" ]]; then + normalized="$url" + else + normalized="http://$url" + fi + fi + + # Validate: protocol + something.something + if echo "$normalized" | grep -qE '^https?://[^/]+\.[^/]+'; then + echo "$normalized" + else + echo "INVALID: $original" + fi +done +''' + +[[test_cases]] +description = "URLs with and without protocol" +stdin = """example.com +http://example.com +https://example.com +www.google.com/search?q=test""" +expected_stdout = """http://example.com +http://example.com +https://example.com +http://www.google.com/search?q=test""" + +[[test_cases]] +description = "Invalid entries" +stdin = """notaurl +https://valid.example.com +just-a-word""" +expected_stdout = """INVALID: notaurl +https://valid.example.com +INVALID: just-a-word""" + +[[test_cases]] +description = "Mixed valid and empty lines" +stdin = """https://secure.site.org/path + +api.service.io:8080 +http://old.site.net""" +expected_stdout = """https://secure.site.org/path +http://api.service.io:8080 +http://old.site.net"""