Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop free-form question. Responses now stored as ints, not strings. - Back-compat layer maps legacy keys to new dimensions so existing results still render. - Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider. - Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
@@ -13,6 +13,7 @@ class Config:
|
|||||||
timeout_seconds: float = 10.0
|
timeout_seconds: float = 10.0
|
||||||
normalize_whitespace: bool = True
|
normalize_whitespace: bool = True
|
||||||
output_dir: Path = Path("results")
|
output_dir: Path = Path("results")
|
||||||
|
max_workers: int = 4
|
||||||
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -35,5 +36,6 @@ class Config:
|
|||||||
timeout_seconds=agent.get("timeout_seconds", 10.0),
|
timeout_seconds=agent.get("timeout_seconds", 10.0),
|
||||||
normalize_whitespace=agent.get("normalize_whitespace", True),
|
normalize_whitespace=agent.get("normalize_whitespace", True),
|
||||||
output_dir=Path(results.get("output_dir", "results")),
|
output_dir=Path(results.get("output_dir", "results")),
|
||||||
|
max_workers=agent.get("max_workers", 4),
|
||||||
provider_configs=provider_configs,
|
provider_configs=provider_configs,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
|
|||||||
from .models import BenchmarkResult
|
from .models import BenchmarkResult
|
||||||
from .report import (
|
from .report import (
|
||||||
LIKERT_QUESTIONS,
|
LIKERT_QUESTIONS,
|
||||||
_get_freeform,
|
|
||||||
_get_likert_scores,
|
_get_likert_scores,
|
||||||
_parse_likert,
|
_parse_likert,
|
||||||
load_latest_results,
|
load_latest_results,
|
||||||
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
|
|||||||
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
|
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
|
||||||
"""Return {question_key: {bash: avg, lush: avg}}."""
|
"""Return {question_key: {bash: avg, lush: avg}}."""
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in results:
|
for r in results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
|
|||||||
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||||
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
|
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
|
||||||
avgs = _aggregate_likert(results)
|
avgs = _aggregate_likert(results)
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
|
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
|
||||||
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
|
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(8, 4.5))
|
fig, ax = plt.subplots(figsize=(8, 7))
|
||||||
y = range(len(labels))
|
y = range(len(labels))
|
||||||
bar_h = 0.35
|
bar_h = 0.35
|
||||||
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||||
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||||
"""Heatmap showing lush-minus-bash score diff per task and metric."""
|
"""Heatmap showing lush-minus-bash score diff per task and metric."""
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
tasks = [r.task_name for r in results]
|
tasks = [r.task_name for r in results]
|
||||||
|
|
||||||
data: list[list[float]] = []
|
data: list[list[float]] = []
|
||||||
for r in results:
|
for r in results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
row = []
|
row = []
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
b = scores[key]["bash"]
|
b = scores[key]["bash"]
|
||||||
l = scores[key]["lush"]
|
l = scores[key]["lush"]
|
||||||
if b is not None and l is not None:
|
if b is not None and l is not None:
|
||||||
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
|||||||
row.append(0.0)
|
row.append(0.0)
|
||||||
data.append(row)
|
data.append(row)
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
|
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
|
||||||
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
|
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
|
||||||
|
|
||||||
ax.set_xticks(range(len(labels)))
|
ax.set_xticks(range(len(labels)))
|
||||||
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
|
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
|
||||||
ax.set_yticks(range(len(tasks)))
|
ax.set_yticks(range(len(tasks)))
|
||||||
ax.set_yticklabels(tasks, fontsize=8)
|
ax.set_yticklabels(tasks, fontsize=8)
|
||||||
|
|
||||||
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
|||||||
for j in range(len(labels)):
|
for j in range(len(labels)):
|
||||||
val = data[i][j]
|
val = data[i][j]
|
||||||
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
|
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
|
||||||
ax.text(j, i, text, ha="center", va="center", fontsize=8,
|
ax.text(j, i, text, ha="center", va="center", fontsize=7,
|
||||||
color="white" if abs(val) >= 2 else "black")
|
color="white" if abs(val) >= 2 else "black")
|
||||||
|
|
||||||
ax.set_title("Score Difference (Lush - Bash)")
|
ax.set_title("Score Difference (Lush - Bash)")
|
||||||
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
|
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
|
||||||
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
|
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
|||||||
by_cat[r.category].append(r)
|
by_cat[r.category].append(r)
|
||||||
|
|
||||||
charts: list[tuple[str, str]] = []
|
charts: list[tuple[str, str]] = []
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
for cat in sorted(by_cat):
|
for cat in sorted(by_cat):
|
||||||
cat_results = by_cat[cat]
|
cat_results = by_cat[cat]
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in cat_results:
|
for r in cat_results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||||
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(6, 3.5))
|
fig, ax = plt.subplots(figsize=(7, 5))
|
||||||
y = range(len(labels))
|
y = range(len(labels))
|
||||||
bar_h = 0.35
|
bar_h = 0.35
|
||||||
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||||
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
score_rows = []
|
score_rows = []
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_val = scores[key]["bash"]
|
b_val = scores[key]["bash"]
|
||||||
l_val = scores[key]["lush"]
|
l_val = scores[key]["lush"]
|
||||||
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
||||||
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
f'<td>{b_str}</td><td>{l_str}</td>'
|
f'<td>{b_str}</td><td>{l_str}</td>'
|
||||||
f'<td class="{d_cls}">{d_str}</td></tr>')
|
f'<td class="{d_cls}">{d_str}</td></tr>')
|
||||||
|
|
||||||
obs = _get_freeform(r)
|
|
||||||
obs_html = ""
|
|
||||||
for lang, text in obs.items():
|
|
||||||
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
|
|
||||||
|
|
||||||
sections.append(f"""
|
sections.append(f"""
|
||||||
<div class="task-detail">
|
<div class="task-detail">
|
||||||
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
||||||
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
|
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
|
||||||
<tbody>{"".join(score_rows)}</tbody>
|
<tbody>{"".join(score_rows)}</tbody>
|
||||||
</table>
|
</table>
|
||||||
<div class="observations">{obs_html}</div>
|
|
||||||
</div>""")
|
</div>""")
|
||||||
|
|
||||||
return "\n".join(sections)
|
return "\n".join(sections)
|
||||||
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
|
|||||||
.scores {{ width: auto; }}
|
.scores {{ width: auto; }}
|
||||||
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
|
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
|
||||||
.scores th:nth-child(n+2) {{ text-align: center; }}
|
.scores th:nth-child(n+2) {{ text-align: center; }}
|
||||||
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
|
|
||||||
.observations p {{ margin-bottom: 6px; }}
|
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import anthropic
|
import anthropic
|
||||||
@@ -17,8 +19,17 @@ class AnthropicProvider:
|
|||||||
self._client = anthropic.Anthropic(api_key=api_key)
|
self._client = anthropic.Anthropic(api_key=api_key)
|
||||||
self._model = config.get("model", "claude-sonnet-4-20250514")
|
self._model = config.get("model", "claude-sonnet-4-20250514")
|
||||||
self._max_tokens = config.get("max_tokens", 4096)
|
self._max_tokens = config.get("max_tokens", 4096)
|
||||||
|
self._min_request_interval = config.get("min_request_interval", 0.1)
|
||||||
|
self._last_request_time = 0.0
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
def send(self, messages: list[Message], system: str = "") -> str:
|
def send(self, messages: list[Message], system: str = "") -> str:
|
||||||
|
with self._lock:
|
||||||
|
elapsed = time.monotonic() - self._last_request_time
|
||||||
|
if elapsed < self._min_request_interval:
|
||||||
|
time.sleep(self._min_request_interval - elapsed)
|
||||||
|
self._last_request_time = time.monotonic()
|
||||||
|
|
||||||
api_messages = [{"role": m.role, "content": m.content} for m in messages]
|
api_messages = [{"role": m.role, "content": m.content} for m in messages]
|
||||||
kwargs: dict[str, Any] = {
|
kwargs: dict[str, Any] = {
|
||||||
"model": self._model,
|
"model": self._model,
|
||||||
|
|||||||
@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
|
|||||||
from .providers.base import LLMProvider, Message
|
from .providers.base import LLMProvider, Message
|
||||||
|
|
||||||
QUESTIONS = [
|
QUESTIONS = [
|
||||||
{
|
# Syntax & Readability
|
||||||
"question": "Readability: The solution is easy to read and understand",
|
{"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
|
||||||
},
|
{"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
|
||||||
{
|
# Expressiveness
|
||||||
"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
|
{"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
|
||||||
},
|
{"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
|
||||||
{
|
# Data & I/O
|
||||||
"question": "Conciseness: The solution required minimal boilerplate",
|
{"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
|
||||||
},
|
# Error Handling
|
||||||
{
|
{"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
|
||||||
"question": "Error handling: Error handling was straightforward",
|
{"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
# Overall
|
||||||
},
|
{"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
|
||||||
{
|
{"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
|
||||||
"question": "Overall preference: I would prefer this language for similar tasks",
|
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"question": "Learning curve: An unfamiliar developer could understand the solution quickly",
|
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
|
||||||
},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
|
||||||
|
|
||||||
|
|
||||||
def build_questionnaire_prompt(
|
def build_questionnaire_prompt(
|
||||||
task_name: str,
|
task_name: str,
|
||||||
language: str,
|
language: str,
|
||||||
solution_code: str,
|
solution_code: str,
|
||||||
) -> str:
|
) -> str:
|
||||||
|
choices_str = ", ".join(f'"{c}"' for c in CHOICES)
|
||||||
|
|
||||||
questions_text = ""
|
questions_text = ""
|
||||||
for i, q in enumerate(QUESTIONS, 1):
|
for q in QUESTIONS:
|
||||||
choices_str = ", ".join(f'"{c}"' for c in q["choices"])
|
questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
|
||||||
questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
|
|
||||||
|
|
||||||
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
|
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
|
||||||
|
|
||||||
@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
|
|||||||
{solution_code}
|
{solution_code}
|
||||||
```
|
```
|
||||||
|
|
||||||
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
|
Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
|
||||||
|
|
||||||
|
Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}
|
||||||
|
|
||||||
[
|
[
|
||||||
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
|
{questions_text}]"""
|
||||||
]"""
|
|
||||||
|
|
||||||
|
def _extract_int(value: str) -> int | None:
|
||||||
|
"""Extract leading digit from a response like '4 - Agree'."""
|
||||||
|
s = value.strip()
|
||||||
|
if s and s[0].isdigit():
|
||||||
|
return int(s[0])
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
||||||
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
|||||||
|
|
||||||
results = []
|
results = []
|
||||||
for item in data:
|
for item in data:
|
||||||
|
question_id = item.get("id", item.get("question", ""))
|
||||||
|
raw_selected = item.get("selected", "")
|
||||||
|
|
||||||
|
# Normalize to int
|
||||||
|
if isinstance(raw_selected, int):
|
||||||
|
selected: int | str = raw_selected
|
||||||
|
else:
|
||||||
|
parsed = _extract_int(str(raw_selected))
|
||||||
|
selected = parsed if parsed is not None else raw_selected
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
QuestionnaireResponse(
|
QuestionnaireResponse(
|
||||||
question=item.get("question", ""),
|
question=question_id,
|
||||||
selected=item.get("selected", ""),
|
selected=selected,
|
||||||
choices=item.get("choices"),
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|||||||
@@ -5,16 +5,32 @@ from pathlib import Path
|
|||||||
|
|
||||||
from .models import BenchmarkResult
|
from .models import BenchmarkResult
|
||||||
|
|
||||||
# Likert questions in order (must match questionnaire.py QUESTIONS)
|
# New 12-item question list: (key, label, dimension)
|
||||||
LIKERT_QUESTIONS = [
|
LIKERT_QUESTIONS = [
|
||||||
("Readability", "Readability"),
|
("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
|
||||||
("Expressiveness", "Expressiveness"),
|
("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
|
||||||
("Conciseness", "Conciseness"),
|
("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
|
||||||
("Error handling", "Error handling"),
|
("builtin_ops", "Built-in operations", "Expressiveness"),
|
||||||
("Overall preference", "Overall preference"),
|
("string_ops", "String operations", "Expressiveness"),
|
||||||
("Learning curve", "Learning curve"),
|
("composition", "Composition", "Expressiveness"),
|
||||||
|
("io_ergonomics", "I/O ergonomics", "Data & I/O"),
|
||||||
|
("data_structures", "Data structures", "Data & I/O"),
|
||||||
|
("error_model", "Error model", "Error Handling"),
|
||||||
|
("edge_case_support", "Edge case support", "Error Handling"),
|
||||||
|
("learnability", "Learnability", "Overall"),
|
||||||
|
("fitness", "Fitness for task", "Overall"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Map old 6 legacy keys to new keys for back-compat with existing results
|
||||||
|
LEGACY_KEY_MAP = {
|
||||||
|
"Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
|
||||||
|
"Expressiveness": ["builtin_ops", "string_ops", "composition"],
|
||||||
|
"Conciseness": ["signal_to_noise"],
|
||||||
|
"Error handling": ["error_model", "edge_case_support"],
|
||||||
|
"Overall preference": ["fitness"],
|
||||||
|
"Learning curve": ["learnability"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
||||||
"""Load results, keeping only the latest run per task name."""
|
"""Load results, keeping only the latest run per task name."""
|
||||||
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_likert(selected: str | int) -> int | None:
|
def _parse_likert(selected: str | int) -> int | None:
|
||||||
"""Extract numeric value from a likert response like '4 - Agree'."""
|
"""Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
|
||||||
if isinstance(selected, int):
|
if isinstance(selected, int):
|
||||||
return selected
|
return selected
|
||||||
s = str(selected).strip()
|
s = str(selected).strip()
|
||||||
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
|
|||||||
|
|
||||||
|
|
||||||
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
|
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
|
||||||
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
|
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
|
||||||
|
|
||||||
|
Handles both new-format results (exact id match) and legacy results (startswith match
|
||||||
|
mapped to new keys).
|
||||||
|
"""
|
||||||
scores: dict[str, dict[str, float | None]] = {}
|
scores: dict[str, dict[str, float | None]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
scores[key] = {"bash": None, "lush": None}
|
scores[key] = {"bash": None, "lush": None}
|
||||||
|
|
||||||
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
||||||
if not lang_result:
|
if not lang_result:
|
||||||
continue
|
continue
|
||||||
for q in lang_result.questionnaire:
|
for q in lang_result.questionnaire:
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
# Try exact match on new question ids
|
||||||
if q.question.startswith(key):
|
if q.question in scores:
|
||||||
|
val = _parse_likert(q.selected)
|
||||||
|
if val is not None:
|
||||||
|
scores[q.question][lang_name] = float(val)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Legacy: map old key to new keys (spread the score)
|
||||||
|
for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
|
||||||
|
if q.question.startswith(legacy_prefix):
|
||||||
val = _parse_likert(q.selected)
|
val = _parse_likert(q.selected)
|
||||||
if val is not None:
|
if val is not None:
|
||||||
scores[key][lang_name] = float(val)
|
for nk in new_keys:
|
||||||
|
if scores[nk][lang_name] is None:
|
||||||
|
scores[nk][lang_name] = float(val)
|
||||||
break
|
break
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
|
|||||||
return "\u2588" * filled + "\u2591" * (width - filled)
|
return "\u2588" * filled + "\u2591" * (width - filled)
|
||||||
|
|
||||||
|
|
||||||
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
|
|
||||||
"""Extract free-form observations per language."""
|
|
||||||
obs: dict[str, str] = {}
|
|
||||||
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
|
||||||
if not lang_result:
|
|
||||||
continue
|
|
||||||
for q in lang_result.questionnaire:
|
|
||||||
if q.question.startswith("Free-form"):
|
|
||||||
obs[lang_name] = str(q.selected)
|
|
||||||
break
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
def render_summary_table(results: list[BenchmarkResult]) -> str:
|
def render_summary_table(results: list[BenchmarkResult]) -> str:
|
||||||
"""Render the pass/fail + turns overview table."""
|
"""Render the pass/fail + turns overview table."""
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||||
"""Render aggregated questionnaire scores with bar charts."""
|
"""Render aggregated questionnaire scores with bar charts, grouped by dimension."""
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
lines.append("=" * 78)
|
lines.append("=" * 78)
|
||||||
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
|
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
|
||||||
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
# Aggregate scores across all tasks
|
# Aggregate scores across all tasks
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
|
|
||||||
for r in results:
|
for r in results:
|
||||||
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
for key, label in LIKERT_QUESTIONS:
|
# Group by dimension
|
||||||
|
current_dim = None
|
||||||
|
for key, label, dimension in LIKERT_QUESTIONS:
|
||||||
|
if dimension != current_dim:
|
||||||
|
if current_dim is not None:
|
||||||
|
lines.append("")
|
||||||
|
lines.append(f" [{dimension}]")
|
||||||
|
current_dim = dimension
|
||||||
|
|
||||||
b_vals = agg[key]["bash"]
|
b_vals = agg[key]["bash"]
|
||||||
l_vals = agg[key]["lush"]
|
l_vals = agg[key]["lush"]
|
||||||
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
||||||
@@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
diff = l_avg - b_avg
|
diff = l_avg - b_avg
|
||||||
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
||||||
|
|
||||||
lines.append(f" {label}")
|
lines.append(f" {label}")
|
||||||
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
|
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
|
||||||
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
|
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
# Overall average
|
# Overall average
|
||||||
all_bash = [v for key in agg for v in agg[key]["bash"]]
|
all_bash = [v for key in agg for v in agg[key]["bash"]]
|
||||||
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
diff = l_overall - b_overall
|
diff = l_overall - b_overall
|
||||||
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
||||||
|
|
||||||
|
lines.append("")
|
||||||
lines.append(" " + "-" * 50)
|
lines.append(" " + "-" * 50)
|
||||||
lines.append(f" Overall average")
|
lines.append(f" Overall average")
|
||||||
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
|
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
|
||||||
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
lines.append(f" {cat}")
|
lines.append(f" {cat}")
|
||||||
|
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in cat_results:
|
for r in cat_results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_vals = agg[key]["bash"]
|
b_vals = agg[key]["bash"]
|
||||||
l_vals = agg[key]["lush"]
|
l_vals = agg[key]["lush"]
|
||||||
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
||||||
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
|||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
|
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
|
||||||
lines.append(" " + "-" * 40)
|
lines.append(" " + "-" * 40)
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_val = scores[key]["bash"]
|
b_val = scores[key]["bash"]
|
||||||
l_val = scores[key]["lush"]
|
l_val = scores[key]["lush"]
|
||||||
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
||||||
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
|||||||
d_str = "-"
|
d_str = "-"
|
||||||
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
|
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
|
||||||
|
|
||||||
# Free-form observations
|
|
||||||
obs = _get_freeform(r)
|
|
||||||
if obs:
|
|
||||||
lines.append("")
|
|
||||||
for lang, text in obs.items():
|
|
||||||
# Wrap long text
|
|
||||||
wrapped = text[:120] + ("..." if len(text) > 120 else "")
|
|
||||||
lines.append(f" {lang}: {wrapped}")
|
|
||||||
|
|
||||||
lines.append("")
|
lines.append("")
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|||||||
104
main.py
104
main.py
@@ -2,7 +2,9 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
import threading
|
||||||
import tomllib
|
import tomllib
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -16,6 +18,8 @@ from lush_bench.export import export_html
|
|||||||
from lush_bench.report import render_report
|
from lush_bench.report import render_report
|
||||||
from lush_bench.results import save_result
|
from lush_bench.results import save_result
|
||||||
|
|
||||||
|
_print_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
PROVIDERS = {
|
PROVIDERS = {
|
||||||
"anthropic": AnthropicProvider,
|
"anthropic": AnthropicProvider,
|
||||||
@@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None:
|
|||||||
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||||
|
|
||||||
|
|
||||||
def cmd_run(args: argparse.Namespace) -> None:
|
def _log(msg: str) -> None:
|
||||||
config = Config.load()
|
"""Thread-safe print."""
|
||||||
task_path = Path(args.task)
|
with _print_lock:
|
||||||
|
print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_task(
|
||||||
|
task_path: Path,
|
||||||
|
provider_name: str,
|
||||||
|
config: Config,
|
||||||
|
provider: AnthropicProvider | None = None,
|
||||||
|
) -> BenchmarkResult:
|
||||||
|
"""Core task runner. Thread-safe — usable from cmd_run or a thread pool."""
|
||||||
task = load_task(task_path)
|
task = load_task(task_path)
|
||||||
|
|
||||||
provider_name = args.provider
|
if provider is None:
|
||||||
if provider_name not in PROVIDERS:
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
provider_config = config.provider_configs.get(provider_name, {})
|
|
||||||
provider = PROVIDERS[provider_name](provider_config)
|
|
||||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||||
|
|
||||||
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
_log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||||
|
|
||||||
bash_result = None
|
bash_result = None
|
||||||
lush_result = None
|
lush_result = None
|
||||||
|
|
||||||
if task.mode == "solve":
|
if task.mode == "solve":
|
||||||
# Solve mode: agent writes code in both languages
|
_log(f" [{task.name}] Solving in bash...")
|
||||||
print(" Solving in bash...")
|
|
||||||
bash_result = solve_task(provider, task, "bash", config)
|
bash_result = solve_task(provider, task, "bash", config)
|
||||||
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
_log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||||
|
|
||||||
print(" Solving in lush...")
|
_log(f" [{task.name}] Solving in lush...")
|
||||||
lush_result = solve_task(provider, task, "lush", config)
|
lush_result = solve_task(provider, task, "lush", config)
|
||||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||||
|
|
||||||
elif task.mode == "convert":
|
elif task.mode == "convert":
|
||||||
# Convert mode: verify provided bash source directly, then convert to lush
|
|
||||||
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
||||||
print(" Verifying provided bash source...")
|
_log(f" [{task.name}] Verifying provided bash source...")
|
||||||
test_results = evaluate(task, task.bash_source, "bash", config)
|
test_results = evaluate(task, task.bash_source, "bash", config)
|
||||||
all_passed = all(tr.passed for tr in test_results)
|
all_passed = all(tr.passed for tr in test_results)
|
||||||
bash_result = LanguageResult(
|
bash_result = LanguageResult(
|
||||||
@@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None:
|
|||||||
all_passed=all_passed,
|
all_passed=all_passed,
|
||||||
agent_turns=0,
|
agent_turns=0,
|
||||||
)
|
)
|
||||||
print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
|
_log(f" [{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}")
|
||||||
|
|
||||||
print(" Converting to lush...")
|
_log(f" [{task.name}] Converting to lush...")
|
||||||
lush_result = solve_task(provider, task, "lush", config)
|
lush_result = solve_task(provider, task, "lush", config)
|
||||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||||
|
|
||||||
# Run questionnaire for each completed language
|
# Run questionnaire for each completed language
|
||||||
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
|
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
|
||||||
if result and result.solution_code:
|
if result and result.solution_code:
|
||||||
print(f" Questionnaire for {lang}...")
|
_log(f" [{task.name}] Questionnaire for {lang}...")
|
||||||
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
|
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
|
||||||
|
|
||||||
benchmark = BenchmarkResult(
|
benchmark = BenchmarkResult(
|
||||||
@@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
result_dir = save_result(benchmark, config.output_dir)
|
result_dir = save_result(benchmark, config.output_dir)
|
||||||
print(f" Results saved to {result_dir}")
|
_log(f" [{task.name}] Results saved to {result_dir}")
|
||||||
|
return benchmark
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_run(args: argparse.Namespace) -> None:
|
||||||
|
config = Config.load()
|
||||||
|
|
||||||
|
provider_name = args.provider
|
||||||
|
if provider_name not in PROVIDERS:
|
||||||
|
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
|
_run_task(Path(args.task), provider_name, config, provider)
|
||||||
|
|
||||||
|
|
||||||
def cmd_run_all(args: argparse.Namespace) -> None:
|
def cmd_run_all(args: argparse.Namespace) -> None:
|
||||||
|
config = Config.load()
|
||||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||||
if not paths:
|
if not paths:
|
||||||
print("No tasks found.")
|
print("No tasks found.")
|
||||||
return
|
return
|
||||||
|
|
||||||
for p in paths:
|
provider_name = args.provider
|
||||||
# Reuse cmd_run by constructing a namespace
|
if provider_name not in PROVIDERS:
|
||||||
run_args = argparse.Namespace(task=str(p), provider=args.provider)
|
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
||||||
cmd_run(run_args)
|
sys.exit(1)
|
||||||
print()
|
|
||||||
|
# Share one provider instance across threads (its rate limiter is thread-safe)
|
||||||
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
|
|
||||||
|
max_workers = args.workers if args.workers is not None else config.max_workers
|
||||||
|
print(f"Running {len(paths)} tasks with {max_workers} workers")
|
||||||
|
|
||||||
|
failed: list[str] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||||
|
futures = {
|
||||||
|
pool.submit(_run_task, p, provider_name, config, provider): p
|
||||||
|
for p in paths
|
||||||
|
}
|
||||||
|
for future in as_completed(futures):
|
||||||
|
task_path = futures[future]
|
||||||
|
try:
|
||||||
|
future.result()
|
||||||
|
except Exception as exc:
|
||||||
|
task_name = task_path.stem
|
||||||
|
failed.append(task_name)
|
||||||
|
_log(f" [{task_name}] FAILED: {exc}")
|
||||||
|
|
||||||
|
print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.")
|
||||||
|
if failed:
|
||||||
|
print(f"Failed: {', '.join(failed)}")
|
||||||
|
|
||||||
|
|
||||||
def cmd_report(args: argparse.Namespace) -> None:
|
def cmd_report(args: argparse.Namespace) -> None:
|
||||||
@@ -183,6 +232,7 @@ def main() -> None:
|
|||||||
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||||
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||||
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
||||||
|
ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)")
|
||||||
ra.set_defaults(func=cmd_run_all)
|
ra.set_defaults(func=cmd_run_all)
|
||||||
|
|
||||||
# report
|
# report
|
||||||
|
|||||||
202
report.html
202
report.html
File diff suppressed because one or more lines are too long
97
tasks/environment/path_normalizer.toml
Normal file
97
tasks/environment/path_normalizer.toml
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
name = "path_normalizer"
|
||||||
|
category = "environment"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Read file paths from stdin, one per line. Normalize each path:
|
||||||
|
1. Replace a leading "~" with the value of $HOME.
|
||||||
|
2. Remove trailing slashes (except for root "/").
|
||||||
|
3. Collapse consecutive slashes into one.
|
||||||
|
4. Resolve "." components (remove them).
|
||||||
|
5. Resolve ".." components (go up one directory level).
|
||||||
|
Output the cleaned path, one per line.
|
||||||
|
Skip empty lines.
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Expand tilde
|
||||||
|
path=$(echo "$line" | sed "s:^~:$HOME:")
|
||||||
|
|
||||||
|
# Collapse multiple slashes
|
||||||
|
path=$(echo "$path" | sed 's:/\\+:/:g')
|
||||||
|
|
||||||
|
# Remove trailing slash (but keep root)
|
||||||
|
path=$(echo "$path" | sed 's:/$::')
|
||||||
|
[[ -z "$path" ]] && path="/"
|
||||||
|
|
||||||
|
# Resolve . and .. components
|
||||||
|
IFS='/' read -ra parts <<< "$path"
|
||||||
|
result=()
|
||||||
|
absolute=""
|
||||||
|
if [[ "$path" == /* ]]; then
|
||||||
|
absolute="/"
|
||||||
|
fi
|
||||||
|
|
||||||
|
for part in "${parts[@]}"; do
|
||||||
|
if [[ "$part" == "." || "$part" == "" ]]; then
|
||||||
|
continue
|
||||||
|
elif [[ "$part" == ".." ]]; then
|
||||||
|
if [[ ${#result[@]} -gt 0 && "${result[-1]}" != ".." ]]; then
|
||||||
|
unset 'result[${#result[@]}-1]'
|
||||||
|
elif [[ -z "$absolute" ]]; then
|
||||||
|
result+=("..")
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
result+=("$part")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -n "$absolute" ]]; then
|
||||||
|
final="/"
|
||||||
|
IFS='/'; final+="${result[*]}"
|
||||||
|
else
|
||||||
|
IFS='/'; final="${result[*]}"
|
||||||
|
fi
|
||||||
|
[[ -z "$final" ]] && final="/"
|
||||||
|
|
||||||
|
echo "$final"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Tilde expansion and trailing slashes"
|
||||||
|
stdin = """~/Documents/
|
||||||
|
~/projects/code
|
||||||
|
/var/log/"""
|
||||||
|
expected_stdout = """/Users/testuser/Documents
|
||||||
|
/Users/testuser/projects/code
|
||||||
|
/var/log"""
|
||||||
|
env = { "HOME" = "/Users/testuser" }
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Resolving . and .. components"
|
||||||
|
stdin = """/usr/local/./bin
|
||||||
|
/home/user/../shared/docs
|
||||||
|
/a/b/c/../../d"""
|
||||||
|
expected_stdout = """/usr/local/bin
|
||||||
|
/home/shared/docs
|
||||||
|
/a/d"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Collapsing multiple slashes"
|
||||||
|
stdin = """/usr//local///bin
|
||||||
|
/var/log//syslog"""
|
||||||
|
expected_stdout = """/usr/local/bin
|
||||||
|
/var/log/syslog"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Root and relative paths"
|
||||||
|
stdin = """/
|
||||||
|
./config/settings
|
||||||
|
../parent/child"""
|
||||||
|
expected_stdout = """/
|
||||||
|
config/settings
|
||||||
|
../parent/child"""
|
||||||
150
tasks/filesystem/todo_manager.toml
Normal file
150
tasks/filesystem/todo_manager.toml
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
name = "todo_manager"
|
||||||
|
category = "filesystem"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
A simple todo list manager. Read commands from stdin, one per line:
|
||||||
|
add <task text> — append the task to the todo file
|
||||||
|
list — print all tasks numbered as "NN). <task>"
|
||||||
|
remove <N> — remove task number N (1-based)
|
||||||
|
clear — remove all tasks
|
||||||
|
|
||||||
|
The todo file is "todo.txt" in the working directory.
|
||||||
|
When listing, pad task numbers to two digits (01, 02, …).
|
||||||
|
After "add" or "remove", automatically list the remaining tasks.
|
||||||
|
If the list is empty, print "No tasks found".
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
TODOFILE="./todo.txt"
|
||||||
|
|
||||||
|
list_tasks() {
|
||||||
|
if [ -f "$TODOFILE" ] && [ -s "$TODOFILE" ]; then
|
||||||
|
count=1
|
||||||
|
IFS=$'\\n'
|
||||||
|
while read -r task; do
|
||||||
|
num=$count
|
||||||
|
if [ $count -lt 10 ]; then num="0$count"; fi
|
||||||
|
echo "$num). $task"
|
||||||
|
count=$(( count + 1 ))
|
||||||
|
done < "$TODOFILE"
|
||||||
|
else
|
||||||
|
echo "No tasks found"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
add_task() {
|
||||||
|
echo "$1" >> "$TODOFILE"
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_task() {
|
||||||
|
taskNum=$1
|
||||||
|
totalLines=$(wc -l < "$TODOFILE" | tr -d ' ')
|
||||||
|
if [ "$taskNum" -gt "$totalLines" ] 2>/dev/null; then
|
||||||
|
echo "Error: task number $taskNum does not exist!"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
tmpfile="./todo_tmp.txt"
|
||||||
|
count=1
|
||||||
|
IFS=$'\\n'
|
||||||
|
while read -r task; do
|
||||||
|
if [ "$count" -ne "$taskNum" ]; then
|
||||||
|
echo "$task" >> "$tmpfile"
|
||||||
|
fi
|
||||||
|
count=$(( count + 1 ))
|
||||||
|
done < "$TODOFILE"
|
||||||
|
if [ -f "$tmpfile" ]; then
|
||||||
|
mv "$tmpfile" "$TODOFILE"
|
||||||
|
else
|
||||||
|
> "$TODOFILE"
|
||||||
|
fi
|
||||||
|
echo "Sucessfully removed task number $taskNum"
|
||||||
|
}
|
||||||
|
|
||||||
|
clear_tasks() {
|
||||||
|
> "$TODOFILE"
|
||||||
|
echo "Tasks cleared."
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ ! -f "$TODOFILE" ]; then
|
||||||
|
touch "$TODOFILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
cmd=$(echo "$line" | cut -d' ' -f1)
|
||||||
|
arg=$(echo "$line" | cut -d' ' -f2-)
|
||||||
|
case "$cmd" in
|
||||||
|
add)
|
||||||
|
add_task "$arg"
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
list)
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
remove)
|
||||||
|
remove_task "$arg"
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
clear)
|
||||||
|
clear_tasks
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Add tasks then list"
|
||||||
|
stdin = """add Buy groceries
|
||||||
|
add Walk the dog
|
||||||
|
list"""
|
||||||
|
expected_stdout = """01). Buy groceries
|
||||||
|
01). Buy groceries
|
||||||
|
02). Walk the dog
|
||||||
|
01). Buy groceries
|
||||||
|
02). Walk the dog"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Add, remove, list"
|
||||||
|
stdin = """add First task
|
||||||
|
add Second task
|
||||||
|
add Third task
|
||||||
|
remove 2
|
||||||
|
list"""
|
||||||
|
expected_stdout = """01). First task
|
||||||
|
01). First task
|
||||||
|
02). Second task
|
||||||
|
01). First task
|
||||||
|
02). Second task
|
||||||
|
03). Third task
|
||||||
|
Sucessfully removed task number 2
|
||||||
|
01). First task
|
||||||
|
02). Third task
|
||||||
|
01). First task
|
||||||
|
02). Third task"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Empty list and clear"
|
||||||
|
stdin = """list
|
||||||
|
add Something
|
||||||
|
clear
|
||||||
|
list"""
|
||||||
|
expected_stdout = """No tasks found
|
||||||
|
01). Something
|
||||||
|
Tasks cleared.
|
||||||
|
No tasks found"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Works with pre-existing todo file"
|
||||||
|
stdin = """list
|
||||||
|
add Third item
|
||||||
|
list"""
|
||||||
|
setup_files = { "todo.txt" = "Existing item one\nExisting item two\n" }
|
||||||
|
expected_stdout = """01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
03). Third item
|
||||||
|
01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
03). Third item"""
|
||||||
|
expected_files = { "todo.txt" = "Existing item one\nExisting item two\nThird item\n" }
|
||||||
113
tasks/pipeline/currency_converter.toml
Normal file
113
tasks/pipeline/currency_converter.toml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
name = "currency_converter"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
A currency converter that reads conversion requests from stdin.
|
||||||
|
Each line has the format: AMOUNT FROM TO RATE
|
||||||
|
- AMOUNT: a decimal number (e.g., 12.35)
|
||||||
|
- FROM: 3-letter currency code
|
||||||
|
- TO: 3-letter currency code
|
||||||
|
- RATE: the exchange rate from FROM's base to TO's base
|
||||||
|
|
||||||
|
Some currencies are pegged to others at fixed rates:
|
||||||
|
BAM is pegged to EUR at 1.95583
|
||||||
|
BMD is pegged to USD at 1.0
|
||||||
|
BND is pegged to SGD at 1.0
|
||||||
|
DJF is pegged to USD at 177.721
|
||||||
|
PAB is pegged to USD at 1.0
|
||||||
|
|
||||||
|
When a pegged currency is involved, the conversion must account for the
|
||||||
|
peg coefficient. The formula is: result = amount * (rate / coef_from) * coef_to
|
||||||
|
where coef is the peg ratio (1 if not pegged).
|
||||||
|
|
||||||
|
Output one line per input: "AMOUNT FROM = RESULT TO" with RESULT
|
||||||
|
computed using bc with scale=2.
|
||||||
|
For invalid lines (wrong field count or non-numeric amount), output "ERROR: <original line>".
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
pegged_to() {
|
||||||
|
case "$1" in
|
||||||
|
BAM) echo "EUR:1.95583" ;;
|
||||||
|
BMD) echo "USD:1.0" ;;
|
||||||
|
BND) echo "SGD:1.0" ;;
|
||||||
|
DJF) echo "USD:177.721" ;;
|
||||||
|
PAB) echo "USD:1.0" ;;
|
||||||
|
*) echo "NONE:1" ;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Skip empty lines
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Parse fields
|
||||||
|
set -- $line
|
||||||
|
if [[ $# -ne 4 ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
amount=$1
|
||||||
|
from=$2
|
||||||
|
to=$3
|
||||||
|
rate=$4
|
||||||
|
|
||||||
|
# Validate amount is numeric
|
||||||
|
if [[ ! "$amount" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Validate rate is numeric
|
||||||
|
if [[ ! "$rate" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Get peg info
|
||||||
|
peg_from=$(pegged_to "$from")
|
||||||
|
coef_from=$(echo "$peg_from" | cut -d: -f2)
|
||||||
|
|
||||||
|
peg_to=$(pegged_to "$to")
|
||||||
|
coef_to=$(echo "$peg_to" | cut -d: -f2)
|
||||||
|
|
||||||
|
# Calculate: result = amount * (rate / coef_from) * coef_to
|
||||||
|
result=$(echo "scale=8; $amount * ($rate / $coef_from) * $coef_to" | bc)
|
||||||
|
result=$(printf "%.2f" "$result")
|
||||||
|
|
||||||
|
echo "$amount $from = $result $to"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Standard conversion with direct rate"
|
||||||
|
stdin = """100 USD EUR 0.92
|
||||||
|
50 GBP JPY 188.50"""
|
||||||
|
expected_stdout = """100 USD = 92.00 EUR
|
||||||
|
50 GBP = 9425.00 JPY"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Pegged currency conversions"
|
||||||
|
stdin = """100 BAM USD 1.08
|
||||||
|
200 BMD EUR 0.92
|
||||||
|
50 USD DJF 1.0"""
|
||||||
|
expected_stdout = """100 BAM = 55.22 USD
|
||||||
|
200 BMD = 184.00 EUR
|
||||||
|
50 USD = 8886.05 DJF"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Invalid input lines"
|
||||||
|
stdin = """abc EUR USD 0.92
|
||||||
|
100 USD
|
||||||
|
100 EUR USD 0.85"""
|
||||||
|
expected_stdout = """ERROR: abc EUR USD 0.92
|
||||||
|
ERROR: 100 USD
|
||||||
|
100 EUR = 85.00 USD"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Pegged-to-pegged conversion"
|
||||||
|
stdin = "100 BAM BMD 1.08"
|
||||||
|
expected_stdout = "100 BAM = 55.22 BMD"
|
||||||
76
tasks/pipeline/locale_weather_url.toml
Normal file
76
tasks/pipeline/locale_weather_url.toml
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
name = "locale_weather_url"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Construct weather API URLs from locale and location information.
|
||||||
|
Read lines from stdin in the format: LANG_CODE LOCATION
|
||||||
|
where LANG_CODE is a 2-letter locale (e.g., "en", "fr", "de")
|
||||||
|
and LOCATION is a city/place name (may contain spaces).
|
||||||
|
|
||||||
|
For each line, construct a URL in the format:
|
||||||
|
https://LANG.wttr.in/LOCATION
|
||||||
|
|
||||||
|
Where spaces in the location are replaced with "+" characters.
|
||||||
|
|
||||||
|
If LANG_CODE is empty or invalid (not exactly 2 lowercase letters),
|
||||||
|
default to "en".
|
||||||
|
|
||||||
|
Skip empty lines. Output one URL per input line.
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Extract lang code (first field)
|
||||||
|
lang=$(echo "$line" | awk '{print $1}')
|
||||||
|
|
||||||
|
# Extract location (everything after first field)
|
||||||
|
location=$(echo "$line" | sed 's/^[^ ]* *//')
|
||||||
|
|
||||||
|
# Validate lang code: must be exactly 2 lowercase letters
|
||||||
|
if [[ ! "$lang" =~ ^[a-z]{2}$ ]]; then
|
||||||
|
lang="en"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# If location is same as lang (single-word line), skip
|
||||||
|
if [[ "$location" == "$lang" || -z "$location" ]]; then
|
||||||
|
location=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Replace spaces with +
|
||||||
|
location=$(echo "$location" | tr ' ' '+')
|
||||||
|
|
||||||
|
echo "https://$lang.wttr.in/$location"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Various locales and locations"
|
||||||
|
stdin = """en New York
|
||||||
|
fr Paris
|
||||||
|
de Berlin
|
||||||
|
ja Tokyo"""
|
||||||
|
expected_stdout = """https://en.wttr.in/New+York
|
||||||
|
https://fr.wttr.in/Paris
|
||||||
|
https://de.wttr.in/Berlin
|
||||||
|
https://ja.wttr.in/Tokyo"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Multi-word locations"
|
||||||
|
stdin = """en San Francisco
|
||||||
|
es Buenos Aires
|
||||||
|
pt Rio de Janeiro"""
|
||||||
|
expected_stdout = """https://en.wttr.in/San+Francisco
|
||||||
|
https://es.wttr.in/Buenos+Aires
|
||||||
|
https://pt.wttr.in/Rio+de+Janeiro"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Invalid or missing locale defaults to en"
|
||||||
|
stdin = """ENG London
|
||||||
|
123 Moscow
|
||||||
|
x Rome"""
|
||||||
|
expected_stdout = """https://en.wttr.in/London
|
||||||
|
https://en.wttr.in/Moscow
|
||||||
|
https://en.wttr.in/Rome"""
|
||||||
86
tasks/pipeline/network_info_parser.toml
Normal file
86
tasks/pipeline/network_info_parser.toml
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
name = "network_info_parser"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Parse network interface configuration from stdin (in "ip addr show" format)
|
||||||
|
and extract a summary of each interface.
|
||||||
|
|
||||||
|
For each interface block, output a line:
|
||||||
|
IFACE: <name> IP: <ipv4_addr> MASK: /<prefix_len>
|
||||||
|
|
||||||
|
An interface block starts with a line like:
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 ...
|
||||||
|
and contains inet lines like:
|
||||||
|
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
|
||||||
|
|
||||||
|
If an interface has no inet line, output:
|
||||||
|
IFACE: <name> IP: none MASK: none
|
||||||
|
|
||||||
|
Skip the loopback interface (lo).
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
current_iface=""
|
||||||
|
found_ip=""
|
||||||
|
found_mask=""
|
||||||
|
|
||||||
|
flush_iface() {
|
||||||
|
if [[ -n "$current_iface" && "$current_iface" != "lo" ]]; then
|
||||||
|
if [[ -n "$found_ip" ]]; then
|
||||||
|
echo "IFACE: $current_iface IP: $found_ip MASK: /$found_mask"
|
||||||
|
else
|
||||||
|
echo "IFACE: $current_iface IP: none MASK: none"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Detect interface line: starts with a number followed by colon
|
||||||
|
if echo "$line" | grep -qE '^[0-9]+:'; then
|
||||||
|
flush_iface
|
||||||
|
current_iface=$(echo "$line" | awk -F: '{print $2}' | sed 's/^[[:space:]]*//' | awk '{print $1}')
|
||||||
|
found_ip=""
|
||||||
|
found_mask=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Detect inet line (IPv4 only, not inet6)
|
||||||
|
if echo "$line" | grep -qE '^[[:space:]]+inet [0-9]'; then
|
||||||
|
ip_cidr=$(echo "$line" | awk '{print $2}')
|
||||||
|
found_ip=$(echo "$ip_cidr" | cut -d/ -f1)
|
||||||
|
found_mask=$(echo "$ip_cidr" | cut -d/ -f2)
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
flush_iface
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Two interfaces with IPs"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
|
||||||
|
3: wlan0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 10.0.0.42/16 brd 10.0.255.255 scope global wlan0"""
|
||||||
|
expected_stdout = """IFACE: eth0 IP: 192.168.1.100 MASK: /24
|
||||||
|
IFACE: wlan0 IP: 10.0.0.42 MASK: /16"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Interface with no IP"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
3: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0"""
|
||||||
|
expected_stdout = """IFACE: eth0 IP: none MASK: none
|
||||||
|
IFACE: docker0 IP: 172.17.0.1 MASK: /16"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Single interface"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: enp3s0: <BROADCAST,MULTICAST,UP> mtu 9000
|
||||||
|
inet 10.10.10.5/8 brd 10.255.255.255 scope global enp3s0"""
|
||||||
|
expected_stdout = "IFACE: enp3s0 IP: 10.10.10.5 MASK: /8"
|
||||||
79
tasks/pipeline/url_normalizer.toml
Normal file
79
tasks/pipeline/url_normalizer.toml
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
   it must have a protocol (http:// or https://), followed by at least
   one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.

Skip empty lines silently.
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Skip empty lines
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Trim whitespace
|
||||||
|
url=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
|
||||||
|
[[ -z "$url" ]] && continue
|
||||||
|
|
||||||
|
original="$url"
|
||||||
|
|
||||||
|
# Check if it already has https://
|
||||||
|
prefix8=$(echo "$url" | cut -c1-8)
|
||||||
|
if [[ "$prefix8" == "https://" ]]; then
|
||||||
|
normalized="$url"
|
||||||
|
else
|
||||||
|
prefix7=$(echo "$url" | cut -c1-7)
|
||||||
|
if [[ "$prefix7" == "http://" ]]; then
|
||||||
|
normalized="$url"
|
||||||
|
else
|
||||||
|
normalized="http://$url"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Validate: protocol + something.something
|
||||||
|
if echo "$normalized" | grep -qE '^https?://[^/]+\.[^/]+'; then
|
||||||
|
echo "$normalized"
|
||||||
|
else
|
||||||
|
echo "INVALID: $original"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
|
||||||
|
|
||||||
|
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
|
||||||
|
|
||||||
|
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path

api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""
|
||||||
Reference in New Issue
Block a user