Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions

View File

@@ -13,6 +13,7 @@ class Config:
timeout_seconds: float = 10.0
normalize_whitespace: bool = True
output_dir: Path = Path("results")
max_workers: int = 4
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
@classmethod
@@ -35,5 +36,6 @@ class Config:
timeout_seconds=agent.get("timeout_seconds", 10.0),
normalize_whitespace=agent.get("normalize_whitespace", True),
output_dir=Path(results.get("output_dir", "results")),
max_workers=agent.get("max_workers", 4),
provider_configs=provider_configs,
)

View File

@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
from .models import BenchmarkResult
from .report import (
LIKERT_QUESTIONS,
_get_freeform,
_get_likert_scores,
_parse_likert,
load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
"""Return {question_key: {bash: avg, lush: avg}}."""
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
avgs = _aggregate_likert(results)
labels = [label for _, label in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(8, 4.5))
fig, ax = plt.subplots(figsize=(8, 7))
y = range(len(labels))
bar_h = 0.35
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
"""Heatmap showing lush-minus-bash score diff per task and metric."""
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
tasks = [r.task_name for r in results]
data: list[list[float]] = []
for r in results:
scores = _get_likert_scores(r)
row = []
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
b = scores[key]["bash"]
l = scores[key]["lush"]
if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
row.append(0.0)
data.append(row)
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels(tasks, fontsize=8)
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
for j in range(len(labels)):
val = data[i][j]
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
ax.text(j, i, text, ha="center", va="center", fontsize=8,
ax.text(j, i, text, ha="center", va="center", fontsize=7,
color="white" if abs(val) >= 2 else "black")
ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
import numpy as np
from collections import defaultdict
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
by_cat[r.category].append(r)
charts: list[tuple[str, str]] = []
labels = [label for _, label in LIKERT_QUESTIONS]
labels = [label for _, label, _ in LIKERT_QUESTIONS]
for cat in sorted(by_cat):
cat_results = by_cat[cat]
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
if val is not None:
agg[key][lang].append(val)
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
fig, ax = plt.subplots(figsize=(6, 3.5))
fig, ax = plt.subplots(figsize=(7, 5))
y = range(len(labels))
bar_h = 0.35
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
score_rows = []
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
f'<td>{b_str}</td><td>{l_str}</td>'
f'<td class="{d_cls}">{d_str}</td></tr>')
obs = _get_freeform(r)
obs_html = ""
for lang, text in obs.items():
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
sections.append(f"""
<div class="task-detail">
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
<tbody>{"".join(score_rows)}</tbody>
</table>
<div class="observations">{obs_html}</div>
</div>""")
return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
.scores {{ width: auto; }}
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
.scores th:nth-child(n+2) {{ text-align: center; }}
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
.observations p {{ margin-bottom: 6px; }}
</style>
</head>
<body>

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import os
import threading
import time
from typing import Any
import anthropic
@@ -17,8 +19,17 @@ class AnthropicProvider:
self._client = anthropic.Anthropic(api_key=api_key)
self._model = config.get("model", "claude-sonnet-4-20250514")
self._max_tokens = config.get("max_tokens", 4096)
self._min_request_interval = config.get("min_request_interval", 0.1)
self._last_request_time = 0.0
self._lock = threading.Lock()
def send(self, messages: list[Message], system: str = "") -> str:
with self._lock:
elapsed = time.monotonic() - self._last_request_time
if elapsed < self._min_request_interval:
time.sleep(self._min_request_interval - elapsed)
self._last_request_time = time.monotonic()
api_messages = [{"role": m.role, "content": m.content} for m in messages]
kwargs: dict[str, Any] = {
"model": self._model,

View File

@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
QUESTIONS = [
{
"question": "Readability: The solution is easy to read and understand",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Conciseness: The solution required minimal boilerplate",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Error handling: Error handling was straightforward",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Overall preference: I would prefer this language for similar tasks",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
{
"question": "Learning curve: An unfamiliar developer could understand the solution quickly",
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
},
# Syntax & Readability
{"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
{"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
{"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
# Expressiveness
{"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
{"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
{"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
# Data & I/O
{"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
{"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
# Error Handling
{"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
{"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
# Overall
{"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
{"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
]
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
def build_questionnaire_prompt(
task_name: str,
language: str,
solution_code: str,
) -> str:
choices_str = ", ".join(f'"{c}"' for c in CHOICES)
questions_text = ""
for i, q in enumerate(QUESTIONS, 1):
choices_str = ", ".join(f'"{c}"' for c in q["choices"])
questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
for q in QUESTIONS:
questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
{solution_code}
```
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}
[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
{questions_text}]"""
def _extract_int(value: str) -> int | None:
"""Extract leading digit from a response like '4 - Agree'."""
s = value.strip()
if s and s[0].isdigit():
return int(s[0])
return None
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
results = []
for item in data:
question_id = item.get("id", item.get("question", ""))
raw_selected = item.get("selected", "")
# Normalize to int
if isinstance(raw_selected, int):
selected: int | str = raw_selected
else:
parsed = _extract_int(str(raw_selected))
selected = parsed if parsed is not None else raw_selected
results.append(
QuestionnaireResponse(
question=item.get("question", ""),
selected=item.get("selected", ""),
choices=item.get("choices"),
question=question_id,
selected=selected,
)
)
return results

View File

@@ -5,16 +5,32 @@ from pathlib import Path
from .models import BenchmarkResult
# Likert questions in order (must match questionnaire.py QUESTIONS)
# New 12-item question list: (key, label, dimension)
LIKERT_QUESTIONS = [
("Readability", "Readability"),
("Expressiveness", "Expressiveness"),
("Conciseness", "Conciseness"),
("Error handling", "Error handling"),
("Overall preference", "Overall preference"),
("Learning curve", "Learning curve"),
("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
("builtin_ops", "Built-in operations", "Expressiveness"),
("string_ops", "String operations", "Expressiveness"),
("composition", "Composition", "Expressiveness"),
("io_ergonomics", "I/O ergonomics", "Data & I/O"),
("data_structures", "Data structures", "Data & I/O"),
("error_model", "Error model", "Error Handling"),
("edge_case_support", "Edge case support", "Error Handling"),
("learnability", "Learnability", "Overall"),
("fitness", "Fitness for task", "Overall"),
]
# Back-compat: map the 6 legacy question keys (matched as question-text
# prefixes in old results) to the new question ids, so existing saved
# results still render under the 12-item scheme.
# NOTE(review): "signal_to_noise" appears under both "Readability" and
# "Conciseness"; consumers only fill a new key while it is still None,
# so whichever legacy answer is encountered first wins — confirm that
# precedence is intended.
LEGACY_KEY_MAP = {
    "Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
    "Expressiveness": ["builtin_ops", "string_ops", "composition"],
    "Conciseness": ["signal_to_noise"],
    "Error handling": ["error_model", "edge_case_support"],
    "Overall preference": ["fitness"],
    "Learning curve": ["learnability"],
}
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
"""Load results, keeping only the latest run per task name."""
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
def _parse_likert(selected: str | int) -> int | None:
"""Extract numeric value from a likert response like '4 - Agree'."""
"""Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
if isinstance(selected, int):
return selected
s = str(selected).strip()
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
Handles both new-format results (exact id match) and legacy results (startswith match
mapped to new keys).
"""
scores: dict[str, dict[str, float | None]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
scores[key] = {"bash": None, "lush": None}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
for key, _ in LIKERT_QUESTIONS:
if q.question.startswith(key):
# Try exact match on new question ids
if q.question in scores:
val = _parse_likert(q.selected)
if val is not None:
scores[key][lang_name] = float(val)
scores[q.question][lang_name] = float(val)
continue
# Legacy: map old key to new keys (spread the score)
for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
if q.question.startswith(legacy_prefix):
val = _parse_likert(q.selected)
if val is not None:
for nk in new_keys:
if scores[nk][lang_name] is None:
scores[nk][lang_name] = float(val)
break
return scores
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
return "\u2588" * filled + "\u2591" * (width - filled)
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
"""Extract free-form observations per language."""
obs: dict[str, str] = {}
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
if not lang_result:
continue
for q in lang_result.questionnaire:
if q.question.startswith("Free-form"):
obs[lang_name] = str(q.selected)
break
return obs
def render_summary_table(results: list[BenchmarkResult]) -> str:
"""Render the pass/fail + turns overview table."""
lines: list[str] = []
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
"""Render aggregated questionnaire scores with bar charts."""
"""Render aggregated questionnaire scores with bar charts, grouped by dimension."""
lines: list[str] = []
lines.append("=" * 78)
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
# Aggregate scores across all tasks
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in results:
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
# Group by dimension
current_dim = None
for key, label, dimension in LIKERT_QUESTIONS:
if dimension != current_dim:
if current_dim is not None:
lines.append("")
lines.append(f" [{dimension}]")
current_dim = dimension
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -154,7 +179,6 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
lines.append(f" {label}")
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
lines.append("")
# Overall average
all_bash = [v for key in agg for v in agg[key]["bash"]]
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
diff = l_overall - b_overall
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
lines.append("")
lines.append(" " + "-" * 50)
lines.append(f" Overall average")
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
lines.append(f" {cat}")
agg: dict[str, dict[str, list[float]]] = {}
for key, _ in LIKERT_QUESTIONS:
for key, _, _ in LIKERT_QUESTIONS:
agg[key] = {"bash": [], "lush": []}
for r in cat_results:
scores = _get_likert_scores(r)
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
if val is not None:
agg[key][lang].append(val)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_vals = agg[key]["bash"]
l_vals = agg[key]["lush"]
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
scores = _get_likert_scores(r)
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
lines.append(" " + "-" * 40)
for key, label in LIKERT_QUESTIONS:
for key, label, _ in LIKERT_QUESTIONS:
b_val = scores[key]["bash"]
l_val = scores[key]["lush"]
b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
d_str = "-"
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
# Free-form observations
obs = _get_freeform(r)
if obs:
lines.append("")
for lang, text in obs.items():
# Wrap long text
wrapped = text[:120] + ("..." if len(text) > 120 else "")
lines.append(f" {lang}: {wrapped}")
lines.append("")
return "\n".join(lines)

102
main.py
View File

@@ -2,7 +2,9 @@ from __future__ import annotations
import argparse
import sys
import threading
import tomllib
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
@@ -16,6 +18,8 @@ from lush_bench.export import export_html
from lush_bench.report import render_report
from lush_bench.results import save_result
_print_lock = threading.Lock()
PROVIDERS = {
"anthropic": AnthropicProvider,
@@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None:
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
def cmd_run(args: argparse.Namespace) -> None:
config = Config.load()
task_path = Path(args.task)
def _log(msg: str) -> None:
    """Serialize console output across worker threads via the print lock."""
    _print_lock.acquire()
    try:
        print(msg)
    finally:
        _print_lock.release()
def _run_task(
task_path: Path,
provider_name: str,
config: Config,
provider: AnthropicProvider | None = None,
) -> BenchmarkResult:
"""Core task runner. Thread-safe — usable from cmd_run or a thread pool."""
task = load_task(task_path)
provider_name = args.provider
if provider_name not in PROVIDERS:
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
sys.exit(1)
if provider is None:
provider_config = config.provider_configs.get(provider_name, {})
provider = PROVIDERS[provider_name](provider_config)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
_log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
bash_result = None
lush_result = None
if task.mode == "solve":
# Solve mode: agent writes code in both languages
print(" Solving in bash...")
_log(f" [{task.name}] Solving in bash...")
bash_result = solve_task(provider, task, "bash", config)
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
_log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
print(" Solving in lush...")
_log(f" [{task.name}] Solving in lush...")
lush_result = solve_task(provider, task, "lush", config)
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
elif task.mode == "convert":
# Convert mode: verify provided bash source directly, then convert to lush
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
print(" Verifying provided bash source...")
_log(f" [{task.name}] Verifying provided bash source...")
test_results = evaluate(task, task.bash_source, "bash", config)
all_passed = all(tr.passed for tr in test_results)
bash_result = LanguageResult(
@@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None:
all_passed=all_passed,
agent_turns=0,
)
print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
_log(f" [{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}")
print(" Converting to lush...")
_log(f" [{task.name}] Converting to lush...")
lush_result = solve_task(provider, task, "lush", config)
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
# Run questionnaire for each completed language
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
if result and result.solution_code:
print(f" Questionnaire for {lang}...")
_log(f" [{task.name}] Questionnaire for {lang}...")
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
benchmark = BenchmarkResult(
@@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None:
)
result_dir = save_result(benchmark, config.output_dir)
print(f" Results saved to {result_dir}")
_log(f" [{task.name}] Results saved to {result_dir}")
return benchmark
def cmd_run(args: argparse.Namespace) -> None:
    """CLI entry point: run a single benchmark task with the chosen provider."""
    config = Config.load()
    name = args.provider
    if name not in PROVIDERS:
        print(f"Unknown provider: {name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)
    # Build the provider here so _run_task can reuse a shared instance elsewhere.
    provider = PROVIDERS[name](config.provider_configs.get(name, {}))
    _run_task(Path(args.task), name, config, provider)
def cmd_run_all(args: argparse.Namespace) -> None:
config = Config.load()
paths = find_tasks(args.category, getattr(args, "mode", None))
if not paths:
print("No tasks found.")
return
for p in paths:
# Reuse cmd_run by constructing a namespace
run_args = argparse.Namespace(task=str(p), provider=args.provider)
cmd_run(run_args)
print()
provider_name = args.provider
if provider_name not in PROVIDERS:
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
sys.exit(1)
# Share one provider instance across threads (its rate limiter is thread-safe)
provider_config = config.provider_configs.get(provider_name, {})
provider = PROVIDERS[provider_name](provider_config)
max_workers = args.workers if args.workers is not None else config.max_workers
print(f"Running {len(paths)} tasks with {max_workers} workers")
failed: list[str] = []
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = {
pool.submit(_run_task, p, provider_name, config, provider): p
for p in paths
}
for future in as_completed(futures):
task_path = futures[future]
try:
future.result()
except Exception as exc:
task_name = task_path.stem
failed.append(task_name)
_log(f" [{task_name}] FAILED: {exc}")
print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.")
if failed:
print(f"Failed: {', '.join(failed)}")
def cmd_report(args: argparse.Namespace) -> None:
@@ -183,6 +232,7 @@ def main() -> None:
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
ra.add_argument("--provider", default="anthropic", help="LLM provider")
ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)")
ra.set_defaults(func=cmd_run_all)
# report

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,97 @@
# Benchmark task: path normalizer (convert mode).
# In convert mode the bash_source below is verified directly against the
# [[test_cases]], then the agent is asked to convert it to lush.
name = "path_normalizer"
category = "environment"
mode = "convert"
description = """
Read file paths from stdin, one per line. Normalize each path:
1. Replace a leading "~" with the value of $HOME.
2. Remove trailing slashes (except for root "/").
3. Collapse consecutive slashes into one.
4. Resolve "." components (remove them).
5. Resolve ".." components (go up one directory level).
Output the cleaned path, one per line.
Skip empty lines.
"""
# Reference implementation. NOTE(review): this is a TOML literal string
# (''' ... '''), so backslashes are not escape-processed by the TOML
# parser — confirm the doubled backslash in the sed pattern below is what
# the shell is meant to receive.
bash_source = '''
#!/bin/bash
while IFS= read -r line || [[ -n "$line" ]]; do
[[ -z "$line" ]] && continue
# Expand tilde
path=$(echo "$line" | sed "s:^~:$HOME:")
# Collapse multiple slashes
path=$(echo "$path" | sed 's:/\\+:/:g')
# Remove trailing slash (but keep root)
path=$(echo "$path" | sed 's:/$::')
[[ -z "$path" ]] && path="/"
# Resolve . and .. components
IFS='/' read -ra parts <<< "$path"
result=()
absolute=""
if [[ "$path" == /* ]]; then
absolute="/"
fi
for part in "${parts[@]}"; do
if [[ "$part" == "." || "$part" == "" ]]; then
continue
elif [[ "$part" == ".." ]]; then
if [[ ${#result[@]} -gt 0 && "${result[-1]}" != ".." ]]; then
unset 'result[${#result[@]}-1]'
elif [[ -z "$absolute" ]]; then
result+=("..")
fi
else
result+=("$part")
fi
done
if [[ -n "$absolute" ]]; then
final="/"
IFS='/'; final+="${result[*]}"
else
IFS='/'; final="${result[*]}"
fi
[[ -z "$final" ]] && final="/"
echo "$final"
done
'''
[[test_cases]]
description = "Tilde expansion and trailing slashes"
stdin = """~/Documents/
~/projects/code
/var/log/"""
expected_stdout = """/Users/testuser/Documents
/Users/testuser/projects/code
/var/log"""
# HOME is pinned so tilde expansion is deterministic across machines.
env = { "HOME" = "/Users/testuser" }
[[test_cases]]
description = "Resolving . and .. components"
stdin = """/usr/local/./bin
/home/user/../shared/docs
/a/b/c/../../d"""
expected_stdout = """/usr/local/bin
/home/shared/docs
/a/d"""
[[test_cases]]
description = "Collapsing multiple slashes"
stdin = """/usr//local///bin
/var/log//syslog"""
expected_stdout = """/usr/local/bin
/var/log/syslog"""
[[test_cases]]
# Covers the root-stays-root rule and relative inputs that must not be
# anchored to "/".
description = "Root and relative paths"
stdin = """/
./config/settings
../parent/child"""
expected_stdout = """/
config/settings
../parent/child"""

View File

@@ -0,0 +1,150 @@
# Benchmark task: todo list manager (convert mode).
# In convert mode the bash_source below is verified directly against the
# [[test_cases]], then the agent is asked to convert it to lush.
# Fix: "Sucessfully" -> "Successfully" in the removal message; the
# matching expected_stdout fixture below is updated in lockstep so the
# reference bash still passes its own tests.
name = "todo_manager"
category = "filesystem"
mode = "convert"
description = """
A simple todo list manager. Read commands from stdin, one per line:
add <task text> — append the task to the todo file
list — print all tasks numbered as "NN). <task>"
remove <N> — remove task number N (1-based)
clear — remove all tasks
The todo file is "todo.txt" in the working directory.
When listing, pad task numbers to two digits (01, 02, …).
After "add" or "remove", automatically list the remaining tasks.
If the list is empty, print "No tasks found".
"""
# Reference implementation. NOTE(review): this is a TOML literal string
# (''' ... '''), so the TOML parser does not process escapes — confirm
# IFS=$'\\n' reaches bash exactly as intended in the shipped file.
bash_source = '''
#!/bin/bash
TODOFILE="./todo.txt"
list_tasks() {
if [ -f "$TODOFILE" ] && [ -s "$TODOFILE" ]; then
count=1
IFS=$'\\n'
while read -r task; do
num=$count
if [ $count -lt 10 ]; then num="0$count"; fi
echo "$num). $task"
count=$(( count + 1 ))
done < "$TODOFILE"
else
echo "No tasks found"
fi
}
add_task() {
echo "$1" >> "$TODOFILE"
}
remove_task() {
taskNum=$1
totalLines=$(wc -l < "$TODOFILE" | tr -d ' ')
if [ "$taskNum" -gt "$totalLines" ] 2>/dev/null; then
echo "Error: task number $taskNum does not exist!"
return 1
fi
tmpfile="./todo_tmp.txt"
count=1
IFS=$'\\n'
while read -r task; do
if [ "$count" -ne "$taskNum" ]; then
echo "$task" >> "$tmpfile"
fi
count=$(( count + 1 ))
done < "$TODOFILE"
if [ -f "$tmpfile" ]; then
mv "$tmpfile" "$TODOFILE"
else
> "$TODOFILE"
fi
echo "Successfully removed task number $taskNum"
}
clear_tasks() {
> "$TODOFILE"
echo "Tasks cleared."
}
if [ ! -f "$TODOFILE" ]; then
touch "$TODOFILE"
fi
while IFS= read -r line || [[ -n "$line" ]]; do
cmd=$(echo "$line" | cut -d' ' -f1)
arg=$(echo "$line" | cut -d' ' -f2-)
case "$cmd" in
add)
add_task "$arg"
list_tasks
;;
list)
list_tasks
;;
remove)
remove_task "$arg"
list_tasks
;;
clear)
clear_tasks
;;
esac
done
'''
# Each "add"/"remove" auto-lists afterwards, so expected output repeats
# the growing list after every mutating command.
[[test_cases]]
description = "Add tasks then list"
stdin = """add Buy groceries
add Walk the dog
list"""
expected_stdout = """01). Buy groceries
01). Buy groceries
02). Walk the dog
01). Buy groceries
02). Walk the dog"""
[[test_cases]]
description = "Add, remove, list"
stdin = """add First task
add Second task
add Third task
remove 2
list"""
expected_stdout = """01). First task
01). First task
02). Second task
01). First task
02). Second task
03). Third task
Successfully removed task number 2
01). First task
02). Third task
01). First task
02). Third task"""
[[test_cases]]
description = "Empty list and clear"
stdin = """list
add Something
clear
list"""
expected_stdout = """No tasks found
01). Something
Tasks cleared.
No tasks found"""
[[test_cases]]
description = "Works with pre-existing todo file"
stdin = """list
add Third item
list"""
setup_files = { "todo.txt" = "Existing item one\nExisting item two\n" }
expected_stdout = """01). Existing item one
02). Existing item two
01). Existing item one
02). Existing item two
03). Third item
01). Existing item one
02). Existing item two
03). Third item"""
expected_files = { "todo.txt" = "Existing item one\nExisting item two\nThird item\n" }

View File

@@ -0,0 +1,113 @@
name = "currency_converter"
category = "pipeline"
mode = "convert"
description = """
A currency converter that reads conversion requests from stdin.
Each line has the format: AMOUNT FROM TO RATE
- AMOUNT: a decimal number (e.g., 12.35)
- FROM: 3-letter currency code
- TO: 3-letter currency code
- RATE: the exchange rate from FROM's base to TO's base
Some currencies are pegged to others at fixed rates:
BAM is pegged to EUR at 1.95583
BMD is pegged to USD at 1.0
BND is pegged to SGD at 1.0
DJF is pegged to USD at 177.721
PAB is pegged to USD at 1.0
When a pegged currency is involved, the conversion must account for the
peg coefficient. The formula is: result = amount * (rate / coef_from) * coef_to
where coef is the peg ratio (1 if not pegged).
Output one line per input: "AMOUNT FROM = RESULT TO" with RESULT
rounded to two decimal places.
For invalid lines (wrong field count, non-numeric amount, or non-numeric rate), output "ERROR: <original line>".
"""
bash_source = '''
#!/bin/bash
# Look up the peg anchor for a currency code.
# Prints "ANCHOR:RATIO" for pegged currencies, "NONE:1" for everything else.
pegged_to() {
local code="$1"
if [ "$code" = "BAM" ]; then
echo "EUR:1.95583"
elif [ "$code" = "BMD" ]; then
echo "USD:1.0"
elif [ "$code" = "BND" ]; then
echo "SGD:1.0"
elif [ "$code" = "DJF" ]; then
echo "USD:177.721"
elif [ "$code" = "PAB" ]; then
echo "USD:1.0"
else
echo "NONE:1"
fi
}
# Main loop: one conversion request per stdin line, format "AMOUNT FROM TO RATE".
while IFS= read -r line || [[ -n "$line" ]]; do
# Skip empty lines
[[ -z "$line" ]] && continue
# Split the line into positional parameters (relies on unquoted word splitting)
set -- $line
# Exactly four whitespace-separated fields are required
if [[ $# -ne 4 ]]; then
echo "ERROR: $line"
continue
fi
amount=$1
from=$2
to=$3
rate=$4
# Validate amount is a non-negative decimal number
if [[ ! "$amount" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
echo "ERROR: $line"
continue
fi
# Validate rate the same way
if [[ ! "$rate" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
echo "ERROR: $line"
continue
fi
# Look up peg coefficients; pegged_to prints "ANCHOR:COEF" ("NONE:1" if unpegged)
peg_from=$(pegged_to "$from")
coef_from=$(echo "$peg_from" | cut -d: -f2)
peg_to=$(pegged_to "$to")
coef_to=$(echo "$peg_to" | cut -d: -f2)
# Calculate: result = amount * (rate / coef_from) * coef_to
# bc runs at scale=8 for precision; printf then rounds to 2 decimal places
result=$(echo "scale=8; $amount * ($rate / $coef_from) * $coef_to" | bc)
result=$(printf "%.2f" "$result")
echo "$amount $from = $result $to"
done
'''
[[test_cases]]
description = "Standard conversion with direct rate"
stdin = """100 USD EUR 0.92
50 GBP JPY 188.50"""
expected_stdout = """100 USD = 92.00 EUR
50 GBP = 9425.00 JPY"""
[[test_cases]]
description = "Pegged currency conversions"
stdin = """100 BAM USD 1.08
200 BMD EUR 0.92
50 USD DJF 1.0"""
expected_stdout = """100 BAM = 55.22 USD
200 BMD = 184.00 EUR
50 USD = 8886.05 DJF"""
[[test_cases]]
description = "Invalid input lines"
stdin = """abc EUR USD 0.92
100 USD
100 EUR USD 0.85"""
expected_stdout = """ERROR: abc EUR USD 0.92
ERROR: 100 USD
100 EUR = 85.00 USD"""
[[test_cases]]
description = "Pegged-to-pegged conversion"
stdin = "100 BAM BMD 1.08"
expected_stdout = "100 BAM = 55.22 BMD"

View File

@@ -0,0 +1,76 @@
name = "locale_weather_url"
category = "pipeline"
mode = "convert"
description = """
Construct weather API URLs from locale and location information.
Read lines from stdin in the format: LANG_CODE LOCATION
where LANG_CODE is a 2-letter locale (e.g., "en", "fr", "de")
and LOCATION is a city/place name (may contain spaces).
For each line, construct a URL in the format:
https://LANG.wttr.in/LOCATION
Where spaces in the location are replaced with "+" characters.
If LANG_CODE is empty or invalid (not exactly 2 lowercase letters),
default to "en".
Skip empty lines. Output one URL per input line.
"""
bash_source = '''
#!/bin/bash
# Build wttr.in weather URLs from "LANG LOCATION" lines on stdin.
while IFS= read -r line || [[ -n "$line" ]]; do
[[ -z "$line" ]] && continue
# Extract lang code (first field)
lang=$(echo "$line" | awk '{print $1}')
# Extract location (everything after the first field)
location=$(echo "$line" | sed 's/^[^ ]* *//')
# Validate lang code: must be exactly 2 lowercase letters, else default to "en"
if [[ ! "$lang" =~ ^[a-z]{2}$ ]]; then
lang="en"
fi
# NOTE(review): for a single-word line the sed above already leaves $location
# empty, so the == comparison only fires for lines like "en en"; looks like a
# defensive guard against echoing the lang code as a location — confirm intent.
if [[ "$location" == "$lang" || -z "$location" ]]; then
location=""
fi
# Replace spaces with + so the URL stays a single token
location=$(echo "$location" | tr ' ' '+')
echo "https://$lang.wttr.in/$location"
done
'''
[[test_cases]]
description = "Various locales and locations"
stdin = """en New York
fr Paris
de Berlin
ja Tokyo"""
expected_stdout = """https://en.wttr.in/New+York
https://fr.wttr.in/Paris
https://de.wttr.in/Berlin
https://ja.wttr.in/Tokyo"""
[[test_cases]]
description = "Multi-word locations"
stdin = """en San Francisco
es Buenos Aires
pt Rio de Janeiro"""
expected_stdout = """https://en.wttr.in/San+Francisco
https://es.wttr.in/Buenos+Aires
https://pt.wttr.in/Rio+de+Janeiro"""
[[test_cases]]
description = "Invalid or missing locale defaults to en"
stdin = """ENG London
123 Moscow
x Rome"""
expected_stdout = """https://en.wttr.in/London
https://en.wttr.in/Moscow
https://en.wttr.in/Rome"""

View File

@@ -0,0 +1,86 @@
name = "network_info_parser"
category = "pipeline"
mode = "convert"
description = """
Parse network interface configuration from stdin (in "ip addr show" format)
and extract a summary of each interface.
For each interface block, output a line:
IFACE: <name> IP: <ipv4_addr> MASK: /<prefix_len>
An interface block starts with a line like:
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 ...
and contains inet lines like:
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
If an interface has no inet line, output:
IFACE: <name> IP: none MASK: none
Skip the loopback interface (lo).
"""
bash_source = '''
#!/bin/bash
# Summarize "ip addr show"-style input: one line per interface with its
# IPv4 address and prefix length. The loopback interface (lo) is skipped.
current_iface=""
found_ip=""
found_mask=""
# Emit the summary line for the interface gathered so far (if any).
flush_iface() {
if [[ -n "$current_iface" && "$current_iface" != "lo" ]]; then
if [[ -n "$found_ip" ]]; then
echo "IFACE: $current_iface IP: $found_ip MASK: /$found_mask"
else
echo "IFACE: $current_iface IP: none MASK: none"
fi
fi
}
while IFS= read -r line || [[ -n "$line" ]]; do
# Detect interface header line: starts with a number followed by a colon
if echo "$line" | grep -qE '^[0-9]+:'; then
# New block begins: report the previous interface, then reset state
flush_iface
current_iface=$(echo "$line" | awk -F: '{print $2}' | sed 's/^[[:space:]]*//' | awk '{print $1}')
found_ip=""
found_mask=""
fi
# Detect inet line (IPv4 only, not inet6); a later inet line overwrites an earlier one
if echo "$line" | grep -qE '^[[:space:]]+inet [0-9]'; then
ip_cidr=$(echo "$line" | awk '{print $2}')
found_ip=$(echo "$ip_cidr" | cut -d/ -f1)
found_mask=$(echo "$ip_cidr" | cut -d/ -f2)
fi
done
# Report the final interface block (the loop only flushes on the NEXT header)
flush_iface
'''
[[test_cases]]
description = "Two interfaces with IPs"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
3: wlan0: <BROADCAST,MULTICAST,UP> mtu 1500
inet 10.0.0.42/16 brd 10.0.255.255 scope global wlan0"""
expected_stdout = """IFACE: eth0 IP: 192.168.1.100 MASK: /24
IFACE: wlan0 IP: 10.0.0.42 MASK: /16"""
[[test_cases]]
description = "Interface with no IP"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
3: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0"""
expected_stdout = """IFACE: eth0 IP: none MASK: none
IFACE: docker0 IP: 172.17.0.1 MASK: /16"""
[[test_cases]]
description = "Single interface"
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
inet 127.0.0.1/8 scope host lo
2: enp3s0: <BROADCAST,MULTICAST,UP> mtu 9000
inet 10.10.10.5/8 brd 10.255.255.255 scope global enp3s0"""
expected_stdout = "IFACE: enp3s0 IP: 10.10.10.5 MASK: /8"

View File

@@ -0,0 +1,79 @@
name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
it must have a protocol (http:// or https://), followed by at least
one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.
Skip empty lines silently.
"""
bash_source = '''
#!/bin/bash
# Read URLs from stdin, one per line. Ensure each has a protocol prefix,
# then validate the basic "proto://host.tld" shape. Valid URLs are echoed
# normalized; invalid entries are flagged with their trimmed original text.
while IFS= read -r raw || [[ -n "$raw" ]]; do
# Ignore blank input lines entirely.
[[ -z "$raw" ]] && continue
# Strip leading/trailing whitespace before inspecting the URL.
trimmed=$(echo "$raw" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
[[ -z "$trimmed" ]] && continue
# Keep an existing http:// or https:// prefix; otherwise assume http://.
case "$trimmed" in
https://*) candidate="$trimmed" ;;
http://*) candidate="$trimmed" ;;
*) candidate="http://$trimmed" ;;
esac
# Require protocol plus "something.something" before any path slash.
if echo "$candidate" | grep -qE '^https?://[^/]+\.[^/]+'; then
echo "$candidate"
else
echo "INVALID: $trimmed"
fi
done
'''
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path
api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""