Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop free-form question. Responses now stored as ints, not strings. - Back-compat layer maps legacy keys to new dimensions so existing results still render. - Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider. - Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
@@ -13,6 +13,7 @@ class Config:
|
|||||||
timeout_seconds: float = 10.0
|
timeout_seconds: float = 10.0
|
||||||
normalize_whitespace: bool = True
|
normalize_whitespace: bool = True
|
||||||
output_dir: Path = Path("results")
|
output_dir: Path = Path("results")
|
||||||
|
max_workers: int = 4
|
||||||
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -35,5 +36,6 @@ class Config:
|
|||||||
timeout_seconds=agent.get("timeout_seconds", 10.0),
|
timeout_seconds=agent.get("timeout_seconds", 10.0),
|
||||||
normalize_whitespace=agent.get("normalize_whitespace", True),
|
normalize_whitespace=agent.get("normalize_whitespace", True),
|
||||||
output_dir=Path(results.get("output_dir", "results")),
|
output_dir=Path(results.get("output_dir", "results")),
|
||||||
|
max_workers=agent.get("max_workers", 4),
|
||||||
provider_configs=provider_configs,
|
provider_configs=provider_configs,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
|
|||||||
from .models import BenchmarkResult
|
from .models import BenchmarkResult
|
||||||
from .report import (
|
from .report import (
|
||||||
LIKERT_QUESTIONS,
|
LIKERT_QUESTIONS,
|
||||||
_get_freeform,
|
|
||||||
_get_likert_scores,
|
_get_likert_scores,
|
||||||
_parse_likert,
|
_parse_likert,
|
||||||
load_latest_results,
|
load_latest_results,
|
||||||
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
|
|||||||
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
|
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
|
||||||
"""Return {question_key: {bash: avg, lush: avg}}."""
|
"""Return {question_key: {bash: avg, lush: avg}}."""
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in results:
|
for r in results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
|
|||||||
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||||
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
|
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
|
||||||
avgs = _aggregate_likert(results)
|
avgs = _aggregate_likert(results)
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
|
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
|
||||||
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
|
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(8, 4.5))
|
fig, ax = plt.subplots(figsize=(8, 7))
|
||||||
y = range(len(labels))
|
y = range(len(labels))
|
||||||
bar_h = 0.35
|
bar_h = 0.35
|
||||||
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||||
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||||
"""Heatmap showing lush-minus-bash score diff per task and metric."""
|
"""Heatmap showing lush-minus-bash score diff per task and metric."""
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
tasks = [r.task_name for r in results]
|
tasks = [r.task_name for r in results]
|
||||||
|
|
||||||
data: list[list[float]] = []
|
data: list[list[float]] = []
|
||||||
for r in results:
|
for r in results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
row = []
|
row = []
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
b = scores[key]["bash"]
|
b = scores[key]["bash"]
|
||||||
l = scores[key]["lush"]
|
l = scores[key]["lush"]
|
||||||
if b is not None and l is not None:
|
if b is not None and l is not None:
|
||||||
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
|||||||
row.append(0.0)
|
row.append(0.0)
|
||||||
data.append(row)
|
data.append(row)
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
|
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
|
||||||
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
|
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
|
||||||
|
|
||||||
ax.set_xticks(range(len(labels)))
|
ax.set_xticks(range(len(labels)))
|
||||||
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
|
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
|
||||||
ax.set_yticks(range(len(tasks)))
|
ax.set_yticks(range(len(tasks)))
|
||||||
ax.set_yticklabels(tasks, fontsize=8)
|
ax.set_yticklabels(tasks, fontsize=8)
|
||||||
|
|
||||||
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
|||||||
for j in range(len(labels)):
|
for j in range(len(labels)):
|
||||||
val = data[i][j]
|
val = data[i][j]
|
||||||
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
|
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
|
||||||
ax.text(j, i, text, ha="center", va="center", fontsize=8,
|
ax.text(j, i, text, ha="center", va="center", fontsize=7,
|
||||||
color="white" if abs(val) >= 2 else "black")
|
color="white" if abs(val) >= 2 else "black")
|
||||||
|
|
||||||
ax.set_title("Score Difference (Lush - Bash)")
|
ax.set_title("Score Difference (Lush - Bash)")
|
||||||
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
|
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
|
||||||
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
|
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
|||||||
by_cat[r.category].append(r)
|
by_cat[r.category].append(r)
|
||||||
|
|
||||||
charts: list[tuple[str, str]] = []
|
charts: list[tuple[str, str]] = []
|
||||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
for cat in sorted(by_cat):
|
for cat in sorted(by_cat):
|
||||||
cat_results = by_cat[cat]
|
cat_results = by_cat[cat]
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in cat_results:
|
for r in cat_results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||||
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(6, 3.5))
|
fig, ax = plt.subplots(figsize=(7, 5))
|
||||||
y = range(len(labels))
|
y = range(len(labels))
|
||||||
bar_h = 0.35
|
bar_h = 0.35
|
||||||
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||||
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
score_rows = []
|
score_rows = []
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_val = scores[key]["bash"]
|
b_val = scores[key]["bash"]
|
||||||
l_val = scores[key]["lush"]
|
l_val = scores[key]["lush"]
|
||||||
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
||||||
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
f'<td>{b_str}</td><td>{l_str}</td>'
|
f'<td>{b_str}</td><td>{l_str}</td>'
|
||||||
f'<td class="{d_cls}">{d_str}</td></tr>')
|
f'<td class="{d_cls}">{d_str}</td></tr>')
|
||||||
|
|
||||||
obs = _get_freeform(r)
|
|
||||||
obs_html = ""
|
|
||||||
for lang, text in obs.items():
|
|
||||||
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
|
|
||||||
|
|
||||||
sections.append(f"""
|
sections.append(f"""
|
||||||
<div class="task-detail">
|
<div class="task-detail">
|
||||||
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
||||||
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
|||||||
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
|
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
|
||||||
<tbody>{"".join(score_rows)}</tbody>
|
<tbody>{"".join(score_rows)}</tbody>
|
||||||
</table>
|
</table>
|
||||||
<div class="observations">{obs_html}</div>
|
|
||||||
</div>""")
|
</div>""")
|
||||||
|
|
||||||
return "\n".join(sections)
|
return "\n".join(sections)
|
||||||
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
|
|||||||
.scores {{ width: auto; }}
|
.scores {{ width: auto; }}
|
||||||
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
|
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
|
||||||
.scores th:nth-child(n+2) {{ text-align: center; }}
|
.scores th:nth-child(n+2) {{ text-align: center; }}
|
||||||
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
|
|
||||||
.observations p {{ margin-bottom: 6px; }}
|
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import anthropic
|
import anthropic
|
||||||
@@ -17,8 +19,17 @@ class AnthropicProvider:
|
|||||||
self._client = anthropic.Anthropic(api_key=api_key)
|
self._client = anthropic.Anthropic(api_key=api_key)
|
||||||
self._model = config.get("model", "claude-sonnet-4-20250514")
|
self._model = config.get("model", "claude-sonnet-4-20250514")
|
||||||
self._max_tokens = config.get("max_tokens", 4096)
|
self._max_tokens = config.get("max_tokens", 4096)
|
||||||
|
self._min_request_interval = config.get("min_request_interval", 0.1)
|
||||||
|
self._last_request_time = 0.0
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
def send(self, messages: list[Message], system: str = "") -> str:
|
def send(self, messages: list[Message], system: str = "") -> str:
|
||||||
|
with self._lock:
|
||||||
|
elapsed = time.monotonic() - self._last_request_time
|
||||||
|
if elapsed < self._min_request_interval:
|
||||||
|
time.sleep(self._min_request_interval - elapsed)
|
||||||
|
self._last_request_time = time.monotonic()
|
||||||
|
|
||||||
api_messages = [{"role": m.role, "content": m.content} for m in messages]
|
api_messages = [{"role": m.role, "content": m.content} for m in messages]
|
||||||
kwargs: dict[str, Any] = {
|
kwargs: dict[str, Any] = {
|
||||||
"model": self._model,
|
"model": self._model,
|
||||||
|
|||||||
@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
|
|||||||
from .providers.base import LLMProvider, Message
|
from .providers.base import LLMProvider, Message
|
||||||
|
|
||||||
QUESTIONS = [
|
QUESTIONS = [
|
||||||
{
|
# Syntax & Readability
|
||||||
"question": "Readability: The solution is easy to read and understand",
|
{"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
|
||||||
},
|
{"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
|
||||||
{
|
# Expressiveness
|
||||||
"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
|
{"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
|
||||||
},
|
{"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
|
||||||
{
|
# Data & I/O
|
||||||
"question": "Conciseness: The solution required minimal boilerplate",
|
{"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
{"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
|
||||||
},
|
# Error Handling
|
||||||
{
|
{"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
|
||||||
"question": "Error handling: Error handling was straightforward",
|
{"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
# Overall
|
||||||
},
|
{"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
|
||||||
{
|
{"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
|
||||||
"question": "Overall preference: I would prefer this language for similar tasks",
|
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"question": "Learning curve: An unfamiliar developer could understand the solution quickly",
|
|
||||||
"choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
|
|
||||||
},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
|
||||||
|
|
||||||
|
|
||||||
def build_questionnaire_prompt(
|
def build_questionnaire_prompt(
|
||||||
task_name: str,
|
task_name: str,
|
||||||
language: str,
|
language: str,
|
||||||
solution_code: str,
|
solution_code: str,
|
||||||
) -> str:
|
) -> str:
|
||||||
|
choices_str = ", ".join(f'"{c}"' for c in CHOICES)
|
||||||
|
|
||||||
questions_text = ""
|
questions_text = ""
|
||||||
for i, q in enumerate(QUESTIONS, 1):
|
for q in QUESTIONS:
|
||||||
choices_str = ", ".join(f'"{c}"' for c in q["choices"])
|
questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
|
||||||
questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
|
|
||||||
|
|
||||||
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
|
return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
|
||||||
|
|
||||||
@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
|
|||||||
{solution_code}
|
{solution_code}
|
||||||
```
|
```
|
||||||
|
|
||||||
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
|
Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
|
||||||
|
|
||||||
|
Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}
|
||||||
|
|
||||||
[
|
[
|
||||||
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
|
{questions_text}]"""
|
||||||
]"""
|
|
||||||
|
|
||||||
|
def _extract_int(value: str) -> int | None:
|
||||||
|
"""Extract leading digit from a response like '4 - Agree'."""
|
||||||
|
s = value.strip()
|
||||||
|
if s and s[0].isdigit():
|
||||||
|
return int(s[0])
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
||||||
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
|
|||||||
|
|
||||||
results = []
|
results = []
|
||||||
for item in data:
|
for item in data:
|
||||||
|
question_id = item.get("id", item.get("question", ""))
|
||||||
|
raw_selected = item.get("selected", "")
|
||||||
|
|
||||||
|
# Normalize to int
|
||||||
|
if isinstance(raw_selected, int):
|
||||||
|
selected: int | str = raw_selected
|
||||||
|
else:
|
||||||
|
parsed = _extract_int(str(raw_selected))
|
||||||
|
selected = parsed if parsed is not None else raw_selected
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
QuestionnaireResponse(
|
QuestionnaireResponse(
|
||||||
question=item.get("question", ""),
|
question=question_id,
|
||||||
selected=item.get("selected", ""),
|
selected=selected,
|
||||||
choices=item.get("choices"),
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|||||||
@@ -5,16 +5,32 @@ from pathlib import Path
|
|||||||
|
|
||||||
from .models import BenchmarkResult
|
from .models import BenchmarkResult
|
||||||
|
|
||||||
# Likert questions in order (must match questionnaire.py QUESTIONS)
|
# New 12-item question list: (key, label, dimension)
|
||||||
LIKERT_QUESTIONS = [
|
LIKERT_QUESTIONS = [
|
||||||
("Readability", "Readability"),
|
("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
|
||||||
("Expressiveness", "Expressiveness"),
|
("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
|
||||||
("Conciseness", "Conciseness"),
|
("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
|
||||||
("Error handling", "Error handling"),
|
("builtin_ops", "Built-in operations", "Expressiveness"),
|
||||||
("Overall preference", "Overall preference"),
|
("string_ops", "String operations", "Expressiveness"),
|
||||||
("Learning curve", "Learning curve"),
|
("composition", "Composition", "Expressiveness"),
|
||||||
|
("io_ergonomics", "I/O ergonomics", "Data & I/O"),
|
||||||
|
("data_structures", "Data structures", "Data & I/O"),
|
||||||
|
("error_model", "Error model", "Error Handling"),
|
||||||
|
("edge_case_support", "Edge case support", "Error Handling"),
|
||||||
|
("learnability", "Learnability", "Overall"),
|
||||||
|
("fitness", "Fitness for task", "Overall"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Map old 6 legacy keys to new keys for back-compat with existing results
|
||||||
|
LEGACY_KEY_MAP = {
|
||||||
|
"Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
|
||||||
|
"Expressiveness": ["builtin_ops", "string_ops", "composition"],
|
||||||
|
"Conciseness": ["signal_to_noise"],
|
||||||
|
"Error handling": ["error_model", "edge_case_support"],
|
||||||
|
"Overall preference": ["fitness"],
|
||||||
|
"Learning curve": ["learnability"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
||||||
"""Load results, keeping only the latest run per task name."""
|
"""Load results, keeping only the latest run per task name."""
|
||||||
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_likert(selected: str | int) -> int | None:
|
def _parse_likert(selected: str | int) -> int | None:
|
||||||
"""Extract numeric value from a likert response like '4 - Agree'."""
|
"""Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
|
||||||
if isinstance(selected, int):
|
if isinstance(selected, int):
|
||||||
return selected
|
return selected
|
||||||
s = str(selected).strip()
|
s = str(selected).strip()
|
||||||
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:
|
|||||||
|
|
||||||
|
|
||||||
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
|
def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
|
||||||
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
|
"""Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
|
||||||
|
|
||||||
|
Handles both new-format results (exact id match) and legacy results (startswith match
|
||||||
|
mapped to new keys).
|
||||||
|
"""
|
||||||
scores: dict[str, dict[str, float | None]] = {}
|
scores: dict[str, dict[str, float | None]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
scores[key] = {"bash": None, "lush": None}
|
scores[key] = {"bash": None, "lush": None}
|
||||||
|
|
||||||
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
||||||
if not lang_result:
|
if not lang_result:
|
||||||
continue
|
continue
|
||||||
for q in lang_result.questionnaire:
|
for q in lang_result.questionnaire:
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
# Try exact match on new question ids
|
||||||
if q.question.startswith(key):
|
if q.question in scores:
|
||||||
|
val = _parse_likert(q.selected)
|
||||||
|
if val is not None:
|
||||||
|
scores[q.question][lang_name] = float(val)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Legacy: map old key to new keys (spread the score)
|
||||||
|
for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
|
||||||
|
if q.question.startswith(legacy_prefix):
|
||||||
val = _parse_likert(q.selected)
|
val = _parse_likert(q.selected)
|
||||||
if val is not None:
|
if val is not None:
|
||||||
scores[key][lang_name] = float(val)
|
for nk in new_keys:
|
||||||
|
if scores[nk][lang_name] is None:
|
||||||
|
scores[nk][lang_name] = float(val)
|
||||||
break
|
break
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
|
|||||||
return "\u2588" * filled + "\u2591" * (width - filled)
|
return "\u2588" * filled + "\u2591" * (width - filled)
|
||||||
|
|
||||||
|
|
||||||
def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
|
|
||||||
"""Extract free-form observations per language."""
|
|
||||||
obs: dict[str, str] = {}
|
|
||||||
for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
|
|
||||||
if not lang_result:
|
|
||||||
continue
|
|
||||||
for q in lang_result.questionnaire:
|
|
||||||
if q.question.startswith("Free-form"):
|
|
||||||
obs[lang_name] = str(q.selected)
|
|
||||||
break
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
def render_summary_table(results: list[BenchmarkResult]) -> str:
|
def render_summary_table(results: list[BenchmarkResult]) -> str:
|
||||||
"""Render the pass/fail + turns overview table."""
|
"""Render the pass/fail + turns overview table."""
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||||
"""Render aggregated questionnaire scores with bar charts."""
|
"""Render aggregated questionnaire scores with bar charts, grouped by dimension."""
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
lines.append("=" * 78)
|
lines.append("=" * 78)
|
||||||
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
|
lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
|
||||||
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
|
|
||||||
# Aggregate scores across all tasks
|
# Aggregate scores across all tasks
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
|
|
||||||
for r in results:
|
for r in results:
|
||||||
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
for key, label in LIKERT_QUESTIONS:
|
# Group by dimension
|
||||||
|
current_dim = None
|
||||||
|
for key, label, dimension in LIKERT_QUESTIONS:
|
||||||
|
if dimension != current_dim:
|
||||||
|
if current_dim is not None:
|
||||||
|
lines.append("")
|
||||||
|
lines.append(f" [{dimension}]")
|
||||||
|
current_dim = dimension
|
||||||
|
|
||||||
b_vals = agg[key]["bash"]
|
b_vals = agg[key]["bash"]
|
||||||
l_vals = agg[key]["lush"]
|
l_vals = agg[key]["lush"]
|
||||||
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
||||||
@@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
diff = l_avg - b_avg
|
diff = l_avg - b_avg
|
||||||
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
||||||
|
|
||||||
lines.append(f" {label}")
|
lines.append(f" {label}")
|
||||||
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
|
lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
|
||||||
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
|
lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
# Overall average
|
# Overall average
|
||||||
all_bash = [v for key in agg for v in agg[key]["bash"]]
|
all_bash = [v for key in agg for v in agg[key]["bash"]]
|
||||||
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
|||||||
diff = l_overall - b_overall
|
diff = l_overall - b_overall
|
||||||
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
|
||||||
|
|
||||||
|
lines.append("")
|
||||||
lines.append(" " + "-" * 50)
|
lines.append(" " + "-" * 50)
|
||||||
lines.append(f" Overall average")
|
lines.append(f" Overall average")
|
||||||
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
|
lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
|
||||||
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
lines.append(f" {cat}")
|
lines.append(f" {cat}")
|
||||||
|
|
||||||
agg: dict[str, dict[str, list[float]]] = {}
|
agg: dict[str, dict[str, list[float]]] = {}
|
||||||
for key, _ in LIKERT_QUESTIONS:
|
for key, _, _ in LIKERT_QUESTIONS:
|
||||||
agg[key] = {"bash": [], "lush": []}
|
agg[key] = {"bash": [], "lush": []}
|
||||||
for r in cat_results:
|
for r in cat_results:
|
||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
|||||||
if val is not None:
|
if val is not None:
|
||||||
agg[key][lang].append(val)
|
agg[key][lang].append(val)
|
||||||
|
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_vals = agg[key]["bash"]
|
b_vals = agg[key]["bash"]
|
||||||
l_vals = agg[key]["lush"]
|
l_vals = agg[key]["lush"]
|
||||||
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
|
||||||
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
|||||||
scores = _get_likert_scores(r)
|
scores = _get_likert_scores(r)
|
||||||
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
|
lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
|
||||||
lines.append(" " + "-" * 40)
|
lines.append(" " + "-" * 40)
|
||||||
for key, label in LIKERT_QUESTIONS:
|
for key, label, _ in LIKERT_QUESTIONS:
|
||||||
b_val = scores[key]["bash"]
|
b_val = scores[key]["bash"]
|
||||||
l_val = scores[key]["lush"]
|
l_val = scores[key]["lush"]
|
||||||
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
||||||
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
|||||||
d_str = "-"
|
d_str = "-"
|
||||||
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
|
lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")
|
||||||
|
|
||||||
# Free-form observations
|
|
||||||
obs = _get_freeform(r)
|
|
||||||
if obs:
|
|
||||||
lines.append("")
|
|
||||||
for lang, text in obs.items():
|
|
||||||
# Wrap long text
|
|
||||||
wrapped = text[:120] + ("..." if len(text) > 120 else "")
|
|
||||||
lines.append(f" {lang}: {wrapped}")
|
|
||||||
|
|
||||||
lines.append("")
|
lines.append("")
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|||||||
104
main.py
104
main.py
@@ -2,7 +2,9 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
import threading
|
||||||
import tomllib
|
import tomllib
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -16,6 +18,8 @@ from lush_bench.export import export_html
|
|||||||
from lush_bench.report import render_report
|
from lush_bench.report import render_report
|
||||||
from lush_bench.results import save_result
|
from lush_bench.results import save_result
|
||||||
|
|
||||||
|
_print_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
PROVIDERS = {
|
PROVIDERS = {
|
||||||
"anthropic": AnthropicProvider,
|
"anthropic": AnthropicProvider,
|
||||||
@@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None:
|
|||||||
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||||
|
|
||||||
|
|
||||||
def cmd_run(args: argparse.Namespace) -> None:
|
def _log(msg: str) -> None:
|
||||||
config = Config.load()
|
"""Thread-safe print."""
|
||||||
task_path = Path(args.task)
|
with _print_lock:
|
||||||
|
print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_task(
|
||||||
|
task_path: Path,
|
||||||
|
provider_name: str,
|
||||||
|
config: Config,
|
||||||
|
provider: AnthropicProvider | None = None,
|
||||||
|
) -> BenchmarkResult:
|
||||||
|
"""Core task runner. Thread-safe — usable from cmd_run or a thread pool."""
|
||||||
task = load_task(task_path)
|
task = load_task(task_path)
|
||||||
|
|
||||||
provider_name = args.provider
|
if provider is None:
|
||||||
if provider_name not in PROVIDERS:
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
provider_config = config.provider_configs.get(provider_name, {})
|
|
||||||
provider = PROVIDERS[provider_name](provider_config)
|
|
||||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||||
|
|
||||||
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
_log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||||
|
|
||||||
bash_result = None
|
bash_result = None
|
||||||
lush_result = None
|
lush_result = None
|
||||||
|
|
||||||
if task.mode == "solve":
|
if task.mode == "solve":
|
||||||
# Solve mode: agent writes code in both languages
|
_log(f" [{task.name}] Solving in bash...")
|
||||||
print(" Solving in bash...")
|
|
||||||
bash_result = solve_task(provider, task, "bash", config)
|
bash_result = solve_task(provider, task, "bash", config)
|
||||||
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
_log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||||
|
|
||||||
print(" Solving in lush...")
|
_log(f" [{task.name}] Solving in lush...")
|
||||||
lush_result = solve_task(provider, task, "lush", config)
|
lush_result = solve_task(provider, task, "lush", config)
|
||||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||||
|
|
||||||
elif task.mode == "convert":
|
elif task.mode == "convert":
|
||||||
# Convert mode: verify provided bash source directly, then convert to lush
|
|
||||||
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
||||||
print(" Verifying provided bash source...")
|
_log(f" [{task.name}] Verifying provided bash source...")
|
||||||
test_results = evaluate(task, task.bash_source, "bash", config)
|
test_results = evaluate(task, task.bash_source, "bash", config)
|
||||||
all_passed = all(tr.passed for tr in test_results)
|
all_passed = all(tr.passed for tr in test_results)
|
||||||
bash_result = LanguageResult(
|
bash_result = LanguageResult(
|
||||||
@@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None:
|
|||||||
all_passed=all_passed,
|
all_passed=all_passed,
|
||||||
agent_turns=0,
|
agent_turns=0,
|
||||||
)
|
)
|
||||||
print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
|
_log(f" [{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}")
|
||||||
|
|
||||||
print(" Converting to lush...")
|
_log(f" [{task.name}] Converting to lush...")
|
||||||
lush_result = solve_task(provider, task, "lush", config)
|
lush_result = solve_task(provider, task, "lush", config)
|
||||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||||
|
|
||||||
# Run questionnaire for each completed language
|
# Run questionnaire for each completed language
|
||||||
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
|
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
|
||||||
if result and result.solution_code:
|
if result and result.solution_code:
|
||||||
print(f" Questionnaire for {lang}...")
|
_log(f" [{task.name}] Questionnaire for {lang}...")
|
||||||
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
|
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
|
||||||
|
|
||||||
benchmark = BenchmarkResult(
|
benchmark = BenchmarkResult(
|
||||||
@@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
result_dir = save_result(benchmark, config.output_dir)
|
result_dir = save_result(benchmark, config.output_dir)
|
||||||
print(f" Results saved to {result_dir}")
|
_log(f" [{task.name}] Results saved to {result_dir}")
|
||||||
|
return benchmark
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_run(args: argparse.Namespace) -> None:
|
||||||
|
config = Config.load()
|
||||||
|
|
||||||
|
provider_name = args.provider
|
||||||
|
if provider_name not in PROVIDERS:
|
||||||
|
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
|
_run_task(Path(args.task), provider_name, config, provider)
|
||||||
|
|
||||||
|
|
||||||
def cmd_run_all(args: argparse.Namespace) -> None:
|
def cmd_run_all(args: argparse.Namespace) -> None:
|
||||||
|
config = Config.load()
|
||||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||||
if not paths:
|
if not paths:
|
||||||
print("No tasks found.")
|
print("No tasks found.")
|
||||||
return
|
return
|
||||||
|
|
||||||
for p in paths:
|
provider_name = args.provider
|
||||||
# Reuse cmd_run by constructing a namespace
|
if provider_name not in PROVIDERS:
|
||||||
run_args = argparse.Namespace(task=str(p), provider=args.provider)
|
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
||||||
cmd_run(run_args)
|
sys.exit(1)
|
||||||
print()
|
|
||||||
|
# Share one provider instance across threads (its rate limiter is thread-safe)
|
||||||
|
provider_config = config.provider_configs.get(provider_name, {})
|
||||||
|
provider = PROVIDERS[provider_name](provider_config)
|
||||||
|
|
||||||
|
max_workers = args.workers if args.workers is not None else config.max_workers
|
||||||
|
print(f"Running {len(paths)} tasks with {max_workers} workers")
|
||||||
|
|
||||||
|
failed: list[str] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||||
|
futures = {
|
||||||
|
pool.submit(_run_task, p, provider_name, config, provider): p
|
||||||
|
for p in paths
|
||||||
|
}
|
||||||
|
for future in as_completed(futures):
|
||||||
|
task_path = futures[future]
|
||||||
|
try:
|
||||||
|
future.result()
|
||||||
|
except Exception as exc:
|
||||||
|
task_name = task_path.stem
|
||||||
|
failed.append(task_name)
|
||||||
|
_log(f" [{task_name}] FAILED: {exc}")
|
||||||
|
|
||||||
|
print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.")
|
||||||
|
if failed:
|
||||||
|
print(f"Failed: {', '.join(failed)}")
|
||||||
|
|
||||||
|
|
||||||
def cmd_report(args: argparse.Namespace) -> None:
|
def cmd_report(args: argparse.Namespace) -> None:
|
||||||
@@ -183,6 +232,7 @@ def main() -> None:
|
|||||||
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||||
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||||
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
||||||
|
ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)")
|
||||||
ra.set_defaults(func=cmd_run_all)
|
ra.set_defaults(func=cmd_run_all)
|
||||||
|
|
||||||
# report
|
# report
|
||||||
|
|||||||
202
report.html
202
report.html
File diff suppressed because one or more lines are too long
97
tasks/environment/path_normalizer.toml
Normal file
97
tasks/environment/path_normalizer.toml
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
name = "path_normalizer"
|
||||||
|
category = "environment"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Read file paths from stdin, one per line. Normalize each path:
|
||||||
|
1. Replace a leading "~" with the value of $HOME.
|
||||||
|
2. Remove trailing slashes (except for root "/").
|
||||||
|
3. Collapse consecutive slashes into one.
|
||||||
|
4. Resolve "." components (remove them).
|
||||||
|
5. Resolve ".." components (go up one directory level).
|
||||||
|
Output the cleaned path, one per line.
|
||||||
|
Skip empty lines.
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Expand tilde
|
||||||
|
path=$(echo "$line" | sed "s:^~:$HOME:")
|
||||||
|
|
||||||
|
# Collapse multiple slashes
|
||||||
|
path=$(echo "$path" | sed 's:/\\+:/:g')
|
||||||
|
|
||||||
|
# Remove trailing slash (but keep root)
|
||||||
|
path=$(echo "$path" | sed 's:/$::')
|
||||||
|
[[ -z "$path" ]] && path="/"
|
||||||
|
|
||||||
|
# Resolve . and .. components
|
||||||
|
IFS='/' read -ra parts <<< "$path"
|
||||||
|
result=()
|
||||||
|
absolute=""
|
||||||
|
if [[ "$path" == /* ]]; then
|
||||||
|
absolute="/"
|
||||||
|
fi
|
||||||
|
|
||||||
|
for part in "${parts[@]}"; do
|
||||||
|
if [[ "$part" == "." || "$part" == "" ]]; then
|
||||||
|
continue
|
||||||
|
elif [[ "$part" == ".." ]]; then
|
||||||
|
if [[ ${#result[@]} -gt 0 && "${result[-1]}" != ".." ]]; then
|
||||||
|
unset 'result[${#result[@]}-1]'
|
||||||
|
elif [[ -z "$absolute" ]]; then
|
||||||
|
result+=("..")
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
result+=("$part")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -n "$absolute" ]]; then
|
||||||
|
final="/"
|
||||||
|
IFS='/'; final+="${result[*]}"
|
||||||
|
else
|
||||||
|
IFS='/'; final="${result[*]}"
|
||||||
|
fi
|
||||||
|
[[ -z "$final" ]] && final="/"
|
||||||
|
|
||||||
|
echo "$final"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Tilde expansion and trailing slashes"
|
||||||
|
stdin = """~/Documents/
|
||||||
|
~/projects/code
|
||||||
|
/var/log/"""
|
||||||
|
expected_stdout = """/Users/testuser/Documents
|
||||||
|
/Users/testuser/projects/code
|
||||||
|
/var/log"""
|
||||||
|
env = { "HOME" = "/Users/testuser" }
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Resolving . and .. components"
|
||||||
|
stdin = """/usr/local/./bin
|
||||||
|
/home/user/../shared/docs
|
||||||
|
/a/b/c/../../d"""
|
||||||
|
expected_stdout = """/usr/local/bin
|
||||||
|
/home/shared/docs
|
||||||
|
/a/d"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Collapsing multiple slashes"
|
||||||
|
stdin = """/usr//local///bin
|
||||||
|
/var/log//syslog"""
|
||||||
|
expected_stdout = """/usr/local/bin
|
||||||
|
/var/log/syslog"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Root and relative paths"
|
||||||
|
stdin = """/
|
||||||
|
./config/settings
|
||||||
|
../parent/child"""
|
||||||
|
expected_stdout = """/
|
||||||
|
config/settings
|
||||||
|
../parent/child"""
|
||||||
150
tasks/filesystem/todo_manager.toml
Normal file
150
tasks/filesystem/todo_manager.toml
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
name = "todo_manager"
|
||||||
|
category = "filesystem"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
A simple todo list manager. Read commands from stdin, one per line:
|
||||||
|
add <task text> — append the task to the todo file
|
||||||
|
list — print all tasks numbered as "NN). <task>"
|
||||||
|
remove <N> — remove task number N (1-based)
|
||||||
|
clear — remove all tasks
|
||||||
|
|
||||||
|
The todo file is "todo.txt" in the working directory.
|
||||||
|
When listing, pad task numbers to two digits (01, 02, …).
|
||||||
|
After "add" or "remove", automatically list the remaining tasks.
|
||||||
|
If the list is empty, print "No tasks found".
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
TODOFILE="./todo.txt"
|
||||||
|
|
||||||
|
list_tasks() {
|
||||||
|
if [ -f "$TODOFILE" ] && [ -s "$TODOFILE" ]; then
|
||||||
|
count=1
|
||||||
|
IFS=$'\\n'
|
||||||
|
while read -r task; do
|
||||||
|
num=$count
|
||||||
|
if [ $count -lt 10 ]; then num="0$count"; fi
|
||||||
|
echo "$num). $task"
|
||||||
|
count=$(( count + 1 ))
|
||||||
|
done < "$TODOFILE"
|
||||||
|
else
|
||||||
|
echo "No tasks found"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
add_task() {
|
||||||
|
echo "$1" >> "$TODOFILE"
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_task() {
|
||||||
|
taskNum=$1
|
||||||
|
totalLines=$(wc -l < "$TODOFILE" | tr -d ' ')
|
||||||
|
if [ "$taskNum" -gt "$totalLines" ] 2>/dev/null; then
|
||||||
|
echo "Error: task number $taskNum does not exist!"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
tmpfile="./todo_tmp.txt"
|
||||||
|
count=1
|
||||||
|
IFS=$'\\n'
|
||||||
|
while read -r task; do
|
||||||
|
if [ "$count" -ne "$taskNum" ]; then
|
||||||
|
echo "$task" >> "$tmpfile"
|
||||||
|
fi
|
||||||
|
count=$(( count + 1 ))
|
||||||
|
done < "$TODOFILE"
|
||||||
|
if [ -f "$tmpfile" ]; then
|
||||||
|
mv "$tmpfile" "$TODOFILE"
|
||||||
|
else
|
||||||
|
> "$TODOFILE"
|
||||||
|
fi
|
||||||
|
echo "Sucessfully removed task number $taskNum"
|
||||||
|
}
|
||||||
|
|
||||||
|
clear_tasks() {
|
||||||
|
> "$TODOFILE"
|
||||||
|
echo "Tasks cleared."
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ ! -f "$TODOFILE" ]; then
|
||||||
|
touch "$TODOFILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
cmd=$(echo "$line" | cut -d' ' -f1)
|
||||||
|
arg=$(echo "$line" | cut -d' ' -f2-)
|
||||||
|
case "$cmd" in
|
||||||
|
add)
|
||||||
|
add_task "$arg"
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
list)
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
remove)
|
||||||
|
remove_task "$arg"
|
||||||
|
list_tasks
|
||||||
|
;;
|
||||||
|
clear)
|
||||||
|
clear_tasks
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Add tasks then list"
|
||||||
|
stdin = """add Buy groceries
|
||||||
|
add Walk the dog
|
||||||
|
list"""
|
||||||
|
expected_stdout = """01). Buy groceries
|
||||||
|
01). Buy groceries
|
||||||
|
02). Walk the dog
|
||||||
|
01). Buy groceries
|
||||||
|
02). Walk the dog"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Add, remove, list"
|
||||||
|
stdin = """add First task
|
||||||
|
add Second task
|
||||||
|
add Third task
|
||||||
|
remove 2
|
||||||
|
list"""
|
||||||
|
expected_stdout = """01). First task
|
||||||
|
01). First task
|
||||||
|
02). Second task
|
||||||
|
01). First task
|
||||||
|
02). Second task
|
||||||
|
03). Third task
|
||||||
|
Sucessfully removed task number 2
|
||||||
|
01). First task
|
||||||
|
02). Third task
|
||||||
|
01). First task
|
||||||
|
02). Third task"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Empty list and clear"
|
||||||
|
stdin = """list
|
||||||
|
add Something
|
||||||
|
clear
|
||||||
|
list"""
|
||||||
|
expected_stdout = """No tasks found
|
||||||
|
01). Something
|
||||||
|
Tasks cleared.
|
||||||
|
No tasks found"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Works with pre-existing todo file"
|
||||||
|
stdin = """list
|
||||||
|
add Third item
|
||||||
|
list"""
|
||||||
|
setup_files = { "todo.txt" = "Existing item one\nExisting item two\n" }
|
||||||
|
expected_stdout = """01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
03). Third item
|
||||||
|
01). Existing item one
|
||||||
|
02). Existing item two
|
||||||
|
03). Third item"""
|
||||||
|
expected_files = { "todo.txt" = "Existing item one\nExisting item two\nThird item\n" }
|
||||||
113
tasks/pipeline/currency_converter.toml
Normal file
113
tasks/pipeline/currency_converter.toml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
name = "currency_converter"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
A currency converter that reads conversion requests from stdin.
|
||||||
|
Each line has the format: AMOUNT FROM TO RATE
|
||||||
|
- AMOUNT: a decimal number (e.g., 12.35)
|
||||||
|
- FROM: 3-letter currency code
|
||||||
|
- TO: 3-letter currency code
|
||||||
|
- RATE: the exchange rate from FROM's base to TO's base
|
||||||
|
|
||||||
|
Some currencies are pegged to others at fixed rates:
|
||||||
|
BAM is pegged to EUR at 1.95583
|
||||||
|
BMD is pegged to USD at 1.0
|
||||||
|
BND is pegged to SGD at 1.0
|
||||||
|
DJF is pegged to USD at 177.721
|
||||||
|
PAB is pegged to USD at 1.0
|
||||||
|
|
||||||
|
When a pegged currency is involved, the conversion must account for the
|
||||||
|
peg coefficient. The formula is: result = amount * (rate / coef_from) * coef_to
|
||||||
|
where coef is the peg ratio (1 if not pegged).
|
||||||
|
|
||||||
|
Output one line per input: "AMOUNT FROM = RESULT TO" with RESULT
|
||||||
|
computed using bc with scale=2.
|
||||||
|
For invalid lines (wrong field count or non-numeric amount), output "ERROR: <original line>".
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
pegged_to() {
|
||||||
|
case "$1" in
|
||||||
|
BAM) echo "EUR:1.95583" ;;
|
||||||
|
BMD) echo "USD:1.0" ;;
|
||||||
|
BND) echo "SGD:1.0" ;;
|
||||||
|
DJF) echo "USD:177.721" ;;
|
||||||
|
PAB) echo "USD:1.0" ;;
|
||||||
|
*) echo "NONE:1" ;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Skip empty lines
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Parse fields
|
||||||
|
set -- $line
|
||||||
|
if [[ $# -ne 4 ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
amount=$1
|
||||||
|
from=$2
|
||||||
|
to=$3
|
||||||
|
rate=$4
|
||||||
|
|
||||||
|
# Validate amount is numeric
|
||||||
|
if [[ ! "$amount" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Validate rate is numeric
|
||||||
|
if [[ ! "$rate" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
|
||||||
|
echo "ERROR: $line"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Get peg info
|
||||||
|
peg_from=$(pegged_to "$from")
|
||||||
|
coef_from=$(echo "$peg_from" | cut -d: -f2)
|
||||||
|
|
||||||
|
peg_to=$(pegged_to "$to")
|
||||||
|
coef_to=$(echo "$peg_to" | cut -d: -f2)
|
||||||
|
|
||||||
|
# Calculate: result = amount * (rate / coef_from) * coef_to
|
||||||
|
result=$(echo "scale=8; $amount * ($rate / $coef_from) * $coef_to" | bc)
|
||||||
|
result=$(printf "%.2f" "$result")
|
||||||
|
|
||||||
|
echo "$amount $from = $result $to"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Standard conversion with direct rate"
|
||||||
|
stdin = """100 USD EUR 0.92
|
||||||
|
50 GBP JPY 188.50"""
|
||||||
|
expected_stdout = """100 USD = 92.00 EUR
|
||||||
|
50 GBP = 9425.00 JPY"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Pegged currency conversions"
|
||||||
|
stdin = """100 BAM USD 1.08
|
||||||
|
200 BMD EUR 0.92
|
||||||
|
50 USD DJF 1.0"""
|
||||||
|
expected_stdout = """100 BAM = 55.22 USD
|
||||||
|
200 BMD = 184.00 EUR
|
||||||
|
50 USD = 8886.05 DJF"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Invalid input lines"
|
||||||
|
stdin = """abc EUR USD 0.92
|
||||||
|
100 USD
|
||||||
|
100 EUR USD 0.85"""
|
||||||
|
expected_stdout = """ERROR: abc EUR USD 0.92
|
||||||
|
ERROR: 100 USD
|
||||||
|
100 EUR = 85.00 USD"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Pegged-to-pegged conversion"
|
||||||
|
stdin = "100 BAM BMD 1.08"
|
||||||
|
expected_stdout = "100 BAM = 55.22 BMD"
|
||||||
76
tasks/pipeline/locale_weather_url.toml
Normal file
76
tasks/pipeline/locale_weather_url.toml
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
name = "locale_weather_url"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Construct weather API URLs from locale and location information.
|
||||||
|
Read lines from stdin in the format: LANG_CODE LOCATION
|
||||||
|
where LANG_CODE is a 2-letter locale (e.g., "en", "fr", "de")
|
||||||
|
and LOCATION is a city/place name (may contain spaces).
|
||||||
|
|
||||||
|
For each line, construct a URL in the format:
|
||||||
|
https://LANG.wttr.in/LOCATION
|
||||||
|
|
||||||
|
Where spaces in the location are replaced with "+" characters.
|
||||||
|
|
||||||
|
If LANG_CODE is empty or invalid (not exactly 2 lowercase letters),
|
||||||
|
default to "en".
|
||||||
|
|
||||||
|
Skip empty lines. Output one URL per input line.
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Extract lang code (first field)
|
||||||
|
lang=$(echo "$line" | awk '{print $1}')
|
||||||
|
|
||||||
|
# Extract location (everything after first field)
|
||||||
|
location=$(echo "$line" | sed 's/^[^ ]* *//')
|
||||||
|
|
||||||
|
# Validate lang code: must be exactly 2 lowercase letters
|
||||||
|
if [[ ! "$lang" =~ ^[a-z]{2}$ ]]; then
|
||||||
|
lang="en"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# If location is same as lang (single-word line), skip
|
||||||
|
if [[ "$location" == "$lang" || -z "$location" ]]; then
|
||||||
|
location=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Replace spaces with +
|
||||||
|
location=$(echo "$location" | tr ' ' '+')
|
||||||
|
|
||||||
|
echo "https://$lang.wttr.in/$location"
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Various locales and locations"
|
||||||
|
stdin = """en New York
|
||||||
|
fr Paris
|
||||||
|
de Berlin
|
||||||
|
ja Tokyo"""
|
||||||
|
expected_stdout = """https://en.wttr.in/New+York
|
||||||
|
https://fr.wttr.in/Paris
|
||||||
|
https://de.wttr.in/Berlin
|
||||||
|
https://ja.wttr.in/Tokyo"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Multi-word locations"
|
||||||
|
stdin = """en San Francisco
|
||||||
|
es Buenos Aires
|
||||||
|
pt Rio de Janeiro"""
|
||||||
|
expected_stdout = """https://en.wttr.in/San+Francisco
|
||||||
|
https://es.wttr.in/Buenos+Aires
|
||||||
|
https://pt.wttr.in/Rio+de+Janeiro"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Invalid or missing locale defaults to en"
|
||||||
|
stdin = """ENG London
|
||||||
|
123 Moscow
|
||||||
|
x Rome"""
|
||||||
|
expected_stdout = """https://en.wttr.in/London
|
||||||
|
https://en.wttr.in/Moscow
|
||||||
|
https://en.wttr.in/Rome"""
|
||||||
86
tasks/pipeline/network_info_parser.toml
Normal file
86
tasks/pipeline/network_info_parser.toml
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
name = "network_info_parser"
|
||||||
|
category = "pipeline"
|
||||||
|
mode = "convert"
|
||||||
|
description = """
|
||||||
|
Parse network interface configuration from stdin (in "ip addr show" format)
|
||||||
|
and extract a summary of each interface.
|
||||||
|
|
||||||
|
For each interface block, output a line:
|
||||||
|
IFACE: <name> IP: <ipv4_addr> MASK: /<prefix_len>
|
||||||
|
|
||||||
|
An interface block starts with a line like:
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 ...
|
||||||
|
and contains inet lines like:
|
||||||
|
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
|
||||||
|
|
||||||
|
If an interface has no inet line, output:
|
||||||
|
IFACE: <name> IP: none MASK: none
|
||||||
|
|
||||||
|
Skip the loopback interface (lo).
|
||||||
|
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
current_iface=""
|
||||||
|
found_ip=""
|
||||||
|
found_mask=""
|
||||||
|
|
||||||
|
flush_iface() {
|
||||||
|
if [[ -n "$current_iface" && "$current_iface" != "lo" ]]; then
|
||||||
|
if [[ -n "$found_ip" ]]; then
|
||||||
|
echo "IFACE: $current_iface IP: $found_ip MASK: /$found_mask"
|
||||||
|
else
|
||||||
|
echo "IFACE: $current_iface IP: none MASK: none"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Detect interface line: starts with a number followed by colon
|
||||||
|
if echo "$line" | grep -qE '^[0-9]+:'; then
|
||||||
|
flush_iface
|
||||||
|
current_iface=$(echo "$line" | awk -F: '{print $2}' | sed 's/^[[:space:]]*//' | awk '{print $1}')
|
||||||
|
found_ip=""
|
||||||
|
found_mask=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Detect inet line (IPv4 only, not inet6)
|
||||||
|
if echo "$line" | grep -qE '^[[:space:]]+inet [0-9]'; then
|
||||||
|
ip_cidr=$(echo "$line" | awk '{print $2}')
|
||||||
|
found_ip=$(echo "$ip_cidr" | cut -d/ -f1)
|
||||||
|
found_mask=$(echo "$ip_cidr" | cut -d/ -f2)
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
flush_iface
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Two interfaces with IPs"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 192.168.1.100/24 brd 192.168.1.255 scope global eth0
|
||||||
|
3: wlan0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 10.0.0.42/16 brd 10.0.255.255 scope global wlan0"""
|
||||||
|
expected_stdout = """IFACE: eth0 IP: 192.168.1.100 MASK: /24
|
||||||
|
IFACE: wlan0 IP: 10.0.0.42 MASK: /16"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Interface with no IP"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: eth0: <BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
3: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500
|
||||||
|
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0"""
|
||||||
|
expected_stdout = """IFACE: eth0 IP: none MASK: none
|
||||||
|
IFACE: docker0 IP: 172.17.0.1 MASK: /16"""
|
||||||
|
|
||||||
|
[[test_cases]]
|
||||||
|
description = "Single interface"
|
||||||
|
stdin = """1: lo: <LOOPBACK,UP> mtu 65536
|
||||||
|
inet 127.0.0.1/8 scope host lo
|
||||||
|
2: enp3s0: <BROADCAST,MULTICAST,UP> mtu 9000
|
||||||
|
inet 10.10.10.5/8 brd 10.255.255.255 scope global enp3s0"""
|
||||||
|
expected_stdout = "IFACE: enp3s0 IP: 10.10.10.5 MASK: /8"
|
||||||
79
tasks/pipeline/url_normalizer.toml
Normal file
79
tasks/pipeline/url_normalizer.toml
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
   it must have a protocol (http:// or https://), followed by at least
   one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.

Skip empty lines silently.
"""
|
||||||
|
|
||||||
|
bash_source = '''
|
||||||
|
#!/bin/bash
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
# Skip empty lines
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
|
||||||
|
# Trim whitespace
|
||||||
|
url=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
|
||||||
|
[[ -z "$url" ]] && continue
|
||||||
|
|
||||||
|
original="$url"
|
||||||
|
|
||||||
|
# Check if it already has https://
|
||||||
|
prefix8=$(echo "$url" | cut -c1-8)
|
||||||
|
if [[ "$prefix8" == "https://" ]]; then
|
||||||
|
normalized="$url"
|
||||||
|
else
|
||||||
|
prefix7=$(echo "$url" | cut -c1-7)
|
||||||
|
if [[ "$prefix7" == "http://" ]]; then
|
||||||
|
normalized="$url"
|
||||||
|
else
|
||||||
|
normalized="http://$url"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Validate: protocol + something.something
|
||||||
|
if echo "$normalized" | grep -qE '^https?://[^/]+\.[^/]+'; then
|
||||||
|
echo "$normalized"
|
||||||
|
else
|
||||||
|
echo "INVALID: $original"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
'''
|
||||||
|
|
||||||
|
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
|
||||||
|
|
||||||
|
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
|
||||||
|
|
||||||
|
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path

api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""
|
||||||
Reference in New Issue
Block a user