diff --git a/lush_bench/agent.py b/lush_bench/agent.py
index 189b989..bbaa8e3 100644
--- a/lush_bench/agent.py
+++ b/lush_bench/agent.py
@@ -112,7 +112,7 @@ def solve_task(
"""Run the agent loop: prompt -> code -> test -> retry."""
system = build_system_prompt(language)
- if task.category == "b" and language == "lush":
+ if task.mode == "convert" and language == "lush":
user_prompt = build_conversion_prompt(task)
else:
user_prompt = build_task_prompt(task, language)
diff --git a/lush_bench/export.py b/lush_bench/export.py
index 5458bd9..87ab917 100644
--- a/lush_bench/export.py
+++ b/lush_bench/export.py
@@ -88,11 +88,11 @@ def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
"""Bar chart of agent turns per task for bash vs lush."""
- # Only include tasks where the agent actually solved (turns > 0)
- cat_a = [r for r in results if r.category == "a"]
- names = [r.task_name for r in cat_a]
- bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in cat_a]
- lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in cat_a]
+    # Keep only solve-mode tasks; convert-mode bash runs are verified directly (0 agent turns), so turn counts aren't comparable
+ solve = [r for r in results if r.mode == "solve"]
+ names = [r.task_name for r in solve]
+ bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in solve]
+ lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in solve]
fig, ax = plt.subplots(figsize=(8, 4))
x = range(len(names))
@@ -103,7 +103,7 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
ax.set_xticks(list(x))
ax.set_xticklabels(names, rotation=35, ha="right", fontsize=8)
ax.set_ylabel("Agent Turns")
- ax.set_title("Agent Turns to Solve (Category A)")
+ ax.set_title("Agent Turns to Solve (Solve Mode)")
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
ax.legend()
ax.grid(axis="y", alpha=0.3)
@@ -148,6 +148,151 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
return _fig_to_base64(fig)
+def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
+ """Grouped bar chart: one cluster per category, bars for bash/lush avg scores."""
+ from collections import defaultdict
+
+ by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+ for r in results:
+ by_cat[r.category].append(r)
+
+ categories = sorted(by_cat)
+ bash_avgs = []
+ lush_avgs = []
+ for cat in categories:
+ b_scores: list[float] = []
+ l_scores: list[float] = []
+ for r in by_cat[cat]:
+ scores = _get_likert_scores(r)
+ for key in scores:
+ if scores[key]["bash"] is not None:
+ b_scores.append(scores[key]["bash"])
+ if scores[key]["lush"] is not None:
+ l_scores.append(scores[key]["lush"])
+ bash_avgs.append(sum(b_scores) / len(b_scores) if b_scores else 0.0)
+ lush_avgs.append(sum(l_scores) / len(l_scores) if l_scores else 0.0)
+
+ fig, ax = plt.subplots(figsize=(8, 4))
+ x = range(len(categories))
+ bar_w = 0.35
+ bars_b = ax.bar([i - bar_w / 2 for i in x], bash_avgs, bar_w, label="bash", color=BASH_COLOR)
+ bars_l = ax.bar([i + bar_w / 2 for i in x], lush_avgs, bar_w, label="lush", color=LUSH_COLOR)
+
+ ax.set_xticks(list(x))
+ ax.set_xticklabels(categories, fontsize=9)
+ ax.set_ylim(0, 5.5)
+ ax.set_ylabel("Avg Score (1-5)")
+ ax.set_title("Questionnaire Scores by Category")
+ ax.legend()
+ ax.grid(axis="y", alpha=0.3)
+
+ for bar in bars_b:
+ ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.08, f"{bar.get_height():.1f}",
+ ha="center", va="bottom", fontsize=8)
+ for bar in bars_l:
+ ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.08, f"{bar.get_height():.1f}",
+ ha="center", va="bottom", fontsize=8)
+
+ return _fig_to_base64(fig)
+
+
+def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
+    """Small-multiples horizontal bar charts (one per category) comparing bash vs lush on the 6 Likert dimensions; returns a list of (category, base64-PNG) tuples."""
+ import numpy as np
+ from collections import defaultdict
+
+ by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+ for r in results:
+ by_cat[r.category].append(r)
+
+ charts: list[tuple[str, str]] = []
+ labels = [label for _, label in LIKERT_QUESTIONS]
+
+ for cat in sorted(by_cat):
+ cat_results = by_cat[cat]
+ agg: dict[str, dict[str, list[float]]] = {}
+ for key, _ in LIKERT_QUESTIONS:
+ agg[key] = {"bash": [], "lush": []}
+ for r in cat_results:
+ scores = _get_likert_scores(r)
+ for key in scores:
+ for lang in ("bash", "lush"):
+ val = scores[key][lang]
+ if val is not None:
+ agg[key][lang].append(val)
+
+ bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
+ lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
+
+ fig, ax = plt.subplots(figsize=(6, 3.5))
+ y = range(len(labels))
+ bar_h = 0.35
+ ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
+ ax.barh([i - bar_h / 2 for i in y], lush_vals, bar_h, label="lush", color=LUSH_COLOR)
+ ax.set_yticks(list(y))
+ ax.set_yticklabels(labels, fontsize=8)
+ ax.set_xlim(0, 5.5)
+ ax.set_title(f"{cat}", fontsize=10)
+ ax.legend(fontsize=8, loc="lower right")
+ ax.invert_yaxis()
+ ax.grid(axis="x", alpha=0.3)
+
+ charts.append((cat, _fig_to_base64(fig)))
+
+ return charts
+
+
+def _build_per_category_summary_html(results: list[BenchmarkResult]) -> str:
+ """HTML table: rows=categories, columns=bash/lush pass rate, turns, scores."""
+ from collections import defaultdict
+
+ by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+ for r in results:
+ by_cat[r.category].append(r)
+
+ rows = []
+ for cat in sorted(by_cat):
+ cat_results = by_cat[cat]
+ b_passed = sum(1 for r in cat_results if r.bash_result and r.bash_result.all_passed)
+ l_passed = sum(1 for r in cat_results if r.lush_result and r.lush_result.all_passed)
+ b_total = sum(1 for r in cat_results if r.bash_result)
+ l_total = sum(1 for r in cat_results if r.lush_result)
+
+ b_turn_vals = [r.bash_result.agent_turns for r in cat_results if r.bash_result and r.bash_result.agent_turns > 0]
+ l_turn_vals = [r.lush_result.agent_turns for r in cat_results if r.lush_result and r.lush_result.agent_turns > 0]
+ b_turns_avg = sum(b_turn_vals) / len(b_turn_vals) if b_turn_vals else 0.0
+ l_turns_avg = sum(l_turn_vals) / len(l_turn_vals) if l_turn_vals else 0.0
+
+ b_scores: list[float] = []
+ l_scores: list[float] = []
+ for r in cat_results:
+ scores = _get_likert_scores(r)
+ for key in scores:
+ if scores[key]["bash"] is not None:
+ b_scores.append(scores[key]["bash"])
+ if scores[key]["lush"] is not None:
+ l_scores.append(scores[key]["lush"])
+ b_avg = sum(b_scores) / len(b_scores) if b_scores else 0.0
+ l_avg = sum(l_scores) / len(l_scores) if l_scores else 0.0
+
+ rows.append(f"""
+ | {html.escape(cat)} |
+ {b_passed}/{b_total} | {l_passed}/{l_total} |
+ {b_turns_avg:.1f} | {l_turns_avg:.1f} |
+ {b_avg:.1f} | {l_avg:.1f} |
+
""")
+
+ return f"""
+
+ | Category |
+ Bash Pass | Lush Pass |
+ Bash Avg Turns | Lush Avg Turns |
+ Bash Avg Score | Lush Avg Score |
+
+ {"".join(rows)}
+
"""
+
+
def _build_summary_html(results: list[BenchmarkResult]) -> str:
rows = []
for r in results:
@@ -160,7 +305,7 @@ def _build_summary_html(results: list[BenchmarkResult]) -> str:
b_turns = str(b.agent_turns) if b else "-"
l_turns = str(l.agent_turns) if l else "-"
rows.append(f"""
- | {html.escape(r.task_name)} | {r.category.upper()} |
+ {html.escape(r.task_name)} | {html.escape(r.category)} |
{b_pass} | {b_turns} |
{l_pass} | {l_turns} |
""")
@@ -215,7 +360,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
sections.append(f"""
-
{html.escape(r.task_name)} [{r.category}]
+ {html.escape(r.task_name)} [{r.category}/{r.mode}]
bash={b_status}
lush={l_status}
@@ -238,7 +383,10 @@ def export_html(results_dir: Path, output_path: Path) -> None:
chart_questionnaire = chart_questionnaire_comparison(results)
chart_turns = chart_turns_comparison(results)
chart_heatmap = chart_per_task_heatmap(results)
+ chart_cat_quest = chart_per_category_questionnaire(results)
+ cat_radar_charts = chart_per_category_radar(results)
summary_table = _build_summary_html(results)
+ cat_summary_table = _build_per_category_summary_html(results)
detail_html = _build_detail_html(results)
model = results[0].model if results else "unknown"
@@ -288,15 +436,24 @@ def export_html(results_dir: Path, output_path: Path) -> None:
Summary
{summary_table}
+Per-Category Summary
+{cat_summary_table}
+
Questionnaire Scores
-Agent Turns (Category A)
+Questionnaire Scores by Category
+
+
+Agent Turns (Solve Mode)
Score Difference Heatmap (Lush - Bash)
+Per-Category Breakdown
+{"".join(f'{cat}
' for cat, img in cat_radar_charts)}
+
Per-Task Detail
{detail_html}
diff --git a/lush_bench/models.py b/lush_bench/models.py
index c3bed68..c22b88b 100644
--- a/lush_bench/models.py
+++ b/lush_bench/models.py
@@ -36,15 +36,17 @@ class TestCase:
@dataclass
class Task:
name: str
- category: str # "a" or "b"
+ category: str # "algorithm", "pipeline", "environment", "filesystem", "process"
description: str
test_cases: list[TestCase]
- bash_source: str | None = None # category B only
+ mode: str = "solve" # "solve" or "convert"
+ bash_source: str | None = None # convert mode only
def to_dict(self) -> dict[str, Any]:
d: dict[str, Any] = {
"name": self.name,
"category": self.category,
+ "mode": self.mode,
"description": self.description,
"test_cases": [tc.to_dict() for tc in self.test_cases],
}
@@ -59,6 +61,7 @@ class Task:
category=d["category"],
description=d["description"],
test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
+ mode=d.get("mode", "solve"),
bash_source=d.get("bash_source"),
)
@@ -180,16 +183,18 @@ class QuestionnaireResponse:
class BenchmarkResult:
task_name: str
category: str
- provider: str
- model: str
- timestamp: str
- bash_result: LanguageResult | None
- lush_result: LanguageResult | None
+ mode: str = "solve" # "solve" or "convert"
+ provider: str = ""
+ model: str = ""
+ timestamp: str = ""
+ bash_result: LanguageResult | None = None
+ lush_result: LanguageResult | None = None
def to_dict(self) -> dict[str, Any]:
return {
"task_name": self.task_name,
"category": self.category,
+ "mode": self.mode,
"provider": self.provider,
"model": self.model,
"timestamp": self.timestamp,
@@ -202,6 +207,7 @@ class BenchmarkResult:
return cls(
task_name=d["task_name"],
category=d["category"],
+ mode=d.get("mode", "solve"),
provider=d["provider"],
model=d["model"],
timestamp=d["timestamp"],
diff --git a/lush_bench/report.py b/lush_bench/report.py
index 1a6a4b0..e696ce3 100644
--- a/lush_bench/report.py
+++ b/lush_bench/report.py
@@ -86,11 +86,11 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
lines.append("=" * 78)
lines.append("")
- header = f" {'Task':<22s} {'Cat':>3s} {'Bash':^14s} {'Lush':^14s}"
+ header = f" {'Task':<22s} {'Category':<12s} {'Mode':<8s} {'Bash':^14s} {'Lush':^14s}"
lines.append(header)
- sub = f" {'':<22s} {'':>3s} {'pass turns':^14s} {'pass turns':^14s}"
+ sub = f" {'':<22s} {'':<12s} {'':<8s} {'pass turns':^14s} {'pass turns':^14s}"
lines.append(sub)
- lines.append(" " + "-" * 60)
+ lines.append(" " + "-" * 74)
for r in results:
b = r.bash_result
@@ -99,7 +99,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
l_pass = "PASS" if l and l.all_passed else "FAIL" if l else "-"
b_turns = str(b.agent_turns) if b else "-"
l_turns = str(l.agent_turns) if l else "-"
- lines.append(f" {r.task_name:<22s} [{r.category}] {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
+ lines.append(f" {r.task_name:<22s} {r.category:<12s} {r.mode:<8s} {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
# Totals
b_passed = sum(1 for r in results if r.bash_result and r.bash_result.all_passed)
@@ -115,9 +115,9 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
if l_turn_counts:
l_turns_avg = sum(l_turn_counts) / len(l_turn_counts)
- lines.append(" " + "-" * 60)
- lines.append(f" {'TOTAL':<22s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
- lines.append(f" {'':27s}{'pass avg turns':^14s} {'pass avg turns':^14s}")
+ lines.append(" " + "-" * 74)
+ lines.append(f" {'TOTAL':<22s} {'':<12s} {'':<8s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
+ lines.append(f" {'':<22s} {'':<12s} {'':<8s} {'pass avg turns':^14s} {'pass avg turns':^14s}")
lines.append("")
return "\n".join(lines)
@@ -172,6 +172,101 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
return "\n".join(lines)
+def render_per_category_summary(results: list[BenchmarkResult]) -> str:
+ """Render per-category breakdown: pass rates, avg turns, avg questionnaire scores."""
+ from collections import defaultdict
+
+ lines: list[str] = []
+ lines.append("=" * 78)
+ lines.append(" PER-CATEGORY SUMMARY")
+ lines.append("=" * 78)
+ lines.append("")
+
+ # Group by category
+ by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+ for r in results:
+ by_cat[r.category].append(r)
+
+ header = f" {'Category':<12s} {'Bash pass':>9s} {'Lush pass':>9s} {'B turns':>7s} {'L turns':>7s} {'B score':>7s} {'L score':>7s}"
+ lines.append(header)
+ lines.append(" " + "-" * 70)
+
+ for cat in sorted(by_cat):
+ cat_results = by_cat[cat]
+ b_passed = sum(1 for r in cat_results if r.bash_result and r.bash_result.all_passed)
+ l_passed = sum(1 for r in cat_results if r.lush_result and r.lush_result.all_passed)
+ b_total = sum(1 for r in cat_results if r.bash_result)
+ l_total = sum(1 for r in cat_results if r.lush_result)
+
+ b_turn_vals = [r.bash_result.agent_turns for r in cat_results if r.bash_result and r.bash_result.agent_turns > 0]
+ l_turn_vals = [r.lush_result.agent_turns for r in cat_results if r.lush_result and r.lush_result.agent_turns > 0]
+ b_turns_avg = sum(b_turn_vals) / len(b_turn_vals) if b_turn_vals else 0.0
+ l_turns_avg = sum(l_turn_vals) / len(l_turn_vals) if l_turn_vals else 0.0
+
+ # Avg questionnaire scores
+ b_scores: list[float] = []
+ l_scores: list[float] = []
+ for r in cat_results:
+ scores = _get_likert_scores(r)
+ for key in scores:
+ if scores[key]["bash"] is not None:
+ b_scores.append(scores[key]["bash"])
+ if scores[key]["lush"] is not None:
+ l_scores.append(scores[key]["lush"])
+ b_avg_score = sum(b_scores) / len(b_scores) if b_scores else 0.0
+ l_avg_score = sum(l_scores) / len(l_scores) if l_scores else 0.0
+
+ lines.append(
+ f" {cat:<12s} {b_passed}/{b_total:>2d} {l_passed}/{l_total:>2d} "
+ f"{b_turns_avg:>5.1f} {l_turns_avg:>5.1f} {b_avg_score:>5.1f} {l_avg_score:>5.1f}"
+ )
+
+ lines.append("")
+ return "\n".join(lines)
+
+
+def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
+ """Render per-category Likert averages."""
+ from collections import defaultdict
+
+ lines: list[str] = []
+ lines.append("=" * 78)
+ lines.append(" PER-CATEGORY QUESTIONNAIRE AVERAGES")
+ lines.append("=" * 78)
+ lines.append("")
+
+ by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+ for r in results:
+ by_cat[r.category].append(r)
+
+ for cat in sorted(by_cat):
+ cat_results = by_cat[cat]
+ lines.append(f" {cat}")
+
+ agg: dict[str, dict[str, list[float]]] = {}
+ for key, _ in LIKERT_QUESTIONS:
+ agg[key] = {"bash": [], "lush": []}
+ for r in cat_results:
+ scores = _get_likert_scores(r)
+ for key in scores:
+ for lang in ("bash", "lush"):
+ val = scores[key][lang]
+ if val is not None:
+ agg[key][lang].append(val)
+
+ for key, label in LIKERT_QUESTIONS:
+ b_vals = agg[key]["bash"]
+ l_vals = agg[key]["lush"]
+ b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
+ l_avg = sum(l_vals) / len(l_vals) if l_vals else 0.0
+ diff = l_avg - b_avg
+ diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
+ lines.append(f" {label:<22s} bash={b_avg:.1f} lush={l_avg:.1f} ({diff_str})")
+ lines.append("")
+
+ return "\n".join(lines)
+
+
def render_per_task_detail(results: list[BenchmarkResult]) -> str:
"""Render per-task questionnaire breakdown."""
lines: list[str] = []
@@ -183,7 +278,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
lines.append("")
b_status = "PASS" if r.bash_result and r.bash_result.all_passed else "FAIL"
l_status = "PASS" if r.lush_result and r.lush_result.all_passed else "FAIL"
- lines.append(f" {r.task_name} [{r.category}] bash={b_status} lush={l_status}")
+ lines.append(f" {r.task_name} [{r.category}/{r.mode}] bash={b_status} lush={l_status}")
lines.append("")
scores = _get_likert_scores(r)
@@ -222,7 +317,9 @@ def render_report(results_dir: Path) -> str:
parts = [
render_summary_table(results),
+ render_per_category_summary(results),
render_questionnaire_comparison(results),
+ render_per_category_questionnaire(results),
render_per_task_detail(results),
]
return "\n".join(parts)
diff --git a/main.py b/main.py
index f587698..77a2db2 100644
--- a/main.py
+++ b/main.py
@@ -39,32 +39,35 @@ def load_task(path: Path) -> Task:
category=raw["category"],
description=raw["description"],
test_cases=test_cases,
+ mode=raw.get("mode", "solve"),
bash_source=raw.get("bash_source"),
)
-def find_tasks(category: str | None = None) -> list[Path]:
+def find_tasks(category: str | None = None, mode: str | None = None) -> list[Path]:
tasks_dir = Path(__file__).parent / "tasks"
paths = []
if category:
- cat_dir = tasks_dir / f"category_{category}"
+ cat_dir = tasks_dir / category
if cat_dir.exists():
paths = sorted(cat_dir.glob("*.toml"))
else:
for cat_dir in sorted(tasks_dir.iterdir()):
if cat_dir.is_dir():
paths.extend(sorted(cat_dir.glob("*.toml")))
+ if mode:
+ paths = [p for p in paths if load_task(p).mode == mode]
return paths
def cmd_list_tasks(args: argparse.Namespace) -> None:
- paths = find_tasks(args.category)
+ paths = find_tasks(args.category, getattr(args, "mode", None))
if not paths:
print("No tasks found.")
return
for p in paths:
task = load_task(p)
- print(f" [{task.category}] {task.name:20s} {p.relative_to(Path.cwd())}")
+ print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
def cmd_run(args: argparse.Namespace) -> None:
@@ -81,13 +84,13 @@ def cmd_run(args: argparse.Namespace) -> None:
provider = PROVIDERS[provider_name](provider_config)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
- print(f"Running task: {task.name} (category {task.category}) with {provider.model_name}")
+ print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
bash_result = None
lush_result = None
- if task.category == "a":
- # Category A: solve in both languages
+ if task.mode == "solve":
+ # Solve mode: agent writes code in both languages
print(" Solving in bash...")
bash_result = solve_task(provider, task, "bash", config)
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
@@ -96,9 +99,9 @@ def cmd_run(args: argparse.Namespace) -> None:
lush_result = solve_task(provider, task, "lush", config)
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
- elif task.category == "b":
- # Category B: verify provided bash source directly, then convert to lush
- assert task.bash_source, f"Category B task {task.name} missing bash_source"
+ elif task.mode == "convert":
+ # Convert mode: verify provided bash source directly, then convert to lush
+ assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
print(" Verifying provided bash source...")
test_results = evaluate(task, task.bash_source, "bash", config)
all_passed = all(tr.passed for tr in test_results)
@@ -124,6 +127,7 @@ def cmd_run(args: argparse.Namespace) -> None:
benchmark = BenchmarkResult(
task_name=task.name,
category=task.category,
+ mode=task.mode,
provider=provider_name,
model=provider.model_name,
timestamp=timestamp,
@@ -136,7 +140,7 @@ def cmd_run(args: argparse.Namespace) -> None:
def cmd_run_all(args: argparse.Namespace) -> None:
- paths = find_tasks(args.category)
+ paths = find_tasks(args.category, getattr(args, "mode", None))
if not paths:
print("No tasks found.")
return
@@ -164,7 +168,8 @@ def main() -> None:
# list-tasks
ls = sub.add_parser("list-tasks", help="List available tasks")
- ls.add_argument("--category", choices=["a", "b"], help="Filter by category")
+ ls.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
+ ls.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
ls.set_defaults(func=cmd_list_tasks)
# run
@@ -174,8 +179,9 @@ def main() -> None:
run.set_defaults(func=cmd_run)
# run-all
- ra = sub.add_parser("run-all", help="Run all tasks in a category")
- ra.add_argument("--category", choices=["a", "b"], help="Category to run")
+ ra = sub.add_parser("run-all", help="Run all tasks (optionally filtered)")
+ ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
+ ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
ra.add_argument("--provider", default="anthropic", help="LLM provider")
ra.set_defaults(func=cmd_run_all)
diff --git a/report.html b/report.html
index e81723c..79acd1a 100644
--- a/report.html
+++ b/report.html
@@ -47,53 +47,53 @@
Lush | Turns |
- | env_config | A |
+ fizzbuzz | algorithm |
+ PASS | 1 |
+ PASS | 1 |
+
+ | reverse_string | algorithm |
+ PASS | 1 |
+ PASS | 1 |
+
+ | two_sum | algorithm |
+ PASS | 1 |
+ PASS | 1 |
+
+ | env_config | environment |
PASS | 3 |
PASS | 2 |
- | file_organizer | A |
+ env_path_builder | environment |
+ PASS | 0 |
+ PASS | 1 |
+
+ | file_organizer | filesystem |
FAIL | 4 |
PASS | 1 |
- | fizzbuzz | A |
- PASS | 1 |
- PASS | 1 |
-
- | multi_file_search | A |
+ multi_file_search | filesystem |
PASS | 1 |
PASS | 3 |
- | pipeline_transform | A |
+ csv_transform | pipeline |
+ PASS | 0 |
+ PASS | 1 |
+
+ | log_parser | pipeline |
+ PASS | 0 |
+ PASS | 1 |
+
+ | pipeline_transform | pipeline |
PASS | 1 |
PASS | 2 |
- | process_exit_codes | A |
+ pipeline_word_freq | pipeline |
+ PASS | 0 |
+ PASS | 1 |
+
+ | process_exit_codes | process |
PASS | 3 |
PASS | 1 |
-
- | reverse_string | A |
- PASS | 1 |
- PASS | 1 |
-
- | two_sum | A |
- PASS | 1 |
- PASS | 1 |
-
- | csv_transform | B |
- PASS | 0 |
- PASS | 1 |
-
- | env_path_builder | B |
- PASS | 0 |
- PASS | 1 |
-
- | log_parser | B |
- PASS | 0 |
- PASS | 1 |
-
- | pipeline_word_freq | B |
- PASS | 0 |
- PASS | 1 |
| Total | |
@@ -102,47 +102,61 @@
+Per-Category Summary
+
+
+ | Category |
+ Bash Pass | Lush Pass |
+ Bash Avg Turns | Lush Avg Turns |
+ Bash Avg Score | Lush Avg Score |
+
+
+ | algorithm |
+ 3/3 | 3/3 |
+ 1.0 | 1.0 |
+ 3.6 | 4.1 |
+
+ | environment |
+ 2/2 | 2/2 |
+ 3.0 | 1.5 |
+ 3.1 | 3.7 |
+
+ | filesystem |
+ 1/2 | 2/2 |
+ 2.5 | 2.0 |
+ 3.2 | 3.9 |
+
+ | pipeline |
+ 4/4 | 4/4 |
+ 1.0 | 1.2 |
+ 3.6 | 4.0 |
+
+ | process |
+ 1/1 | 1/1 |
+ 3.0 | 1.0 |
+ 3.8 | 4.3 |
+
+
+
Questionnaire Scores
-
Agent Turns (Category A)
-
+
Questionnaire Scores by Category
+
+
+
Agent Turns (Solve Mode)
+
Score Difference Heatmap (Lush - Bash)
-
+
+
+
Per-Category Breakdown
+
algorithm
environment
filesystem
pipeline
process
Per-Task Detail
-
env_config [a]
- bash=PASS
- lush=PASS
-
-
- | Metric | Bash | Lush | Diff |
- | Readability | 3 | 4 | +1 |
| Expressiveness | 3 | 4 | +1 |
| Conciseness | 2 | 3 | +1 |
| Error handling | 2 | 3 | +1 |
| Overall preference | 3 | 4 | +1 |
| Learning curve | 2 | 4 | +2 |
-
-
bash: Bash's regex matching and array handling work well for this task, but the syntax is arcane (${!key} for indirect variable access, BASH_REMATCH array). The printf-pipe-sort pattern for array sorting is a common bash idiom but not intuitive. Error handling would require additional validation that bash doesn't make easy. Good for system administration contexts where bash is expected, but the syntax barriers make it less accessible than higher-level languages.
-
lush: Lush combines Lua's familiarity with shell-like conveniences effectively. The backtick syntax for command execution and built-in functions like lush.envset feel natural. However, the solution required more manual string processing than ideal - having to parse env output and match keys could be more elegant with better built-in environment variable introspection.
-
-
-
-
-
file_organizer [a]
- bash=FAIL
- lush=PASS
-
-
- | Metric | Bash | Lush | Diff |
- | Readability | 3 | 4 | +1 |
| Expressiveness | 2 | 5 | +3 |
| Conciseness | 2 | 4 | +2 |
| Error handling | 2 | 4 | +2 |
| Overall preference | 2 | 4 | +2 |
| Learning curve | 2 | 4 | +2 |
-
-
bash: Bash required several workarounds and awkward constructs: dynamic variable creation with eval (security risk), manual string building for output sorting, complex parameter expansion syntax, and careful escaping. While bash excels at file operations, the lack of proper data structures made this solution verbose and harder to maintain compared to higher-level languages.
-
lush: Lush's shell command interpolation with backticks and the seamless integration with Lua made this file organization task very natural to implement. The ability to execute shell commands inline while maintaining full Lua data structures for processing was particularly elegant for this type of system administration task.
-
-
-
-
-
fizzbuzz [a]
+ fizzbuzz [algorithm/solve]
bash=PASS
lush=PASS
@@ -156,49 +170,7 @@
-
multi_file_search [a]
- bash=PASS
- lush=PASS
-
-
- | Metric | Bash | Lush | Diff |
- | Readability | 4 | 4 | 0 |
| Expressiveness | 5 | 4 | -1 |
| Conciseness | 5 | 3 | -2 |
| Error handling | 3 | 4 | +1 |
| Overall preference | 5 | 4 | -1 |
| Learning curve | 3 | 3 | 0 |
-
-
bash: Bash excels at text processing and file operations like this. The pipeline approach with grep, sed, and sort feels very natural. However, the syntax can be cryptic for newcomers (especially the sort flags), and proper error handling would require additional complexity that might hurt readability.
-
lush: Lush combines Lua's simplicity with shell command integration nicely. The backtick syntax for command execution is intuitive, and the Lua-based file I/O and string processing felt natural. However, the file globbing required some manual parsing and filtering that felt a bit verbose compared to dedicated shell scripting. Overall, it strikes a good balance between scripting convenience and programming language features.
-
-
-
-
-
pipeline_transform [a]
- bash=PASS
- lush=PASS
-
-
- | Metric | Bash | Lush | Diff |
- | Readability | 4 | 4 | 0 |
| Expressiveness | 5 | 4 | -1 |
| Conciseness | 5 | 3 | -2 |
| Error handling | 3 | 4 | +1 |
| Overall preference | 5 | 4 | -1 |
| Learning curve | 3 | 4 | +1 |
-
-
bash: Bash excels at pipeline transformations like this - the pipe operator creates a natural flow that mirrors the logical steps. The regex in sed might be intimidating to newcomers, but the overall structure is intuitive. Error handling is implicit (pipes fail if any command fails) but lacks granular control.
-
lush: Lush felt very much like Lua, which made the text processing task straightforward. The pattern matching and string manipulation functions were intuitive. However, the manual deduplication logic required more verbose code than some other languages might need. Overall, it provided good control and clarity for this pipeline transformation task.
-
-
-
-
-
process_exit_codes [a]
- bash=PASS
- lush=PASS
-
-
- | Metric | Bash | Lush | Diff |
- | Readability | 4 | 5 | +1 |
| Expressiveness | 4 | 5 | +1 |
| Conciseness | 5 | 5 | 0 |
| Error handling | 3 | 3 | 0 |
| Overall preference | 4 | 4 | 0 |
| Learning curve | 3 | 4 | +1 |
-
-
bash: Bash is well-suited for this task since it naturally handles command execution and exit codes. The main challenges are bash-specific syntax quirks like the read loop condition and variable arithmetic. The use of eval introduces potential security concerns but is necessary for dynamic command execution. Overall, bash feels like the right tool for this system administration type task.
-
lush: Lush combines the simplicity of shell scripting with Lua's readability. The command execution syntax `${command}` is intuitive and the ability to access exit codes via .code is clean. The Lua-based control structures and string handling make it much more readable than pure bash while maintaining the shell's natural command execution feel.
-
-
-
-
-
reverse_string [a]
+ reverse_string [algorithm/solve]
bash=PASS
lush=PASS
@@ -212,7 +184,7 @@
-
two_sum [a]
+ two_sum [algorithm/solve]
bash=PASS
lush=PASS
@@ -226,21 +198,21 @@
-
csv_transform [b]
+ env_config [environment/solve]
bash=PASS
lush=PASS
| Metric | Bash | Lush | Diff |
- | Readability | 4 | 4 | 0 |
| Expressiveness | 4 | 4 | 0 |
| Conciseness | 5 | 5 | 0 |
| Error handling | 2 | 3 | +1 |
| Overall preference | 4 | 4 | 0 |
| Learning curve | 3 | 4 | +1 |
+ | Readability | 3 | 4 | +1 |
| Expressiveness | 3 | 4 | +1 |
| Conciseness | 2 | 3 | +1 |
| Error handling | 2 | 3 | +1 |
| Overall preference | 3 | 4 | +1 |
| Learning curve | 2 | 4 | +2 |
-
bash: Bash excels at simple text processing tasks like this CSV transformation. The IFS mechanism and read command make parsing straightforward, and the solution is very concise. However, bash's quirky syntax (like the `|| [[ -n "$name" ]]` construct for handling files without trailing newlines) can be confusing for newcomers. For more complex CSV handling with edge cases, a dedicated tool or language might be better.
-
lush: Lush's Lua-based syntax made the CSV parsing task quite natural. The pattern matching with match() worked well for simple CSV parsing, and the io functions provided clean file handling. The solution was concise and readable, though proper CSV parsing (handling quoted fields, escapes) would require more complex regex or a dedicated library.
+
bash: Bash's regex matching and array handling work well for this task, but the syntax is arcane (${!key} for indirect variable access, BASH_REMATCH array). The printf-pipe-sort pattern for array sorting is a common bash idiom but not intuitive. Error handling would require additional validation that bash doesn't make easy. Good for system administration contexts where bash is expected, but the syntax barriers make it less accessible than higher-level languages.
+
lush: Lush combines Lua's familiarity with shell-like conveniences effectively. The backtick syntax for command execution and built-in functions like lush.envset feel natural. However, the solution required more manual string processing than ideal - having to parse env output and match keys could be more elegant with better built-in environment variable introspection.
-
env_path_builder [b]
+ env_path_builder [environment/convert]
bash=PASS
lush=PASS
@@ -254,7 +226,49 @@
-
log_parser [b]
+ file_organizer [filesystem/solve]
+ bash=FAIL
+ lush=PASS
+
+
+ | Metric | Bash | Lush | Diff |
+ | Readability | 3 | 4 | +1 |
| Expressiveness | 2 | 5 | +3 |
| Conciseness | 2 | 4 | +2 |
| Error handling | 2 | 4 | +2 |
| Overall preference | 2 | 4 | +2 |
| Learning curve | 2 | 4 | +2 |
+
+
bash: Bash required several workarounds and awkward constructs: dynamic variable creation with eval (security risk), manual string building for output sorting, complex parameter expansion syntax, and careful escaping. While bash excels at file operations, the lack of proper data structures made this solution verbose and harder to maintain compared to higher-level languages.
+
lush: Lush's shell command interpolation with backticks and the seamless integration with Lua made this file organization task very natural to implement. The ability to execute shell commands inline while maintaining full Lua data structures for processing was particularly elegant for this type of system administration task.
+
+
+
+
+
multi_file_search [filesystem/solve]
+ bash=PASS
+ lush=PASS
+
+
+ | Metric | Bash | Lush | Diff |
+ | Readability | 4 | 4 | 0 |
| Expressiveness | 5 | 4 | -1 |
| Conciseness | 5 | 3 | -2 |
| Error handling | 3 | 4 | +1 |
| Overall preference | 5 | 4 | -1 |
| Learning curve | 3 | 3 | 0 |
+
+
bash: Bash excels at text processing and file operations like this. The pipeline approach with grep, sed, and sort feels very natural. However, the syntax can be cryptic for newcomers (especially the sort flags), and proper error handling would require additional complexity that might hurt readability.
+
lush: Lush combines Lua's simplicity with shell command integration nicely. The backtick syntax for command execution is intuitive, and the Lua-based file I/O and string processing felt natural. However, the file globbing required some manual parsing and filtering that felt a bit verbose compared to dedicated shell scripting. Overall, it strikes a good balance between scripting convenience and programming language features.
+
+
+
+
+
csv_transform [pipeline/convert]
+ bash=PASS
+ lush=PASS
+
+
+ | Metric | Bash | Lush | Diff |
+ | Readability | 4 | 4 | 0 |
| Expressiveness | 4 | 4 | 0 |
| Conciseness | 5 | 5 | 0 |
| Error handling | 2 | 3 | +1 |
| Overall preference | 4 | 4 | 0 |
| Learning curve | 3 | 4 | +1 |
+
+
bash: Bash excels at simple text processing tasks like this CSV transformation. The IFS mechanism and read command make parsing straightforward, and the solution is very concise. However, bash's quirky syntax (like the `|| [[ -n "$name" ]]` construct for handling files without trailing newlines) can be confusing for newcomers. For more complex CSV handling with edge cases, a dedicated tool or language might be better.
+
lush: Lush's Lua-based syntax made the CSV parsing task quite natural. The pattern matching with match() worked well for simple CSV parsing, and the io functions provided clean file handling. The solution was concise and readable, though proper CSV parsing (handling quoted fields, escapes) would require more complex regex or a dedicated library.
+
+
+
+
+
log_parser [pipeline/convert]
bash=PASS
lush=PASS
@@ -268,7 +282,21 @@
-
pipeline_word_freq [b]
+ pipeline_transform [pipeline/solve]
+ bash=PASS
+ lush=PASS
+
+
+ | Metric | Bash | Lush | Diff |
+ | Readability | 4 | 4 | 0 |
| Expressiveness | 5 | 4 | -1 |
| Conciseness | 5 | 3 | -2 |
| Error handling | 3 | 4 | +1 |
| Overall preference | 5 | 4 | -1 |
| Learning curve | 3 | 4 | +1 |
+
+
bash: Bash excels at pipeline transformations like this - the pipe operator creates a natural flow that mirrors the logical steps. The regex in sed might be intimidating to newcomers, but the overall structure is intuitive. Error handling is implicit (pipes fail if any command fails) but lacks granular control.
+
lush: Lush felt very much like Lua, which made the text processing task straightforward. The pattern matching and string manipulation functions were intuitive. However, the manual deduplication logic required more verbose code than some other languages might need. Overall, it provided good control and clarity for this pipeline transformation task.
+
+
+
+
+
pipeline_word_freq [pipeline/convert]
bash=PASS
lush=PASS
@@ -278,6 +306,20 @@
bash: Bash excels at text processing pipelines with its built-in tools like tr, grep, sort, and uniq. The solution is extremely concise but requires knowledge of Unix command-line tools and their options. The pipe-based approach is natural for this problem, but the syntax can be cryptic with options like 'tr -cs' and 'sort -k1,1rn'. The final while loop with the complex condition is necessary but makes the code less readable.
lush: Lush feels very much like Lua with clean syntax for text processing. The gmatch function for pattern matching and the flexible table structure made the word frequency counting natural. The sorting with custom comparison functions worked smoothly. Overall, it struck a good balance between simplicity and power for this text processing task.
+
+
+
+
+
process_exit_codes [process/solve]
+ bash=PASS
+ lush=PASS
+
+
+ | Metric | Bash | Lush | Diff |
+ | Readability | 4 | 5 | +1 |
| Expressiveness | 4 | 5 | +1 |
| Conciseness | 5 | 5 | 0 |
| Error handling | 3 | 3 | 0 |
| Overall preference | 4 | 4 | 0 |
| Learning curve | 3 | 4 | +1 |
+
+
bash: Bash is well-suited for this task since it naturally handles command execution and exit codes. The main challenges are bash-specific syntax quirks like the read loop condition and variable arithmetic. The use of eval introduces potential security concerns but is necessary for dynamic command execution. Overall, bash feels like the right tool for this system administration type task.
+
lush: Lush combines the simplicity of shell scripting with Lua's readability. The command execution syntax `${command}` is intuitive and the ability to access exit codes via .code is clean. The Lua-based control structures and string handling make it much more readable than pure bash while maintaining the shell's natural command execution feel.
diff --git a/tasks/category_a/fizzbuzz.toml b/tasks/algorithm/fizzbuzz.toml
similarity index 93%
rename from tasks/category_a/fizzbuzz.toml
rename to tasks/algorithm/fizzbuzz.toml
index 5250458..c5a1f77 100644
--- a/tasks/category_a/fizzbuzz.toml
+++ b/tasks/algorithm/fizzbuzz.toml
@@ -1,5 +1,6 @@
name = "fizzbuzz"
-category = "a"
+category = "algorithm"
+mode = "solve"
description = """
Read a single integer N from stdin. Print numbers from 1 to N, one per line.
For multiples of 3, print "Fizz" instead of the number.
diff --git a/tasks/category_a/reverse_string.toml b/tasks/algorithm/reverse_string.toml
similarity index 89%
rename from tasks/category_a/reverse_string.toml
rename to tasks/algorithm/reverse_string.toml
index b570d0e..4d12e87 100644
--- a/tasks/category_a/reverse_string.toml
+++ b/tasks/algorithm/reverse_string.toml
@@ -1,5 +1,6 @@
name = "reverse_string"
-category = "a"
+category = "algorithm"
+mode = "solve"
description = """
Read a single line from stdin and print it reversed to stdout.
"""
diff --git a/tasks/category_a/two_sum.toml b/tasks/algorithm/two_sum.toml
similarity index 93%
rename from tasks/category_a/two_sum.toml
rename to tasks/algorithm/two_sum.toml
index 85fd8c8..87da463 100644
--- a/tasks/category_a/two_sum.toml
+++ b/tasks/algorithm/two_sum.toml
@@ -1,5 +1,6 @@
name = "two_sum"
-category = "a"
+category = "algorithm"
+mode = "solve"
description = """
Read input from stdin. The first line contains a target integer.
The second line contains space-separated integers (the array).
diff --git a/tasks/category_a/env_config.toml b/tasks/environment/env_config.toml
similarity index 95%
rename from tasks/category_a/env_config.toml
rename to tasks/environment/env_config.toml
index c03917a..4adea23 100644
--- a/tasks/category_a/env_config.toml
+++ b/tasks/environment/env_config.toml
@@ -1,5 +1,6 @@
name = "env_config"
-category = "a"
+category = "environment"
+mode = "solve"
description = """
Read a config format from stdin where each line is "KEY=VALUE".
For each line, set an environment variable with that key and value.
diff --git a/tasks/category_b/env_path_builder.toml b/tasks/environment/env_path_builder.toml
similarity index 95%
rename from tasks/category_b/env_path_builder.toml
rename to tasks/environment/env_path_builder.toml
index 29f5d7d..3f505a6 100644
--- a/tasks/category_b/env_path_builder.toml
+++ b/tasks/environment/env_path_builder.toml
@@ -1,5 +1,6 @@
name = "env_path_builder"
-category = "b"
+category = "environment"
+mode = "convert"
description = """
Read directory paths from stdin, one per line.
Append each to the MYPATH environment variable (colon-separated), skipping duplicates.
diff --git a/tasks/category_a/file_organizer.toml b/tasks/filesystem/file_organizer.toml
similarity index 97%
rename from tasks/category_a/file_organizer.toml
rename to tasks/filesystem/file_organizer.toml
index 2e3f2e7..0337b61 100644
--- a/tasks/category_a/file_organizer.toml
+++ b/tasks/filesystem/file_organizer.toml
@@ -1,5 +1,6 @@
name = "file_organizer"
-category = "a"
+category = "filesystem"
+mode = "solve"
description = """
You are given a working directory containing several files with extensions.
Read a list of extension-to-directory mappings from stdin, one per line, in the format:
diff --git a/tasks/category_a/multi_file_search.toml b/tasks/filesystem/multi_file_search.toml
similarity index 96%
rename from tasks/category_a/multi_file_search.toml
rename to tasks/filesystem/multi_file_search.toml
index 746c251..f55533f 100644
--- a/tasks/category_a/multi_file_search.toml
+++ b/tasks/filesystem/multi_file_search.toml
@@ -1,5 +1,6 @@
name = "multi_file_search"
-category = "a"
+category = "filesystem"
+mode = "solve"
description = """
You are given a working directory containing several text files.
Read a search pattern (a simple string, not regex) from stdin.
diff --git a/tasks/category_b/csv_transform.toml b/tasks/pipeline/csv_transform.toml
similarity index 95%
rename from tasks/category_b/csv_transform.toml
rename to tasks/pipeline/csv_transform.toml
index a139f61..fdf1e85 100644
--- a/tasks/category_b/csv_transform.toml
+++ b/tasks/pipeline/csv_transform.toml
@@ -1,5 +1,6 @@
name = "csv_transform"
-category = "b"
+category = "pipeline"
+mode = "convert"
description = """
Read CSV data from stdin. The first line is a header.
Each subsequent line has fields: name,age,city
diff --git a/tasks/category_b/log_parser.toml b/tasks/pipeline/log_parser.toml
similarity index 94%
rename from tasks/category_b/log_parser.toml
rename to tasks/pipeline/log_parser.toml
index 5e268e0..5b8fa40 100644
--- a/tasks/category_b/log_parser.toml
+++ b/tasks/pipeline/log_parser.toml
@@ -1,5 +1,6 @@
name = "log_parser"
-category = "b"
+category = "pipeline"
+mode = "convert"
description = """
Read log lines from stdin. Each line has the format: "LEVEL: message"
where LEVEL is one of ERROR, WARN, INFO.
diff --git a/tasks/category_a/pipeline_transform.toml b/tasks/pipeline/pipeline_transform.toml
similarity index 95%
rename from tasks/category_a/pipeline_transform.toml
rename to tasks/pipeline/pipeline_transform.toml
index ad3c815..b6951db 100644
--- a/tasks/category_a/pipeline_transform.toml
+++ b/tasks/pipeline/pipeline_transform.toml
@@ -1,5 +1,6 @@
name = "pipeline_transform"
-category = "a"
+category = "pipeline"
+mode = "solve"
description = """
Read lines from stdin. Build a pipeline that:
1. Filters to only lines containing the word "error" (case-insensitive)
diff --git a/tasks/category_b/pipeline_word_freq.toml b/tasks/pipeline/pipeline_word_freq.toml
similarity index 96%
rename from tasks/category_b/pipeline_word_freq.toml
rename to tasks/pipeline/pipeline_word_freq.toml
index 376f3f4..b066ef4 100644
--- a/tasks/category_b/pipeline_word_freq.toml
+++ b/tasks/pipeline/pipeline_word_freq.toml
@@ -1,5 +1,6 @@
name = "pipeline_word_freq"
-category = "b"
+category = "pipeline"
+mode = "convert"
description = """
Read text from stdin. Count the frequency of each word (case-insensitive, only alphabetic characters count as words).
Print the top 5 most frequent words in descending order of frequency, in the format:
diff --git a/tasks/category_a/process_exit_codes.toml b/tasks/process/process_exit_codes.toml
similarity index 95%
rename from tasks/category_a/process_exit_codes.toml
rename to tasks/process/process_exit_codes.toml
index a3fc3e0..3dbb635 100644
--- a/tasks/category_a/process_exit_codes.toml
+++ b/tasks/process/process_exit_codes.toml
@@ -1,5 +1,6 @@
name = "process_exit_codes"
-category = "a"
+category = "process"
+mode = "solve"
description = """
Read commands from stdin, one per line. Execute each command as a subprocess.
For each command, print: "command: exit_code" where command is the original command text