Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
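Tasks are TOML files parsed by load_task in main.py below. No task file appears in this diff, so the snippet that follows is only a sketch of the expected schema: the key names (name, category, description, test_cases with stdin / expected_stdout / env / setup_files / expected_files, plus the optional bash_source used by Category B) come from the loader, while the word_count task and its values are invented for illustration.

# Hypothetical Category A task definition, parsed the same way load_task does.
# Only the key names are taken from main.py; the task name and values are made up.
import tomllib

SAMPLE_TASK = """
name = "word_count"
category = "a"
description = "Read text on stdin and print the total number of words."

[[test_cases]]
stdin = "hello world\\nfoo bar baz\\n"
expected_stdout = "5\\n"
"""

raw = tomllib.loads(SAMPLE_TASK)
print(raw["name"], len(raw["test_cases"]))  # -> word_count 1

Such a file would presumably be run with the run subcommand defined below, e.g. python main.py run --task tasks/category_a/word_count.toml --provider anthropic (the path is hypothetical; find_tasks looks for task files under tasks/category_a and tasks/category_b).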
main.py (new file, 198 lines)
@@ -0,0 +1,198 @@
from __future__ import annotations

import argparse
import sys
import tomllib
from datetime import datetime, timezone
from pathlib import Path

from lush_bench.agent import solve_task
from lush_bench.config import Config
from lush_bench.harness import evaluate
from lush_bench.models import BenchmarkResult, LanguageResult, Task, TestCase
from lush_bench.providers.anthropic import AnthropicProvider
from lush_bench.questionnaire import run_questionnaire
from lush_bench.export import export_html
from lush_bench.report import render_report
from lush_bench.results import save_result


PROVIDERS = {
    "anthropic": AnthropicProvider,
}


def load_task(path: Path) -> Task:
    raw = tomllib.loads(path.read_text())
    test_cases = [
        TestCase(
            stdin=tc.get("stdin", ""),
            expected_stdout=tc.get("expected_stdout", ""),
            env=tc.get("env", {}),
            setup_files=tc.get("setup_files", {}),
            expected_files=tc.get("expected_files", {}),
        )
        for tc in raw["test_cases"]
    ]
    return Task(
        name=raw["name"],
        category=raw["category"],
        description=raw["description"],
        test_cases=test_cases,
        bash_source=raw.get("bash_source"),
    )


def find_tasks(category: str | None = None) -> list[Path]:
    tasks_dir = Path(__file__).parent / "tasks"
    paths = []
    if category:
        cat_dir = tasks_dir / f"category_{category}"
        if cat_dir.exists():
            paths = sorted(cat_dir.glob("*.toml"))
    else:
        for cat_dir in sorted(tasks_dir.iterdir()):
            if cat_dir.is_dir():
                paths.extend(sorted(cat_dir.glob("*.toml")))
    return paths


def cmd_list_tasks(args: argparse.Namespace) -> None:
    paths = find_tasks(args.category)
    if not paths:
        print("No tasks found.")
        return
    for p in paths:
        task = load_task(p)
        print(f" [{task.category}] {task.name:20s} {p.relative_to(Path.cwd())}")


def cmd_run(args: argparse.Namespace) -> None:
    config = Config.load()
    task_path = Path(args.task)
    task = load_task(task_path)

    provider_name = args.provider
    if provider_name not in PROVIDERS:
        print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)

    provider_config = config.provider_configs.get(provider_name, {})
    provider = PROVIDERS[provider_name](provider_config)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    print(f"Running task: {task.name} (category {task.category}) with {provider.model_name}")

    bash_result = None
    lush_result = None

    if task.category == "a":
        # Category A: solve in both languages
        print(" Solving in bash...")
        bash_result = solve_task(provider, task, "bash", config)
        print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")

        print(" Solving in lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")

    elif task.category == "b":
        # Category B: verify provided bash source directly, then convert to lush
        assert task.bash_source, f"Category B task {task.name} missing bash_source"
        print(" Verifying provided bash source...")
        test_results = evaluate(task, task.bash_source, "bash", config)
        all_passed = all(tr.passed for tr in test_results)
        bash_result = LanguageResult(
            language="bash",
            solution_code=task.bash_source,
            test_results=test_results,
            all_passed=all_passed,
            agent_turns=0,
        )
        print(f" Bash: {'PASS' if all_passed else 'FAIL'}")

        print(" Converting to lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")

    # Run questionnaire for each completed language
    for lang, result in [("bash", bash_result), ("lush", lush_result)]:
        if result and result.solution_code:
            print(f" Questionnaire for {lang}...")
            result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)

    benchmark = BenchmarkResult(
        task_name=task.name,
        category=task.category,
        provider=provider_name,
        model=provider.model_name,
        timestamp=timestamp,
        bash_result=bash_result,
        lush_result=lush_result,
    )

    result_dir = save_result(benchmark, config.output_dir)
    print(f" Results saved to {result_dir}")


def cmd_run_all(args: argparse.Namespace) -> None:
    paths = find_tasks(args.category)
    if not paths:
        print("No tasks found.")
        return

    for p in paths:
        # Reuse cmd_run by constructing a namespace
        run_args = argparse.Namespace(task=str(p), provider=args.provider)
        cmd_run(run_args)
        print()


def cmd_report(args: argparse.Namespace) -> None:
    print(render_report(Path(args.results_dir)))


def cmd_export(args: argparse.Namespace) -> None:
    output = Path(args.output)
    export_html(Path(args.results_dir), output)
    print(f"Report exported to {output}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Lush vs Bash AI Benchmarking")
    sub = parser.add_subparsers(dest="command", required=True)

    # list-tasks
    ls = sub.add_parser("list-tasks", help="List available tasks")
    ls.add_argument("--category", choices=["a", "b"], help="Filter by category")
    ls.set_defaults(func=cmd_list_tasks)

    # run
    run = sub.add_parser("run", help="Run a single task")
    run.add_argument("--task", required=True, help="Path to task TOML file")
    run.add_argument("--provider", default="anthropic", help="LLM provider")
    run.set_defaults(func=cmd_run)

    # run-all
    ra = sub.add_parser("run-all", help="Run all tasks in a category")
    ra.add_argument("--category", choices=["a", "b"], help="Category to run")
    ra.add_argument("--provider", default="anthropic", help="LLM provider")
    ra.set_defaults(func=cmd_run_all)

    # report
    rpt = sub.add_parser("report", help="Show results report in terminal")
    rpt.add_argument("--results-dir", default="results", help="Results directory")
    rpt.set_defaults(func=cmd_report)

    # export
    exp = sub.add_parser("export", help="Export HTML report with charts")
    exp.add_argument("--results-dir", default="results", help="Results directory")
    exp.add_argument("--output", "-o", default="report.html", help="Output HTML file")
    exp.set_defaults(func=cmd_export)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()