Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
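Tasks are TOML files parsed by load_task in main.py below. No task file appears in this diff, so the snippet that follows is only a sketch of the expected schema: the key names (name, category, description, test_cases with stdin / expected_stdout / env / setup_files / expected_files, plus the optional bash_source used by Category B) come from the loader, while the word_count task and its values are invented for illustration.

# Hypothetical Category A task definition, parsed the same way load_task does.
# Only the key names are taken from main.py; the task name and values are made up.
import tomllib

SAMPLE_TASK = """
name = "word_count"
category = "a"
description = "Read text on stdin and print the total number of words."

[[test_cases]]
stdin = "hello world\\nfoo bar baz\\n"
expected_stdout = "5\\n"
"""

raw = tomllib.loads(SAMPLE_TASK)
print(raw["name"], len(raw["test_cases"]))  # -> word_count 1

Such a file would presumably be run with the run subcommand defined below, e.g. python main.py run --task tasks/category_a/word_count.toml --provider anthropic (the path is hypothetical; find_tasks looks for task files under tasks/category_a and tasks/category_b).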
main.py (new file, 198 lines)
@@ -0,0 +1,198 @@
from __future__ import annotations

import argparse
import sys
import tomllib
from datetime import datetime, timezone
from pathlib import Path

from lush_bench.agent import solve_task
from lush_bench.config import Config
from lush_bench.harness import evaluate
from lush_bench.models import BenchmarkResult, LanguageResult, Task, TestCase
from lush_bench.providers.anthropic import AnthropicProvider
from lush_bench.questionnaire import run_questionnaire
from lush_bench.export import export_html
from lush_bench.report import render_report
from lush_bench.results import save_result


PROVIDERS = {
    "anthropic": AnthropicProvider,
}


def load_task(path: Path) -> Task:
    raw = tomllib.loads(path.read_text())
    test_cases = [
        TestCase(
            stdin=tc.get("stdin", ""),
            expected_stdout=tc.get("expected_stdout", ""),
            env=tc.get("env", {}),
            setup_files=tc.get("setup_files", {}),
            expected_files=tc.get("expected_files", {}),
        )
        for tc in raw["test_cases"]
    ]
    return Task(
        name=raw["name"],
        category=raw["category"],
        description=raw["description"],
        test_cases=test_cases,
        bash_source=raw.get("bash_source"),
    )


def find_tasks(category: str | None = None) -> list[Path]:
    tasks_dir = Path(__file__).parent / "tasks"
    paths = []
    if category:
        cat_dir = tasks_dir / f"category_{category}"
        if cat_dir.exists():
            paths = sorted(cat_dir.glob("*.toml"))
    else:
        for cat_dir in sorted(tasks_dir.iterdir()):
            if cat_dir.is_dir():
                paths.extend(sorted(cat_dir.glob("*.toml")))
    return paths


def cmd_list_tasks(args: argparse.Namespace) -> None:
    paths = find_tasks(args.category)
    if not paths:
        print("No tasks found.")
        return
    for p in paths:
        task = load_task(p)
        print(f" [{task.category}] {task.name:20s} {p.relative_to(Path.cwd())}")


def cmd_run(args: argparse.Namespace) -> None:
    config = Config.load()
    task_path = Path(args.task)
    task = load_task(task_path)

    provider_name = args.provider
    if provider_name not in PROVIDERS:
        print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)

    provider_config = config.provider_configs.get(provider_name, {})
    provider = PROVIDERS[provider_name](provider_config)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    print(f"Running task: {task.name} (category {task.category}) with {provider.model_name}")

    bash_result = None
    lush_result = None

    if task.category == "a":
        # Category A: solve in both languages
        print(" Solving in bash...")
        bash_result = solve_task(provider, task, "bash", config)
        print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")

        print(" Solving in lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")

    elif task.category == "b":
        # Category B: verify provided bash source directly, then convert to lush
        assert task.bash_source, f"Category B task {task.name} missing bash_source"
        print(" Verifying provided bash source...")
        test_results = evaluate(task, task.bash_source, "bash", config)
        all_passed = all(tr.passed for tr in test_results)
        bash_result = LanguageResult(
            language="bash",
            solution_code=task.bash_source,
            test_results=test_results,
            all_passed=all_passed,
            agent_turns=0,
        )
        print(f" Bash: {'PASS' if all_passed else 'FAIL'}")

        print(" Converting to lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")

    # Run questionnaire for each completed language
    for lang, result in [("bash", bash_result), ("lush", lush_result)]:
        if result and result.solution_code:
            print(f" Questionnaire for {lang}...")
            result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)

    benchmark = BenchmarkResult(
        task_name=task.name,
        category=task.category,
        provider=provider_name,
        model=provider.model_name,
        timestamp=timestamp,
        bash_result=bash_result,
        lush_result=lush_result,
    )

    result_dir = save_result(benchmark, config.output_dir)
    print(f" Results saved to {result_dir}")


def cmd_run_all(args: argparse.Namespace) -> None:
    paths = find_tasks(args.category)
    if not paths:
        print("No tasks found.")
        return

    for p in paths:
        # Reuse cmd_run by constructing a namespace
        run_args = argparse.Namespace(task=str(p), provider=args.provider)
        cmd_run(run_args)
        print()


def cmd_report(args: argparse.Namespace) -> None:
    print(render_report(Path(args.results_dir)))


def cmd_export(args: argparse.Namespace) -> None:
    output = Path(args.output)
    export_html(Path(args.results_dir), output)
    print(f"Report exported to {output}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Lush vs Bash AI Benchmarking")
    sub = parser.add_subparsers(dest="command", required=True)

    # list-tasks
    ls = sub.add_parser("list-tasks", help="List available tasks")
    ls.add_argument("--category", choices=["a", "b"], help="Filter by category")
    ls.set_defaults(func=cmd_list_tasks)

    # run
    run = sub.add_parser("run", help="Run a single task")
    run.add_argument("--task", required=True, help="Path to task TOML file")
    run.add_argument("--provider", default="anthropic", help="LLM provider")
    run.set_defaults(func=cmd_run)

    # run-all
    ra = sub.add_parser("run-all", help="Run all tasks in a category")
    ra.add_argument("--category", choices=["a", "b"], help="Category to run")
    ra.add_argument("--provider", default="anthropic", help="LLM provider")
    ra.set_defaults(func=cmd_run_all)

    # report
    rpt = sub.add_parser("report", help="Show results report in terminal")
    rpt.add_argument("--results-dir", default="results", help="Results directory")
    rpt.set_defaults(func=cmd_report)

    # export
    exp = sub.add_parser("export", help="Export HTML report with charts")
    exp.add_argument("--results-dir", default="results", help="Results directory")
    exp.add_argument("--output", "-o", default="report.html", help="Output HTML file")
    exp.set_defaults(func=cmd_export)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()