Files
lush_grading/main.py
Cormac Shannon 20e62f60f6 Reorganize task categories from opaque a/b to descriptive names
Replace category_a/category_b directories with algorithm, pipeline,
environment, filesystem, and process. Add separate mode field (solve/convert)
to decouple orchestration from capability grouping. Add per-category
summary and questionnaire breakdowns to both terminal report and HTML export.
2026-03-29 20:59:01 +01:00

205 lines
7.2 KiB
Python

from __future__ import annotations
import argparse
import sys
import tomllib
from datetime import datetime, timezone
from pathlib import Path
from lush_bench.agent import solve_task
from lush_bench.config import Config
from lush_bench.harness import evaluate
from lush_bench.models import BenchmarkResult, LanguageResult, Task, TestCase
from lush_bench.providers.anthropic import AnthropicProvider
from lush_bench.questionnaire import run_questionnaire
from lush_bench.export import export_html
from lush_bench.report import render_report
from lush_bench.results import save_result
# Registry mapping CLI --provider names to provider classes.
# Add new LLM backends here to make them selectable from the command line.
PROVIDERS = {
    "anthropic": AnthropicProvider,
}
def load_task(path: Path) -> Task:
    """Parse a task definition TOML file into a Task object.

    Missing optional keys fall back to sensible defaults: empty
    stdin/stdout/env/files per test case, mode "solve", and no
    bash_source.
    """
    raw = tomllib.loads(path.read_text())
    cases = []
    for entry in raw["test_cases"]:
        cases.append(
            TestCase(
                stdin=entry.get("stdin", ""),
                expected_stdout=entry.get("expected_stdout", ""),
                env=entry.get("env", {}),
                setup_files=entry.get("setup_files", {}),
                expected_files=entry.get("expected_files", {}),
            )
        )
    return Task(
        name=raw["name"],
        category=raw["category"],
        description=raw["description"],
        test_cases=cases,
        mode=raw.get("mode", "solve"),
        bash_source=raw.get("bash_source"),
    )
def find_tasks(category: str | None = None, mode: str | None = None) -> list[Path]:
    """Collect task TOML paths next to this module, with optional filters.

    category limits the search to one subdirectory of tasks/; mode keeps
    only tasks whose parsed mode matches. Results are sorted per directory.
    """
    tasks_dir = Path(__file__).parent / "tasks"
    if category:
        cat_dir = tasks_dir / category
        found = sorted(cat_dir.glob("*.toml")) if cat_dir.exists() else []
    else:
        found = []
        for sub in sorted(tasks_dir.iterdir()):
            if sub.is_dir():
                found.extend(sorted(sub.glob("*.toml")))
    if mode:
        # Mode lives inside the TOML body, so each candidate must be parsed.
        found = [path for path in found if load_task(path).mode == mode]
    return found
def cmd_list_tasks(args: argparse.Namespace) -> None:
    """Print every available task, optionally filtered by category/mode.

    Each line shows the task's category, mode, name, and the path to its
    TOML definition.
    """
    paths = find_tasks(args.category, getattr(args, "mode", None))
    if not paths:
        print("No tasks found.")
        return
    cwd = Path.cwd()
    for p in paths:
        task = load_task(p)
        # Tasks are anchored at this module's location, which may not be
        # under the current working directory; Path.relative_to raises
        # ValueError in that case, so fall back to the full path.
        try:
            shown = p.relative_to(cwd)
        except ValueError:
            shown = p
        print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {shown}")
def cmd_run(args: argparse.Namespace) -> None:
    """Run a single benchmark task and persist the result.

    args.task is the path to a task TOML file; args.provider selects an
    entry from PROVIDERS. In "solve" mode the agent writes solutions in
    both bash and lush; in "convert" mode the supplied bash source is
    verified directly and the agent only produces the lush version.
    Exits with status 1 on an unknown provider or unrecognized task mode.
    """
    config = Config.load()
    task_path = Path(args.task)
    task = load_task(task_path)
    provider_name = args.provider
    if provider_name not in PROVIDERS:
        print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)
    provider_config = config.provider_configs.get(provider_name, {})
    provider = PROVIDERS[provider_name](provider_config)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
    bash_result = None
    lush_result = None
    if task.mode == "solve":
        # Solve mode: agent writes code in both languages
        print(" Solving in bash...")
        bash_result = solve_task(provider, task, "bash", config)
        print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
        print(" Solving in lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
    elif task.mode == "convert":
        # Convert mode: verify provided bash source directly, then convert to lush
        assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
        print(" Verifying provided bash source...")
        test_results = evaluate(task, task.bash_source, "bash", config)
        all_passed = all(tr.passed for tr in test_results)
        bash_result = LanguageResult(
            language="bash",
            solution_code=task.bash_source,
            test_results=test_results,
            all_passed=all_passed,
            agent_turns=0,  # no agent involved in verifying the reference source
        )
        print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
        print(" Converting to lush...")
        lush_result = solve_task(provider, task, "lush", config)
        print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
    else:
        # Fail loudly rather than silently saving a result with no language runs.
        print(f"Unknown task mode: {task.mode!r} (expected 'solve' or 'convert')")
        sys.exit(1)
    # Run questionnaire for each completed language
    for lang, result in [("bash", bash_result), ("lush", lush_result)]:
        if result and result.solution_code:
            print(f" Questionnaire for {lang}...")
            result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
    benchmark = BenchmarkResult(
        task_name=task.name,
        category=task.category,
        mode=task.mode,
        provider=provider_name,
        model=provider.model_name,
        timestamp=timestamp,
        bash_result=bash_result,
        lush_result=lush_result,
    )
    result_dir = save_result(benchmark, config.output_dir)
    print(f" Results saved to {result_dir}")
def cmd_run_all(args: argparse.Namespace) -> None:
    """Execute every matching task sequentially, one cmd_run per task."""
    task_paths = find_tasks(args.category, getattr(args, "mode", None))
    if not task_paths:
        print("No tasks found.")
        return
    for task_path in task_paths:
        # cmd_run expects its own namespace; build a minimal one per task.
        run_args = argparse.Namespace(task=str(task_path), provider=args.provider)
        cmd_run(run_args)
        print()
def cmd_report(args: argparse.Namespace) -> None:
    """Render the terminal results report for the chosen results directory."""
    results_dir = Path(args.results_dir)
    print(render_report(results_dir))
def cmd_export(args: argparse.Namespace) -> None:
    """Export the HTML report (with charts) to the requested output file."""
    out_path = Path(args.output)
    export_html(Path(args.results_dir), out_path)
    print(f"Report exported to {out_path}")
def main() -> None:
    """CLI entry point: build the argument parser and dispatch a subcommand."""
    # Shared choice lists, hoisted so list-tasks and run-all stay in sync.
    categories = ["algorithm", "pipeline", "environment", "filesystem", "process"]
    modes = ["solve", "convert"]

    parser = argparse.ArgumentParser(description="Lush vs Bash AI Benchmarking")
    commands = parser.add_subparsers(dest="command", required=True)

    # list-tasks
    list_cmd = commands.add_parser("list-tasks", help="List available tasks")
    list_cmd.add_argument("--category", choices=categories, help="Filter by category")
    list_cmd.add_argument("--mode", choices=modes, help="Filter by mode")
    list_cmd.set_defaults(func=cmd_list_tasks)

    # run
    run_cmd = commands.add_parser("run", help="Run a single task")
    run_cmd.add_argument("--task", required=True, help="Path to task TOML file")
    run_cmd.add_argument("--provider", default="anthropic", help="LLM provider")
    run_cmd.set_defaults(func=cmd_run)

    # run-all
    all_cmd = commands.add_parser("run-all", help="Run all tasks (optionally filtered)")
    all_cmd.add_argument("--category", choices=categories, help="Filter by category")
    all_cmd.add_argument("--mode", choices=modes, help="Filter by mode")
    all_cmd.add_argument("--provider", default="anthropic", help="LLM provider")
    all_cmd.set_defaults(func=cmd_run_all)

    # report
    report_cmd = commands.add_parser("report", help="Show results report in terminal")
    report_cmd.add_argument("--results-dir", default="results", help="Results directory")
    report_cmd.set_defaults(func=cmd_report)

    # export
    export_cmd = commands.add_parser("export", help="Export HTML report with charts")
    export_cmd.add_argument("--results-dir", default="results", help="Results directory")
    export_cmd.add_argument("--output", "-o", default="report.html", help="Output HTML file")
    export_cmd.set_defaults(func=cmd_export)

    parsed = parser.parse_args()
    parsed.func(parsed)
if __name__ == "__main__":
main()