Reorganize task categories from opaque a/b to descriptive names
Replace the category_a/category_b directories with algorithm, pipeline, environment, filesystem, and process. Add a separate mode field (solve/convert) to decouple orchestration from capability grouping. Add per-category summary and questionnaire breakdowns to both the terminal report and the HTML export.
This commit is contained in:
34
main.py
34
main.py
@@ -39,32 +39,35 @@ def load_task(path: Path) -> Task:
|
||||
category=raw["category"],
|
||||
description=raw["description"],
|
||||
test_cases=test_cases,
|
||||
mode=raw.get("mode", "solve"),
|
||||
bash_source=raw.get("bash_source"),
|
||||
)
|
||||
|
||||
|
||||
def find_tasks(category: str | None = None) -> list[Path]:
|
||||
def find_tasks(category: str | None = None, mode: str | None = None) -> list[Path]:
|
||||
tasks_dir = Path(__file__).parent / "tasks"
|
||||
paths = []
|
||||
if category:
|
||||
cat_dir = tasks_dir / f"category_{category}"
|
||||
cat_dir = tasks_dir / category
|
||||
if cat_dir.exists():
|
||||
paths = sorted(cat_dir.glob("*.toml"))
|
||||
else:
|
||||
for cat_dir in sorted(tasks_dir.iterdir()):
|
||||
if cat_dir.is_dir():
|
||||
paths.extend(sorted(cat_dir.glob("*.toml")))
|
||||
if mode:
|
||||
paths = [p for p in paths if load_task(p).mode == mode]
|
||||
return paths
|
||||
|
||||
|
||||
def cmd_list_tasks(args: argparse.Namespace) -> None:
|
||||
paths = find_tasks(args.category)
|
||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||
if not paths:
|
||||
print("No tasks found.")
|
||||
return
|
||||
for p in paths:
|
||||
task = load_task(p)
|
||||
print(f" [{task.category}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
|
||||
@@ -81,13 +84,13 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
provider = PROVIDERS[provider_name](provider_config)
|
||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
print(f"Running task: {task.name} (category {task.category}) with {provider.model_name}")
|
||||
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||
|
||||
bash_result = None
|
||||
lush_result = None
|
||||
|
||||
if task.category == "a":
|
||||
# Category A: solve in both languages
|
||||
if task.mode == "solve":
|
||||
# Solve mode: agent writes code in both languages
|
||||
print(" Solving in bash...")
|
||||
bash_result = solve_task(provider, task, "bash", config)
|
||||
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||
@@ -96,9 +99,9 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
lush_result = solve_task(provider, task, "lush", config)
|
||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
|
||||
elif task.category == "b":
|
||||
# Category B: verify provided bash source directly, then convert to lush
|
||||
assert task.bash_source, f"Category B task {task.name} missing bash_source"
|
||||
elif task.mode == "convert":
|
||||
# Convert mode: verify provided bash source directly, then convert to lush
|
||||
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
||||
print(" Verifying provided bash source...")
|
||||
test_results = evaluate(task, task.bash_source, "bash", config)
|
||||
all_passed = all(tr.passed for tr in test_results)
|
||||
@@ -124,6 +127,7 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
benchmark = BenchmarkResult(
|
||||
task_name=task.name,
|
||||
category=task.category,
|
||||
mode=task.mode,
|
||||
provider=provider_name,
|
||||
model=provider.model_name,
|
||||
timestamp=timestamp,
|
||||
@@ -136,7 +140,7 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
|
||||
|
||||
def cmd_run_all(args: argparse.Namespace) -> None:
|
||||
paths = find_tasks(args.category)
|
||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||
if not paths:
|
||||
print("No tasks found.")
|
||||
return
|
||||
@@ -164,7 +168,8 @@ def main() -> None:
|
||||
|
||||
# list-tasks
|
||||
ls = sub.add_parser("list-tasks", help="List available tasks")
|
||||
ls.add_argument("--category", choices=["a", "b"], help="Filter by category")
|
||||
ls.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||
ls.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||
ls.set_defaults(func=cmd_list_tasks)
|
||||
|
||||
# run
|
||||
@@ -174,8 +179,9 @@ def main() -> None:
|
||||
run.set_defaults(func=cmd_run)
|
||||
|
||||
# run-all
|
||||
ra = sub.add_parser("run-all", help="Run all tasks in a category")
|
||||
ra.add_argument("--category", choices=["a", "b"], help="Category to run")
|
||||
ra = sub.add_parser("run-all", help="Run all tasks (optionally filtered)")
|
||||
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
||||
ra.set_defaults(func=cmd_run_all)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user