Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop the free-form question. Responses are now stored as ints, not strings.
- Add a back-compat layer that maps legacy keys to the new dimensions so existing results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
104
main.py
104
main.py
@@ -2,7 +2,9 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import threading
|
||||
import tomllib
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
@@ -16,6 +18,8 @@ from lush_bench.export import export_html
|
||||
from lush_bench.report import render_report
|
||||
from lush_bench.results import save_result
|
||||
|
||||
_print_lock = threading.Lock()
|
||||
|
||||
|
||||
PROVIDERS = {
|
||||
"anthropic": AnthropicProvider,
|
||||
@@ -70,39 +74,44 @@ def cmd_list_tasks(args: argparse.Namespace) -> None:
|
||||
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
|
||||
config = Config.load()
|
||||
task_path = Path(args.task)
|
||||
def _log(msg: str) -> None:
    """Print *msg* to stdout while holding the module-level ``_print_lock``.

    Every caller goes through the same lock, so only one thread at a time
    is inside ``print``.
    """
    with _print_lock:
        print(msg)
|
||||
|
||||
|
||||
def _run_task(
|
||||
task_path: Path,
|
||||
provider_name: str,
|
||||
config: Config,
|
||||
provider: AnthropicProvider | None = None,
|
||||
) -> BenchmarkResult:
|
||||
"""Core task runner. Thread-safe — usable from cmd_run or a thread pool."""
|
||||
task = load_task(task_path)
|
||||
|
||||
provider_name = args.provider
|
||||
if provider_name not in PROVIDERS:
|
||||
print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
|
||||
sys.exit(1)
|
||||
if provider is None:
|
||||
provider_config = config.provider_configs.get(provider_name, {})
|
||||
provider = PROVIDERS[provider_name](provider_config)
|
||||
|
||||
provider_config = config.provider_configs.get(provider_name, {})
|
||||
provider = PROVIDERS[provider_name](provider_config)
|
||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||
_log(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||
|
||||
bash_result = None
|
||||
lush_result = None
|
||||
|
||||
if task.mode == "solve":
|
||||
# Solve mode: agent writes code in both languages
|
||||
print(" Solving in bash...")
|
||||
_log(f" [{task.name}] Solving in bash...")
|
||||
bash_result = solve_task(provider, task, "bash", config)
|
||||
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||
_log(f" [{task.name}] Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||
|
||||
print(" Solving in lush...")
|
||||
_log(f" [{task.name}] Solving in lush...")
|
||||
lush_result = solve_task(provider, task, "lush", config)
|
||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
|
||||
elif task.mode == "convert":
|
||||
# Convert mode: verify provided bash source directly, then convert to lush
|
||||
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
||||
print(" Verifying provided bash source...")
|
||||
_log(f" [{task.name}] Verifying provided bash source...")
|
||||
test_results = evaluate(task, task.bash_source, "bash", config)
|
||||
all_passed = all(tr.passed for tr in test_results)
|
||||
bash_result = LanguageResult(
|
||||
@@ -112,16 +121,16 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
all_passed=all_passed,
|
||||
agent_turns=0,
|
||||
)
|
||||
print(f" Bash: {'PASS' if all_passed else 'FAIL'}")
|
||||
_log(f" [{task.name}] Bash: {'PASS' if all_passed else 'FAIL'}")
|
||||
|
||||
print(" Converting to lush...")
|
||||
_log(f" [{task.name}] Converting to lush...")
|
||||
lush_result = solve_task(provider, task, "lush", config)
|
||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
_log(f" [{task.name}] Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
|
||||
# Run questionnaire for each completed language
|
||||
for lang, result in [("bash", bash_result), ("lush", lush_result)]:
|
||||
if result and result.solution_code:
|
||||
print(f" Questionnaire for {lang}...")
|
||||
_log(f" [{task.name}] Questionnaire for {lang}...")
|
||||
result.questionnaire = run_questionnaire(provider, task.name, lang, result.solution_code)
|
||||
|
||||
benchmark = BenchmarkResult(
|
||||
@@ -136,20 +145,60 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
)
|
||||
|
||||
result_dir = save_result(benchmark, config.output_dir)
|
||||
print(f" Results saved to {result_dir}")
|
||||
_log(f" [{task.name}] Results saved to {result_dir}")
|
||||
return benchmark
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
    """Run the single task named by ``args.task`` with ``args.provider``.

    Loads the configuration, checks that the requested provider is a key of
    ``PROVIDERS`` (printing the available names and exiting with status 1
    otherwise), builds the provider from its configured settings, and hands
    everything to ``_run_task``.
    """
    settings = Config.load()

    chosen = args.provider
    if chosen not in PROVIDERS:
        available = ", ".join(PROVIDERS)
        print(f"Unknown provider: {chosen}. Available: {available}")
        sys.exit(1)

    # A provider with no configured section falls back to an empty config.
    provider_settings = settings.provider_configs.get(chosen, {})
    provider_instance = PROVIDERS[chosen](provider_settings)

    _run_task(Path(args.task), chosen, settings, provider_instance)
|
||||
|
||||
|
||||
def cmd_run_all(args: argparse.Namespace) -> None:
    """Run every matching task concurrently on a thread pool.

    Tasks are selected with ``find_tasks(args.category, args.mode)``. A single
    provider instance is built once and passed to every ``_run_task`` call.
    The worker count comes from ``args.workers`` when given, otherwise from
    ``config.max_workers``.

    Exits with status 1 (after printing a message) when the provider name is
    unknown or the resolved worker count is below 1. A task that raises is
    recorded and reported in the final summary; it does not stop the other
    tasks.
    """
    config = Config.load()
    paths = find_tasks(args.category, getattr(args, "mode", None))
    if not paths:
        print("No tasks found.")
        return

    provider_name = args.provider
    if provider_name not in PROVIDERS:
        print(f"Unknown provider: {provider_name}. Available: {', '.join(PROVIDERS)}")
        sys.exit(1)

    # getattr keeps hand-built namespaces (without a ``workers`` attribute)
    # working, mirroring how ``mode`` is read above.
    requested_workers = getattr(args, "workers", None)
    max_workers = requested_workers if requested_workers is not None else config.max_workers
    # ThreadPoolExecutor raises a bare ValueError for max_workers <= 0;
    # reject it here with an actionable message before doing any work.
    if max_workers < 1:
        print(f"Invalid worker count: {max_workers}. Must be at least 1.")
        sys.exit(1)

    # Share one provider instance across threads (its rate limiter is thread-safe)
    provider_config = config.provider_configs.get(provider_name, {})
    provider = PROVIDERS[provider_name](provider_config)

    print(f"Running {len(paths)} tasks with {max_workers} workers")

    failed: list[str] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its task path so failures can be named.
        futures = {
            pool.submit(_run_task, p, provider_name, config, provider): p
            for p in paths
        }
        for future in as_completed(futures):
            task_path = futures[future]
            try:
                future.result()
            except Exception as exc:
                # Top-level boundary: one failing task must not abort the
                # rest of the run, so record it and keep draining futures.
                task_name = task_path.stem
                failed.append(task_name)
                _log(f"  [{task_name}] FAILED: {exc}")

    print(f"\nDone. {len(paths) - len(failed)}/{len(paths)} succeeded.")
    if failed:
        print(f"Failed: {', '.join(failed)}")
|
||||
|
||||
|
||||
def cmd_report(args: argparse.Namespace) -> None:
|
||||
@@ -183,6 +232,7 @@ def main() -> None:
|
||||
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
||||
ra.add_argument("--workers", type=int, default=None, help="Max parallel tasks (default: from config, typically 4)")
|
||||
ra.set_defaults(func=cmd_run_all)
|
||||
|
||||
# report
|
||||
|
||||
Reference in New Issue
Block a user