Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via an Anthropic Claude provider
- Test harness that executes solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code-quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
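The module below persists each BenchmarkResult to disk. For context, here is a minimal sketch of the model shape these helpers imply; the class and field names are inferred from how the code uses them, and the real definitions in .models may differ:

```python
# Hypothetical sketch only: inferred from how save_result/load_result use the
# model; the actual classes live in .models and may be defined differently.
from __future__ import annotations

from dataclasses import asdict, dataclass
from typing import Any, Optional


@dataclass
class LanguageResult:
    solution_code: str = ""   # final code produced by the agent for one language
    passed: bool = False      # assumed field: whether the test harness accepted it


@dataclass
class BenchmarkResult:
    task_name: str
    provider: str
    timestamp: str            # used as the directory-name prefix when saving
    bash_result: Optional[LanguageResult] = None
    lush_result: Optional[LanguageResult] = None

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "BenchmarkResult":
        bash = data.get("bash_result")
        lush = data.get("lush_result")
        return cls(
            task_name=data["task_name"],
            provider=data["provider"],
            timestamp=data["timestamp"],
            bash_result=LanguageResult(**bash) if bash else None,
            lush_result=LanguageResult(**lush) if lush else None,
        )
```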
from __future__ import annotations

import json
from pathlib import Path

from .models import BenchmarkResult


def save_result(result: BenchmarkResult, output_dir: Path) -> Path:
    """Write a benchmark result to its own timestamped directory and return that directory."""
    dir_name = f"{result.timestamp}_{result.task_name}_{result.provider}"
    result_dir = output_dir / dir_name
    result_dir.mkdir(parents=True, exist_ok=True)

    # Save the full result as JSON
    with open(result_dir / "result.json", "w") as f:
        json.dump(result.to_dict(), f, indent=2)

    # Save solution files, if the agent produced any
    if result.bash_result and result.bash_result.solution_code:
        (result_dir / "solution.sh").write_text(result.bash_result.solution_code)
    if result.lush_result and result.lush_result.solution_code:
        (result_dir / "solution.lua").write_text(result.lush_result.solution_code)

    return result_dir


def load_result(path: Path) -> BenchmarkResult:
    """Load a result previously written by save_result from the given directory."""
    with open(path / "result.json") as f:
        return BenchmarkResult.from_dict(json.load(f))
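A round-trip usage sketch, assuming the helpers above and a model like the one sketched earlier are importable together (the task name, timestamp, provider, and solution code are made-up example values, not taken from the benchmark's task set):

```python
# Illustrative only: values are invented, and BenchmarkResult/LanguageResult
# refer to the hypothetical sketch above, not necessarily the real .models.
from pathlib import Path

result = BenchmarkResult(
    task_name="parse_logs",
    provider="claude",
    timestamp="20240101T120000",
    bash_result=LanguageResult(solution_code="#!/usr/bin/env bash\necho ok\n", passed=True),
)

# Writes results/20240101T120000_parse_logs_claude/{result.json, solution.sh}
saved_dir = save_result(result, Path("results"))

# Reload the same result later, e.g. for the report or export commands
reloaded = load_result(saved_dir)
assert reloaded.task_name == result.task_name
```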