lush_grading/lush_bench/results.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00
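
The module below persists BenchmarkResult objects defined in lush_grading/lush_bench/models.py (not shown here). As a minimal sketch, here is the shape of that model inferred purely from the attribute accesses in results.py; the class and field names beyond those accesses are hypothetical, and the real model likely carries more state (scores, transcripts, questionnaire answers):

from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class LanguageResult:
    # Hypothetical shape: results.py only ever reads solution_code.
    solution_code: Optional[str] = None

@dataclass
class BenchmarkResult:
    # Fields results.py reads when naming the output directory.
    timestamp: str
    task_name: str
    provider: str
    bash_result: Optional[LanguageResult] = None
    lush_result: Optional[LanguageResult] = None

    def to_dict(self) -> dict:
        # asdict recurses into nested dataclasses.
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "BenchmarkResult":
        # Sketch only: the real implementation may rebuild more nested state.
        def lang(d: Optional[dict]) -> Optional[LanguageResult]:
            return LanguageResult(**d) if d else None
        return cls(
            timestamp=data["timestamp"],
            task_name=data["task_name"],
            provider=data["provider"],
            bash_result=lang(data.get("bash_result")),
            lush_result=lang(data.get("lush_result")),
        )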


from __future__ import annotations

import json
from pathlib import Path

from .models import BenchmarkResult


def save_result(result: BenchmarkResult, output_dir: Path) -> Path:
    """Persist a BenchmarkResult to its own timestamped directory."""
    dir_name = f"{result.timestamp}_{result.task_name}_{result.provider}"
    result_dir = output_dir / dir_name
    result_dir.mkdir(parents=True, exist_ok=True)

    # Save the full result as JSON
    with open(result_dir / "result.json", "w") as f:
        json.dump(result.to_dict(), f, indent=2)

    # Save the generated solution files, when present
    if result.bash_result and result.bash_result.solution_code:
        (result_dir / "solution.sh").write_text(result.bash_result.solution_code)
    if result.lush_result and result.lush_result.solution_code:
        (result_dir / "solution.lua").write_text(result.lush_result.solution_code)

    return result_dir


def load_result(path: Path) -> BenchmarkResult:
    """Load a BenchmarkResult previously written by save_result."""
    with open(path / "result.json") as f:
        return BenchmarkResult.from_dict(json.load(f))
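
A round-trip usage sketch, assuming the hypothetical BenchmarkResult/LanguageResult shapes above; the real constructor signature and field values may differ:

from pathlib import Path

result = BenchmarkResult(
    timestamp="2026-03-29T17-56-30",
    task_name="rotate-logs",  # hypothetical task name
    provider="anthropic",
    bash_result=LanguageResult(solution_code="#!/usr/bin/env bash\necho ok\n"),
)

out_dir = save_result(result, Path("results"))
# -> results/2026-03-29T17-56-30_rotate-logs_anthropic/ containing
#    result.json and solution.sh (no solution.lua, since lush_result is None)
loaded = load_result(out_dir)
assert loaded.task_name == result.task_name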