lush_grading/lush_bench/harness.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider (see the sketch below)
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00
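
The agent loop and CLI live in other modules of the framework; as rough orientation for how retries might drive the harness in this file, a minimal sketch (not the project's actual implementation) could look like the following, where ask_agent stands in for the Anthropic Claude provider call and max_retries is a hypothetical budget:

# Hypothetical sketch only: ask_agent, max_retries, and the import path
# are assumptions, not part of the real lush_bench code.
from lush_bench.harness import evaluate

def solve_with_retries(task, language, config, ask_agent, max_retries=3):
    feedback = ""
    code, results = "", []
    for _ in range(max_retries + 1):
        # The provider returns a candidate Bash or Lush script as text
        code = ask_agent(task, language, feedback)
        results = evaluate(task, code, language, config)
        if all(r.passed for r in results):
            break
        # Summarize failures so the next attempt can correct them
        feedback = "\n".join(
            f"test {r.test_case_index}: exit={r.exit_code} stderr={r.stderr!r}"
            for r in results if not r.passed
        )
    return code, results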


from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base env — keeps scripts deterministic
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars."""
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Execute the script with the given interpreter command and time it."""
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout=result.stdout,
            stderr=result.stderr,
            exit_code=result.returncode,
            runtime_ms=elapsed_ms,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )


def normalize(s: str) -> str:
    """Strip leading/trailing whitespace before comparison."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files."""
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    for filename, content in tc.setup_files.items():
        filepath = sandbox / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(content)
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches."""
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        if not filepath.exists():
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "<file not found>",
            }
            continue
        actual_content = filepath.read_text()
        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run the candidate solution against every test case for the task."""
    suffix = ".sh" if language == "bash" else ".lua"
    results: list[TestResult] = []
    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)
            if language == "bash":
                command = ["bash"]
            else:
                command = [str(config.lush_binary)]

            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            # Compare stdout against the expected output
            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)
            stdout_ok = actual == expected

            # Compare any files the test case was expected to produce
            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )

            passed = stdout_ok and not file_mismatches
            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)
    return results
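
For reference, a minimal usage sketch of evaluate, assuming Task, TestCase, and Config accept the fields this harness reads (their real definitions live in .models and .config and are not shown here, so the constructor calls and import paths below are assumptions):

# Usage sketch only: constructor signatures, field names beyond those the
# harness reads, and import paths are assumptions.
from lush_bench.config import Config
from lush_bench.harness import evaluate
from lush_bench.models import Task, TestCase

tc = TestCase(
    stdin="",
    env={},                               # merged over the BASE_ENV_KEYS vars
    setup_files={"input.txt": "a\nb\n"},  # written into the sandbox first
    expected_stdout="2",                  # compared (optionally normalized) to stdout
    expected_files={},                    # files the script should leave behind
)
task = Task(test_cases=[tc])
config = Config(
    lush_binary="/usr/local/bin/lush",    # hypothetical interpreter path
    timeout_seconds=10.0,
    normalize_whitespace=True,
)

for result in evaluate(task, "wc -l < input.txt", "bash", config):
    print(result.test_case_index, "PASS" if result.passed else "FAIL")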