Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
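The test harness module below is the piece that scores candidate solutions. As a quick orientation, a driver might use it roughly like this; `load_task` and `agent.solve` are hypothetical stand-ins for the project's task loader and Claude-backed agent loop, and only the `evaluate(task, code, language, config)` call reflects the actual function defined below:

    config = Config(...)                    # needs lush_binary, timeout_seconds, normalize_whitespace
    task = load_task("example-task")        # hypothetical loader returning a Task
    for language in ("bash", "lush"):
        code = agent.solve(task, language)  # hypothetical agent call (with retries)
        results = evaluate(task, code, language, config)
        passed = sum(r.passed for r in results)
        print(f"{language}: {passed}/{len(results)} test cases passed")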
from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base environment: only these host variables are passed through,
# which keeps script runs deterministic across machines.
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars."""
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env
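
# Illustrative example (comments only; TestCase construction details are
# assumed, not taken from .models):
#
#     tc = TestCase(env={"MODE": "fast"})
#     env = _build_env(tc)
#     assert "LD_PRELOAD" not in env   # host vars outside BASE_ENV_KEYS are dropped
#     assert env["MODE"] == "fast"     # test-case vars win on collision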


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Run `command` on `script` under `cwd`, capturing stdout/stderr.

    A timeout is reported as a RunOutput with exit code -1 rather than
    raised, so callers can treat it like any other failed run.
    """
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout=result.stdout,
            stderr=result.stderr,
            exit_code=result.returncode,
            runtime_ms=elapsed_ms,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )
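
# Illustrative call (paths and values are hypothetical):
#
#     out = run_script(["bash"], Path("/tmp/solution.sh"), stdin="",
#                      timeout=10.0, cwd=Path("/tmp"), env={"PATH": "/usr/bin"})
#     if out.exit_code == -1 and out.stderr == "Timeout exceeded":
#         ...  # timed out, as opposed to an ordinary non-zero exit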


def normalize(s: str) -> str:
    """Strip leading and trailing whitespace so cosmetic differences,
    such as a trailing newline, do not fail a comparison."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files."""
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    for filename, content in tc.setup_files.items():
        filepath = sandbox / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(content)
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches."""
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        if not filepath.exists():
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "<file not found>",
            }
            continue
        actual_content = filepath.read_text()
        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            # Report the original, un-normalized text so diffs stay faithful
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches
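
# Example return value (illustrative file name and contents):
#
#     {"out/report.txt": {"expected": "total: 3\n",
#                         "actual": "<file not found>"}}
#
# An empty dict means every expected file matched.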


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run `code` against every test case of `task`, each in a fresh sandbox.

    A test case passes only if stdout matches and every expected file
    matches; the sandbox is removed even if execution fails.
    """
    suffix = ".sh" if language == "bash" else ".lua"
    results: list[TestResult] = []

    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write the candidate script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)

            if language == "bash":
                command = ["bash"]
            else:
                command = [str(config.lush_binary)]

            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)

            stdout_ok = actual == expected
            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )
            passed = stdout_ok and not file_mismatches

            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)

    return results
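

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the harness proper. It assumes
    # TestCase, Task, and Config are dataclass-like models that accept the
    # fields used above as keyword arguments; adjust to the real definitions
    # in .models and .config if they differ.
    tc = TestCase(
        env={},
        stdin="",
        expected_stdout="hello\n",
        setup_files={},
        expected_files={},
    )
    task = Task(test_cases=[tc])
    config = Config(
        lush_binary=Path("lush"),  # not used for the bash run below
        timeout_seconds=5.0,
        normalize_whitespace=True,
    )
    for r in evaluate(task, 'echo "hello"', "bash", config):
        print(f"case {r.test_case_index}: {'PASS' if r.passed else 'FAIL'}")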