Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
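To make the moving parts concrete, here is a minimal sketch of driving the harness directly, using the names that harness.py (below) actually reads. The constructor shapes are assumptions: Task, TestCase, and Config are treated as dataclass-style models, a real Task likely carries additional fields (id, prompt, ...), and the Lush binary path and expected output are placeholders.

    from lush_bench.config import Config
    from lush_bench.harness import evaluate
    from lush_bench.models import Task, TestCase

    # Field names mirror those read by harness.py; constructor shapes are assumed.
    tc = TestCase(
        stdin="",
        env={},
        setup_files={"input.txt": "line1\nline2\n"},
        expected_stdout="2",
        expected_files={},
    )
    task = Task(test_cases=[tc])  # a real Task likely needs an id/prompt as well

    config = Config(
        lush_binary="/usr/local/bin/lush",  # placeholder path
        timeout_seconds=10.0,
        normalize_whitespace=True,
    )

    results = evaluate(task, "wc -l < input.txt\n", "bash", config)
    print(all(r.passed for r in results))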
lush_bench/harness.py (new file, 156 lines)
@@ -0,0 +1,156 @@
from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base env — keeps scripts deterministic
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars."""
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Execute `script` via `command`, capturing output, exit code, and runtime."""
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout=result.stdout,
            stderr=result.stderr,
            exit_code=result.returncode,
            runtime_ms=elapsed_ms,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        # A timeout is reported as exit code -1 with empty stdout.
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )


def normalize(s: str) -> str:
    """Strip leading/trailing whitespace for lenient output comparison."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files."""
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    for filename, content in tc.setup_files.items():
        filepath = sandbox / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(content)
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches."""
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        if not filepath.exists():
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "<file not found>",
            }
            continue
        actual_content = filepath.read_text()
        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            # Report the raw (un-normalized) contents so diffs are faithful.
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run `code` against every test case of `task`, each in a fresh sandbox."""
    suffix = ".sh" if language == "bash" else ".lua"
    results: list[TestResult] = []

    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)

            if language == "bash":
                command = ["bash"]
            else:
                command = [str(config.lush_binary)]

            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)

            stdout_ok = actual == expected
            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )
            passed = stdout_ok and not file_mismatches

            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)

    return results
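run_script can also be exercised on its own, for example from a unit test. A minimal sketch follows; the temporary-directory handling is illustrative, not part of the harness:

    import os
    import tempfile
    from pathlib import Path

    from lush_bench.harness import run_script

    with tempfile.TemporaryDirectory() as d:
        script = Path(d) / "hello.sh"
        script.write_text('read name; echo "hello $name"\n')
        out = run_script(
            command=["bash"],
            script=script,
            stdin="world\n",
            timeout=5.0,
            cwd=Path(d),
            env={"PATH": os.environ["PATH"]},
        )
        assert out.exit_code == 0
        assert out.stdout == "hello world\n"

A timeout surfaces as exit_code -1 with stderr "Timeout exceeded" and empty stdout, so callers can distinguish it from an ordinary nonzero exit.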