Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
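To make the moving parts concrete, here is a minimal sketch of driving the harness directly, using the names that harness.py (below) actually reads. The constructor shapes are assumptions: Task, TestCase, and Config are treated as dataclass-style models, a real Task likely carries additional fields (id, prompt, ...), and the Lush binary path and expected output are placeholders.

    from lush_bench.config import Config
    from lush_bench.harness import evaluate
    from lush_bench.models import Task, TestCase

    # Field names mirror those read by harness.py; constructor shapes are assumed.
    tc = TestCase(
        stdin="",
        env={},
        setup_files={"input.txt": "line1\nline2\n"},
        expected_stdout="2",
        expected_files={},
    )
    task = Task(test_cases=[tc])  # a real Task likely needs an id/prompt as well

    config = Config(
        lush_binary="/usr/local/bin/lush",  # placeholder path
        timeout_seconds=10.0,
        normalize_whitespace=True,
    )

    results = evaluate(task, "wc -l < input.txt\n", "bash", config)
    print(all(r.passed for r in results))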
lush_bench/harness.py (new file, 156 lines)
@@ -0,0 +1,156 @@
from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base env — keeps scripts deterministic
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars."""
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Execute `script` via `command`, capturing output, exit code, and runtime."""
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout=result.stdout,
            stderr=result.stderr,
            exit_code=result.returncode,
            runtime_ms=elapsed_ms,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        # A timeout is reported as exit code -1 with empty stdout.
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )


def normalize(s: str) -> str:
    """Strip leading/trailing whitespace for lenient output comparison."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files."""
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    for filename, content in tc.setup_files.items():
        filepath = sandbox / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(content)
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches."""
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        if not filepath.exists():
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "<file not found>",
            }
            continue
        actual_content = filepath.read_text()
        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            # Report the raw (un-normalized) contents so diffs are faithful.
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run `code` against every test case of `task`, each in a fresh sandbox."""
    suffix = ".sh" if language == "bash" else ".lua"
    results: list[TestResult] = []

    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)

            if language == "bash":
                command = ["bash"]
            else:
                command = [str(config.lush_binary)]

            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)

            stdout_ok = actual == expected
            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )
            passed = stdout_ok and not file_mismatches

            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)

    return results
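run_script can also be exercised on its own, for example from a unit test. A minimal sketch follows; the temporary-directory handling is illustrative, not part of the harness:

    import os
    import tempfile
    from pathlib import Path

    from lush_bench.harness import run_script

    with tempfile.TemporaryDirectory() as d:
        script = Path(d) / "hello.sh"
        script.write_text('read name; echo "hello $name"\n')
        out = run_script(
            command=["bash"],
            script=script,
            stdin="world\n",
            timeout=5.0,
            cwd=Path(d),
            env={"PATH": os.environ["PATH"]},
        )
        assert out.exit_code == 0
        assert out.stdout == "hello world\n"

A timeout surfaces as exit_code -1 with stderr "Timeout exceeded" and empty stdout, so callers can distinguish it from an ordinary nonzero exit.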