from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base env — keeps scripts deterministic
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars.

    Only host variables whose names appear in ``BASE_ENV_KEYS`` are copied;
    entries from ``test_case.env`` are applied afterwards, so they override
    host values with the same name.
    """
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Run ``command`` with ``script`` appended and capture the outcome.

    Args:
        command: Interpreter argv prefix; the script path is appended to it.
        script: Path of the script file to execute.
        stdin: Text fed to the process on standard input.
        timeout: Time limit in seconds passed to ``subprocess.run``.
        cwd: Working directory for the child process.
        env: Complete environment for the child process.

    Returns:
        A ``RunOutput`` with the captured stdout/stderr, the exit code and the
        wall-clock runtime in milliseconds. When the time limit is hit, stdout
        is empty, stderr is ``"Timeout exceeded"`` and the exit code is ``-1``.
    """
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            # The script under test may print arbitrary bytes; substitute the
            # undecodable ones instead of letting UnicodeDecodeError abort the
            # whole evaluation.
            errors="replace",
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )

    elapsed_ms = (time.monotonic() - start) * 1000
    return RunOutput(
        stdout=result.stdout,
        stderr=result.stderr,
        exit_code=result.returncode,
        runtime_ms=elapsed_ms,
    )


def normalize(s: str) -> str:
    """Return ``s`` with leading and trailing whitespace removed."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files.

    Each key of ``tc.setup_files`` is treated as a path relative to the new
    directory; missing parent directories are created. If populating fails,
    the directory is removed again before the exception propagates, so a
    failed setup does not leak temp directories.
    """
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    try:
        for filename, content in tc.setup_files.items():
            filepath = sandbox / filename
            filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.write_text(content)
    except BaseException:
        # The caller only gains ownership of the directory on a normal return,
        # so clean up here and re-raise unchanged.
        shutil.rmtree(sandbox, ignore_errors=True)
        raise
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches.

    Args:
        sandbox: Directory the script was executed in.
        tc: Test case whose ``expected_files`` maps relative paths to content.
        do_normalize: When true, both sides are passed through ``normalize``
            before being compared.

    Returns:
        A mapping from filename to ``{"expected": ..., "actual": ...}`` for
        every file that differs. The stored values are the un-normalized
        contents. A path that cannot be read as a file (missing, a directory,
        unreadable) is reported with an empty ``"actual"``.
    """
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        try:
            # errors="replace": files are produced by the script under test and
            # may contain bytes that do not decode; that should surface as a
            # mismatch, not as a crash of the harness.
            actual_content = filepath.read_text(errors="replace")
        except OSError:
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "",
            }
            continue

        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run ``code`` against every test case of ``task`` and collect results.

    For each test case a fresh sandbox directory is created, the code is
    written into it as ``solution.sh`` (when ``language`` is ``"bash"``) or
    ``solution.lua`` (otherwise), and the script is executed with the sandbox
    as working directory. A test passes when stdout matches the expected
    stdout and no expected file differs. The sandbox is removed afterwards,
    even when the run raises.

    Args:
        task: Task providing ``test_cases``.
        code: Source text of the solution script.
        language: ``"bash"`` runs the script with ``bash``; any other value
            runs it with ``config.lush_binary``.
        config: Supplies ``lush_binary``, ``timeout_seconds`` and
            ``normalize_whitespace``.

    Returns:
        One ``TestResult`` per test case, in test-case order.
    """
    # Interpreter and file suffix depend only on the language, not on the
    # individual test case, so resolve them once.
    if language == "bash":
        suffix = ".sh"
        command = ["bash"]
    else:
        suffix = ".lua"
        command = [str(config.lush_binary)]

    results: list[TestResult] = []
    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)
            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)
            stdout_ok = actual == expected

            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )

            # NOTE(review): the exit code is recorded but not part of the pass
            # criterion, so a timed-out run (empty stdout, exit code -1) passes
            # when the expected stdout is empty and the files match — confirm
            # this is intended.
            passed = stdout_ok and not file_mismatches
            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)
    return results