Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
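The test harness module below is the piece that scores candidate solutions. As a quick orientation, a driver might use it roughly like this; `load_task` and `agent.solve` are hypothetical stand-ins for the project's task loader and Claude-backed agent loop, and only the `evaluate(task, code, language, config)` call reflects the actual function defined below:

    config = Config(...)                    # needs lush_binary, timeout_seconds, normalize_whitespace
    task = load_task("example-task")        # hypothetical loader returning a Task
    for language in ("bash", "lush"):
        code = agent.solve(task, language)  # hypothetical agent call (with retries)
        results = evaluate(task, code, language, config)
        passed = sum(r.passed for r in results)
        print(f"{language}: {passed}/{len(results)} test cases passed")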
from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

from .config import Config
from .models import RunOutput, Task, TestCase, TestResult

# Minimal base environment: only these host variables are passed through,
# which keeps script runs deterministic across machines.
BASE_ENV_KEYS = {"PATH", "HOME", "USER", "LANG", "TERM", "TMPDIR"}


def _build_env(test_case: TestCase) -> dict[str, str]:
    """Build a controlled environment: base host vars + test-specific vars."""
    env = {k: v for k, v in os.environ.items() if k in BASE_ENV_KEYS}
    env.update(test_case.env)
    return env
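
# Illustrative example (comments only; TestCase construction details are
# assumed, not taken from .models):
#
#     tc = TestCase(env={"MODE": "fast"})
#     env = _build_env(tc)
#     assert "LD_PRELOAD" not in env   # host vars outside BASE_ENV_KEYS are dropped
#     assert env["MODE"] == "fast"     # test-case vars win on collision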


def run_script(
    command: list[str],
    script: Path,
    stdin: str,
    timeout: float,
    cwd: Path,
    env: dict[str, str],
) -> RunOutput:
    """Run `command` on `script` under `cwd`, capturing stdout/stderr.

    A timeout is reported as a RunOutput with exit code -1 rather than
    raised, so callers can treat it like any other failed run.
    """
    start = time.monotonic()
    try:
        result = subprocess.run(
            [*command, str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=env,
        )
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout=result.stdout,
            stderr=result.stderr,
            exit_code=result.returncode,
            runtime_ms=elapsed_ms,
        )
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.monotonic() - start) * 1000
        return RunOutput(
            stdout="",
            stderr="Timeout exceeded",
            exit_code=-1,
            runtime_ms=elapsed_ms,
        )
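
# Illustrative call (paths and values are hypothetical):
#
#     out = run_script(["bash"], Path("/tmp/solution.sh"), stdin="",
#                      timeout=10.0, cwd=Path("/tmp"), env={"PATH": "/usr/bin"})
#     if out.exit_code == -1 and out.stderr == "Timeout exceeded":
#         ...  # timed out, as opposed to an ordinary non-zero exit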


def normalize(s: str) -> str:
    """Strip leading and trailing whitespace so cosmetic differences,
    such as a trailing newline, do not fail a comparison."""
    return s.strip()


def _setup_sandbox(tc: TestCase) -> Path:
    """Create a temp directory and populate it with setup files."""
    sandbox = Path(tempfile.mkdtemp(prefix="lush_bench_"))
    for filename, content in tc.setup_files.items():
        filepath = sandbox / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(content)
    return sandbox


def _check_expected_files(
    sandbox: Path,
    tc: TestCase,
    do_normalize: bool,
) -> dict[str, dict[str, str]]:
    """Compare expected files against sandbox contents. Returns mismatches."""
    mismatches: dict[str, dict[str, str]] = {}
    for filename, expected_content in tc.expected_files.items():
        filepath = sandbox / filename
        if not filepath.exists():
            mismatches[filename] = {
                "expected": expected_content,
                "actual": "<file not found>",
            }
            continue
        actual_content = filepath.read_text()
        expected = expected_content
        actual = actual_content
        if do_normalize:
            expected = normalize(expected)
            actual = normalize(actual)
        if actual != expected:
            # Report the original, un-normalized text so diffs stay faithful
            mismatches[filename] = {
                "expected": expected_content,
                "actual": actual_content,
            }
    return mismatches
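
# Example return value (illustrative file name and contents):
#
#     {"out/report.txt": {"expected": "total: 3\n",
#                         "actual": "<file not found>"}}
#
# An empty dict means every expected file matched.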


def evaluate(
    task: Task,
    code: str,
    language: str,
    config: Config,
) -> list[TestResult]:
    """Run `code` against every test case of `task`, each in a fresh sandbox.

    A test case passes only if stdout matches and every expected file
    matches; the sandbox is removed even if execution fails.
    """
    suffix = ".sh" if language == "bash" else ".lua"
    results: list[TestResult] = []

    for i, tc in enumerate(task.test_cases):
        sandbox = _setup_sandbox(tc)
        try:
            # Write the candidate script into the sandbox
            script_path = sandbox / f"solution{suffix}"
            script_path.write_text(code)

            env = _build_env(tc)

            if language == "bash":
                command = ["bash"]
            else:
                command = [str(config.lush_binary)]

            output = run_script(
                command, script_path, tc.stdin, config.timeout_seconds, sandbox, env
            )

            actual = output.stdout
            expected = tc.expected_stdout
            if config.normalize_whitespace:
                actual = normalize(actual)
                expected = normalize(expected)

            stdout_ok = actual == expected
            file_mismatches = _check_expected_files(
                sandbox, tc, config.normalize_whitespace
            )
            passed = stdout_ok and not file_mismatches

            results.append(
                TestResult(
                    test_case_index=i,
                    passed=passed,
                    actual_stdout=output.stdout,
                    expected_stdout=tc.expected_stdout,
                    stderr=output.stderr,
                    exit_code=output.exit_code,
                    file_mismatches=file_mismatches,
                )
            )
        finally:
            shutil.rmtree(sandbox, ignore_errors=True)

    return results
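

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the harness proper. It assumes
    # TestCase, Task, and Config are dataclass-like models that accept the
    # fields used above as keyword arguments; adjust to the real definitions
    # in .models and .config if they differ.
    tc = TestCase(
        env={},
        stdin="",
        expected_stdout="hello\n",
        setup_files={},
        expected_files={},
    )
    task = Task(test_cases=[tc])
    config = Config(
        lush_binary=Path("lush"),  # not used for the bash run below
        timeout_seconds=5.0,
        normalize_whitespace=True,
    )
    for r in evaluate(task, 'echo "hello"', "bash", config):
        print(f"case {r.test_case_index}: {'PASS' if r.passed else 'FAIL'}")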