Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
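The data models added below define the shapes this pipeline produces. As a rough sketch of the sandboxed-subprocess idea (the function name, the interpreter argument, and every detail here are assumptions for illustration, not code from this commit), a runner could copy a test case's setup files into a throwaway directory and execute the solution there:

import subprocess
import tempfile
import time
from pathlib import Path


def run_sandboxed(interpreter: str, code: str, stdin: str,
                  env: dict[str, str], setup_files: dict[str, str],
                  timeout_s: float = 10.0) -> dict:
    """Hypothetical runner: execute code with interpreter in a temp dir."""
    with tempfile.TemporaryDirectory() as workdir:
        # Lay out any fixture files the test case expects to find.
        for relpath, contents in setup_files.items():
            path = Path(workdir) / relpath
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(contents)
        script = Path(workdir) / "solution"
        script.write_text(code)
        start = time.monotonic()
        # A minimal, explicit environment; the real harness may do more
        # (resource limits, network isolation) or less.
        proc = subprocess.run(
            [interpreter, str(script)],
            input=stdin,
            env={"PATH": "/usr/bin:/bin", **env},
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=timeout_s,  # a real harness would catch TimeoutExpired
        )
        runtime_ms = (time.monotonic() - start) * 1000.0
    # Keys line up with the RunOutput model below.
    return {"stdout": proc.stdout, "stderr": proc.stderr,
            "exit_code": proc.returncode, "runtime_ms": runtime_ms}

The returned mapping matches RunOutput.from_dict; comparison against a test case's expected_files would have to happen inside the with block, before the temporary directory is removed.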
lush_bench/models.py (new file, 210 lines)
"""Data models for benchmark tasks, solution runs, and results."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class TestCase:
    """A single check: feed stdin, compare stdout and any expected files."""

    stdin: str
    expected_stdout: str
    env: dict[str, str] = field(default_factory=dict)
    setup_files: dict[str, str] = field(default_factory=dict)
    expected_files: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {"stdin": self.stdin, "expected_stdout": self.expected_stdout}
        if self.env:
            d["env"] = self.env
        if self.setup_files:
            d["setup_files"] = self.setup_files
        if self.expected_files:
            d["expected_files"] = self.expected_files
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestCase:
        return cls(
            stdin=d["stdin"],
            expected_stdout=d["expected_stdout"],
            env=d.get("env", {}),
            setup_files=d.get("setup_files", {}),
            expected_files=d.get("expected_files", {}),
        )


@dataclass
class Task:
    """A benchmark task and its test cases."""

    name: str
    category: str  # "a" (write from scratch) or "b" (convert provided Bash)
    description: str
    test_cases: list[TestCase]
    bash_source: str | None = None  # category B only

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "test_cases": [tc.to_dict() for tc in self.test_cases],
        }
        if self.bash_source is not None:
            d["bash_source"] = self.bash_source
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Task:
        return cls(
            name=d["name"],
            category=d["category"],
            description=d["description"],
            test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
            bash_source=d.get("bash_source"),
        )


@dataclass
class RunOutput:
    """Raw result of executing a solution in a subprocess."""

    stdout: str
    stderr: str
    exit_code: int
    runtime_ms: float

    def to_dict(self) -> dict[str, Any]:
        return {
            "stdout": self.stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
            "runtime_ms": self.runtime_ms,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> RunOutput:
        return cls(
            stdout=d["stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            runtime_ms=d["runtime_ms"],
        )


@dataclass
class TestResult:
    """Outcome of one solution run against one TestCase."""

    test_case_index: int
    passed: bool
    actual_stdout: str
    expected_stdout: str
    stderr: str
    exit_code: int
    file_mismatches: dict[str, dict[str, str]] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {
            "test_case_index": self.test_case_index,
            "passed": self.passed,
            "actual_stdout": self.actual_stdout,
            "expected_stdout": self.expected_stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
        }
        if self.file_mismatches:
            d["file_mismatches"] = self.file_mismatches
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestResult:
        return cls(
            test_case_index=d["test_case_index"],
            passed=d["passed"],
            actual_stdout=d["actual_stdout"],
            expected_stdout=d["expected_stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            file_mismatches=d.get("file_mismatches", {}),
        )


@dataclass
class LanguageResult:
    """Everything recorded for one language's attempt at a task."""

    language: str
    solution_code: str
    test_results: list[TestResult]
    all_passed: bool
    agent_turns: int
    # QuestionnaireResponse is defined below; the postponed-annotations
    # import makes this forward reference valid.
    questionnaire: list[QuestionnaireResponse] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        return {
            "language": self.language,
            "solution_code": self.solution_code,
            "test_results": [tr.to_dict() for tr in self.test_results],
            "all_passed": self.all_passed,
            "agent_turns": self.agent_turns,
            "questionnaire": [q.to_dict() for q in self.questionnaire],
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> LanguageResult:
        return cls(
            language=d["language"],
            solution_code=d["solution_code"],
            test_results=[TestResult.from_dict(tr) for tr in d["test_results"]],
            all_passed=d["all_passed"],
            agent_turns=d["agent_turns"],
            questionnaire=[QuestionnaireResponse.from_dict(q) for q in d.get("questionnaire", [])],
        )


@dataclass
class QuestionnaireResponse:
    """One answered question from the LLM code-quality questionnaire."""

    question: str
    selected: str | int
    choices: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {"question": self.question, "selected": self.selected}
        if self.choices is not None:
            d["choices"] = self.choices
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> QuestionnaireResponse:
        return cls(
            question=d["question"],
            selected=d["selected"],
            choices=d.get("choices"),
        )


@dataclass
class BenchmarkResult:
    """Top-level record for one task run: Bash and Lush results side by side."""

    task_name: str
    category: str
    provider: str
    model: str
    timestamp: str
    bash_result: LanguageResult | None
    lush_result: LanguageResult | None

    def to_dict(self) -> dict[str, Any]:
        return {
            "task_name": self.task_name,
            "category": self.category,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
            "bash_result": self.bash_result.to_dict() if self.bash_result else None,
            "lush_result": self.lush_result.to_dict() if self.lush_result else None,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> BenchmarkResult:
        return cls(
            task_name=d["task_name"],
            category=d["category"],
            provider=d["provider"],
            model=d["model"],
            timestamp=d["timestamp"],
            bash_result=LanguageResult.from_dict(d["bash_result"]) if d.get("bash_result") else None,
            lush_result=LanguageResult.from_dict(d["lush_result"]) if d.get("lush_result") else None,
        )
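Every model serializes symmetrically through to_dict/from_dict, so results persist as plain JSON end to end. A minimal round-trip sketch; the task, model name, and all values below are made up for illustration:

import json

from lush_bench.models import (
    BenchmarkResult,
    LanguageResult,
    QuestionnaireResponse,
    Task,
    TestCase,
    TestResult,
)

# A made-up category A task with one test case.
task = Task(
    name="word-count",
    category="a",
    description="Count the words on stdin.",
    test_cases=[TestCase(stdin="a b c\n", expected_stdout="3\n")],
)
assert Task.from_dict(task.to_dict()) == task

# A made-up result for the Bash side only; lush_result stays None.
result = BenchmarkResult(
    task_name=task.name,
    category=task.category,
    provider="anthropic",
    model="claude-placeholder",
    timestamp="2024-01-01T00:00:00Z",
    bash_result=LanguageResult(
        language="bash",
        solution_code="wc -w\n",
        test_results=[TestResult(0, True, "3\n", "3\n", "", 0)],
        all_passed=True,
        agent_turns=1,
        questionnaire=[QuestionnaireResponse("Readability (1-5)?", 4)],
    ),
    lush_result=None,
)

# to_dict omits empty optional fields (env, bash_source, choices, ...);
# from_dict restores their defaults, so dataclass equality survives JSON.
blob = json.dumps(result.to_dict())
assert BenchmarkResult.from_dict(json.loads(blob)) == result

Dropping empty optionals on the way out keeps stored result files small, and since every default is an empty container or None, the round trip is lossless.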