Files
lush_grading/lush_bench/models.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00

211 lines
6.2 KiB
Python

from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class TestCase:
    """One I/O test case: stdin fed to the solution and the stdout it must produce.

    Optional mappings configure the sandbox: extra environment variables,
    files to create before the run, and files whose contents are checked after.
    """

    stdin: str
    expected_stdout: str
    env: dict[str, str] = field(default_factory=dict)
    setup_files: dict[str, str] = field(default_factory=dict)
    expected_files: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict, omitting empty optional mappings."""
        payload: dict[str, Any] = {
            "stdin": self.stdin,
            "expected_stdout": self.expected_stdout,
        }
        # Only emit the optional mappings that actually carry entries.
        for key in ("env", "setup_files", "expected_files"):
            value = getattr(self, key)
            if value:
                payload[key] = value
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestCase:
        """Inverse of to_dict; absent optional keys become empty dicts."""
        return cls(
            d["stdin"],
            d["expected_stdout"],
            env=d.get("env", {}),
            setup_files=d.get("setup_files", {}),
            expected_files=d.get("expected_files", {}),
        )
@dataclass
class Task:
    """A benchmark task: a description plus its test cases.

    Category "b" tasks additionally carry a provided Bash implementation
    in `bash_source`; category "a" tasks leave it as None.
    """

    name: str
    category: str  # "a" or "b"
    description: str
    test_cases: list[TestCase]
    bash_source: str | None = None  # category B only

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict; bash_source is omitted when None."""
        payload: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "test_cases": [case.to_dict() for case in self.test_cases],
        }
        if self.bash_source is not None:
            payload["bash_source"] = self.bash_source
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Task:
        """Inverse of to_dict; a missing bash_source key yields None."""
        return cls(
            d["name"],
            d["category"],
            d["description"],
            [TestCase.from_dict(raw) for raw in d["test_cases"]],
            bash_source=d.get("bash_source"),
        )
@dataclass
class RunOutput:
    """Raw result of executing a solution: captured streams, exit code, wall time."""

    stdout: str
    stderr: str
    exit_code: int
    runtime_ms: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize every field to a JSON-compatible dict."""
        return dict(
            stdout=self.stdout,
            stderr=self.stderr,
            exit_code=self.exit_code,
            runtime_ms=self.runtime_ms,
        )

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> RunOutput:
        """Inverse of to_dict; all keys are required."""
        return cls(d["stdout"], d["stderr"], d["exit_code"], d["runtime_ms"])
@dataclass
class TestResult:
    """Outcome of running one solution against a single test case."""

    test_case_index: int
    passed: bool
    actual_stdout: str
    expected_stdout: str
    stderr: str
    exit_code: int
    # Mismatched expected files keyed by path; the inner dict holds the
    # per-file details produced by the harness (structure set by the harness).
    file_mismatches: dict[str, dict[str, str]] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict; file_mismatches is omitted when empty."""
        payload: dict[str, Any] = {
            "test_case_index": self.test_case_index,
            "passed": self.passed,
            "actual_stdout": self.actual_stdout,
            "expected_stdout": self.expected_stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
        }
        if self.file_mismatches:
            payload["file_mismatches"] = self.file_mismatches
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestResult:
        """Inverse of to_dict; a missing file_mismatches key yields an empty dict."""
        return cls(
            d["test_case_index"],
            d["passed"],
            d["actual_stdout"],
            d["expected_stdout"],
            d["stderr"],
            d["exit_code"],
            file_mismatches=d.get("file_mismatches", {}),
        )
@dataclass
class LanguageResult:
    """Per-language outcome for one task: the agent's solution plus its test
    results and the code-quality questionnaire answers."""

    language: str
    solution_code: str
    test_results: list[TestResult]
    all_passed: bool
    agent_turns: int
    questionnaire: list[QuestionnaireResponse] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict, recursing into nested results."""
        return dict(
            language=self.language,
            solution_code=self.solution_code,
            test_results=[item.to_dict() for item in self.test_results],
            all_passed=self.all_passed,
            agent_turns=self.agent_turns,
            questionnaire=[item.to_dict() for item in self.questionnaire],
        )

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> LanguageResult:
        """Inverse of to_dict; a missing questionnaire key yields an empty list."""
        return cls(
            d["language"],
            d["solution_code"],
            [TestResult.from_dict(item) for item in d["test_results"]],
            d["all_passed"],
            d["agent_turns"],
            questionnaire=[
                QuestionnaireResponse.from_dict(item)
                for item in d.get("questionnaire", [])
            ],
        )
@dataclass
class QuestionnaireResponse:
    """One answered quality question; `choices` holds the options that were
    offered, or None when the question had no fixed option list."""

    question: str
    selected: str | int
    choices: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict; choices is omitted when None."""
        payload: dict[str, Any] = {
            "question": self.question,
            "selected": self.selected,
        }
        if self.choices is not None:
            payload["choices"] = self.choices
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> QuestionnaireResponse:
        """Inverse of to_dict; a missing choices key yields None."""
        return cls(d["question"], d["selected"], choices=d.get("choices"))
@dataclass
class BenchmarkResult:
    """Top-level record of one benchmark run: a task attempted by one
    provider/model, with per-language results (None when a language was skipped)."""

    task_name: str
    category: str
    provider: str
    model: str
    timestamp: str
    bash_result: LanguageResult | None
    lush_result: LanguageResult | None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict; skipped language results become None."""
        bash = None if self.bash_result is None else self.bash_result.to_dict()
        lush = None if self.lush_result is None else self.lush_result.to_dict()
        return {
            "task_name": self.task_name,
            "category": self.category,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
            "bash_result": bash,
            "lush_result": lush,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> BenchmarkResult:
        """Inverse of to_dict; missing or null language results deserialize to None."""
        bash_raw = d.get("bash_result")
        lush_raw = d.get("lush_result")
        return cls(
            d["task_name"],
            d["category"],
            d["provider"],
            d["model"],
            d["timestamp"],
            LanguageResult.from_dict(bash_raw) if bash_raw else None,
            LanguageResult.from_dict(lush_raw) if lush_raw else None,
        )