Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
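The data models added below define the shapes this pipeline produces. As a rough sketch of the sandboxed-subprocess idea (the function name, the interpreter argument, and every detail here are assumptions for illustration, not code from this commit), a runner could copy a test case's setup files into a throwaway directory and execute the solution there:

import subprocess
import tempfile
import time
from pathlib import Path


def run_sandboxed(interpreter: str, code: str, stdin: str,
                  env: dict[str, str], setup_files: dict[str, str],
                  timeout_s: float = 10.0) -> dict:
    """Hypothetical runner: execute code with interpreter in a temp dir."""
    with tempfile.TemporaryDirectory() as workdir:
        # Lay out any fixture files the test case expects to find.
        for relpath, contents in setup_files.items():
            path = Path(workdir) / relpath
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(contents)
        script = Path(workdir) / "solution"
        script.write_text(code)
        start = time.monotonic()
        # A minimal, explicit environment; the real harness may do more
        # (resource limits, network isolation) or less.
        proc = subprocess.run(
            [interpreter, str(script)],
            input=stdin,
            env={"PATH": "/usr/bin:/bin", **env},
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=timeout_s,  # a real harness would catch TimeoutExpired
        )
        runtime_ms = (time.monotonic() - start) * 1000.0
    # Keys line up with the RunOutput model below.
    return {"stdout": proc.stdout, "stderr": proc.stderr,
            "exit_code": proc.returncode, "runtime_ms": runtime_ms}

The returned mapping matches RunOutput.from_dict; comparison against a test case's expected_files would have to happen inside the with block, before the temporary directory is removed.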
lush_bench/models.py (new file, 210 lines)
"""Data models for benchmark tasks, solution runs, and results."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class TestCase:
    """A single check: feed stdin, compare stdout and any expected files."""

    stdin: str
    expected_stdout: str
    env: dict[str, str] = field(default_factory=dict)
    setup_files: dict[str, str] = field(default_factory=dict)
    expected_files: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {"stdin": self.stdin, "expected_stdout": self.expected_stdout}
        if self.env:
            d["env"] = self.env
        if self.setup_files:
            d["setup_files"] = self.setup_files
        if self.expected_files:
            d["expected_files"] = self.expected_files
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestCase:
        return cls(
            stdin=d["stdin"],
            expected_stdout=d["expected_stdout"],
            env=d.get("env", {}),
            setup_files=d.get("setup_files", {}),
            expected_files=d.get("expected_files", {}),
        )


@dataclass
class Task:
    """A benchmark task and its test cases."""

    name: str
    category: str  # "a" (write from scratch) or "b" (convert provided Bash)
    description: str
    test_cases: list[TestCase]
    bash_source: str | None = None  # category B only

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "test_cases": [tc.to_dict() for tc in self.test_cases],
        }
        if self.bash_source is not None:
            d["bash_source"] = self.bash_source
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Task:
        return cls(
            name=d["name"],
            category=d["category"],
            description=d["description"],
            test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
            bash_source=d.get("bash_source"),
        )


@dataclass
class RunOutput:
    """Raw result of executing a solution in a subprocess."""

    stdout: str
    stderr: str
    exit_code: int
    runtime_ms: float

    def to_dict(self) -> dict[str, Any]:
        return {
            "stdout": self.stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
            "runtime_ms": self.runtime_ms,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> RunOutput:
        return cls(
            stdout=d["stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            runtime_ms=d["runtime_ms"],
        )


@dataclass
class TestResult:
    """Outcome of one solution run against one TestCase."""

    test_case_index: int
    passed: bool
    actual_stdout: str
    expected_stdout: str
    stderr: str
    exit_code: int
    file_mismatches: dict[str, dict[str, str]] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {
            "test_case_index": self.test_case_index,
            "passed": self.passed,
            "actual_stdout": self.actual_stdout,
            "expected_stdout": self.expected_stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
        }
        if self.file_mismatches:
            d["file_mismatches"] = self.file_mismatches
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestResult:
        return cls(
            test_case_index=d["test_case_index"],
            passed=d["passed"],
            actual_stdout=d["actual_stdout"],
            expected_stdout=d["expected_stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            file_mismatches=d.get("file_mismatches", {}),
        )


@dataclass
class LanguageResult:
    """Everything recorded for one language's attempt at a task."""

    language: str
    solution_code: str
    test_results: list[TestResult]
    all_passed: bool
    agent_turns: int
    # QuestionnaireResponse is defined below; the postponed-annotations
    # import makes this forward reference valid.
    questionnaire: list[QuestionnaireResponse] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        return {
            "language": self.language,
            "solution_code": self.solution_code,
            "test_results": [tr.to_dict() for tr in self.test_results],
            "all_passed": self.all_passed,
            "agent_turns": self.agent_turns,
            "questionnaire": [q.to_dict() for q in self.questionnaire],
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> LanguageResult:
        return cls(
            language=d["language"],
            solution_code=d["solution_code"],
            test_results=[TestResult.from_dict(tr) for tr in d["test_results"]],
            all_passed=d["all_passed"],
            agent_turns=d["agent_turns"],
            questionnaire=[QuestionnaireResponse.from_dict(q) for q in d.get("questionnaire", [])],
        )


@dataclass
class QuestionnaireResponse:
    """One answered question from the LLM code-quality questionnaire."""

    question: str
    selected: str | int
    choices: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {"question": self.question, "selected": self.selected}
        if self.choices is not None:
            d["choices"] = self.choices
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> QuestionnaireResponse:
        return cls(
            question=d["question"],
            selected=d["selected"],
            choices=d.get("choices"),
        )


@dataclass
class BenchmarkResult:
    """Top-level record for one task run: Bash and Lush results side by side."""

    task_name: str
    category: str
    provider: str
    model: str
    timestamp: str
    bash_result: LanguageResult | None
    lush_result: LanguageResult | None

    def to_dict(self) -> dict[str, Any]:
        return {
            "task_name": self.task_name,
            "category": self.category,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
            "bash_result": self.bash_result.to_dict() if self.bash_result else None,
            "lush_result": self.lush_result.to_dict() if self.lush_result else None,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> BenchmarkResult:
        return cls(
            task_name=d["task_name"],
            category=d["category"],
            provider=d["provider"],
            model=d["model"],
            timestamp=d["timestamp"],
            bash_result=LanguageResult.from_dict(d["bash_result"]) if d.get("bash_result") else None,
            lush_result=LanguageResult.from_dict(d["lush_result"]) if d.get("lush_result") else None,
        )
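Every model serializes symmetrically through to_dict/from_dict, so results persist as plain JSON end to end. A minimal round-trip sketch; the task, model name, and all values below are made up for illustration:

import json

from lush_bench.models import (
    BenchmarkResult,
    LanguageResult,
    QuestionnaireResponse,
    Task,
    TestCase,
    TestResult,
)

# A made-up category A task with one test case.
task = Task(
    name="word-count",
    category="a",
    description="Count the words on stdin.",
    test_cases=[TestCase(stdin="a b c\n", expected_stdout="3\n")],
)
assert Task.from_dict(task.to_dict()) == task

# A made-up result for the Bash side only; lush_result stays None.
result = BenchmarkResult(
    task_name=task.name,
    category=task.category,
    provider="anthropic",
    model="claude-placeholder",
    timestamp="2024-01-01T00:00:00Z",
    bash_result=LanguageResult(
        language="bash",
        solution_code="wc -w\n",
        test_results=[TestResult(0, True, "3\n", "3\n", "", 0)],
        all_passed=True,
        agent_turns=1,
        questionnaire=[QuestionnaireResponse("Readability (1-5)?", 4)],
    ),
    lush_result=None,
)

# to_dict omits empty optional fields (env, bash_source, choices, ...);
# from_dict restores their defaults, so dataclass equality survives JSON.
blob = json.dumps(result.to_dict())
assert BenchmarkResult.from_dict(json.loads(blob)) == result

Dropping empty optionals on the way out keeps stored result files small, and since every default is an empty container or None, the round trip is lossless.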