# Benchmark harness that uses LLM agents to solve shell scripting tasks in both
# Bash and Lush, then compares correctness and code quality.
#
# - CLI with run, run-all, list-tasks, report, and export commands
# - Agent loop with retry support via the Anthropic Claude provider
# - Test harness executing solutions in sandboxed subprocesses
# - LLM-driven questionnaire for subjective code quality evaluation
# - HTML report export with charts (matplotlib)
# - 8 Category A tasks (write-from-scratch in both languages)
# - 4 Category B tasks (verify provided Bash, convert to Lush)
# - Lush language reference for agent context
211 lines
6.2 KiB
Python
211 lines
6.2 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
|
|
@dataclass
class TestCase:
    """A single test scenario: the input to feed and the output to expect.

    Only ``stdin`` and ``expected_stdout`` are required; the three mappings
    default to fresh empty dicts.
    """

    stdin: str
    expected_stdout: str
    # NOTE(review): presumably environment variables for the run - confirm against the harness.
    env: dict[str, str] = field(default_factory=dict)
    # NOTE(review): presumably file path -> content created before the run - confirm.
    setup_files: dict[str, str] = field(default_factory=dict)
    # NOTE(review): presumably file path -> content expected after the run - confirm.
    expected_files: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this test case.

        ``stdin`` and ``expected_stdout`` are always present; ``env``,
        ``setup_files`` and ``expected_files`` are emitted only when non-empty.
        """
        d: dict[str, Any] = {"stdin": self.stdin, "expected_stdout": self.expected_stdout}
        if self.env:
            d["env"] = self.env
        if self.setup_files:
            d["setup_files"] = self.setup_files
        if self.expected_files:
            d["expected_files"] = self.expected_files
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestCase:
        """Build a test case from a dict shaped like ``to_dict`` output.

        Missing (or explicitly null) optional mappings become empty dicts.

        Raises:
            KeyError: if ``stdin`` or ``expected_stdout`` is missing.
        """
        return cls(
            stdin=d["stdin"],
            expected_stdout=d["expected_stdout"],
            # Copy each mapping so the instance never aliases the caller's
            # dict; ``or {}`` additionally tolerates an explicit null value.
            env=dict(d.get("env") or {}),
            setup_files=dict(d.get("setup_files") or {}),
            expected_files=dict(d.get("expected_files") or {}),
        )
|
|
|
|
|
|
@dataclass
class Task:
    """A benchmark task: a description plus the test cases that check it."""

    name: str
    category: str  # "a" or "b"
    description: str
    test_cases: list[TestCase]
    bash_source: str | None = None  # category B only

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this task.

        Each test case is serialized through its own ``to_dict``.
        ``bash_source`` is emitted only when it is not ``None`` (an empty
        string is still emitted).
        """
        d: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "test_cases": [tc.to_dict() for tc in self.test_cases],
        }
        if self.bash_source is not None:
            d["bash_source"] = self.bash_source
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Task:
        """Build a task from a dict shaped like ``to_dict`` output.

        Each entry of ``test_cases`` is rebuilt with ``TestCase.from_dict``;
        a missing ``bash_source`` becomes ``None``.

        Raises:
            KeyError: if ``name``, ``category``, ``description`` or
                ``test_cases`` is missing.
        """
        return cls(
            name=d["name"],
            category=d["category"],
            description=d["description"],
            test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
            bash_source=d.get("bash_source"),
        )
|
|
|
|
|
|
@dataclass
class RunOutput:
    """Captured result of one execution: output streams, exit code and timing."""

    stdout: str
    stderr: str
    exit_code: int
    # NOTE(review): the name suggests milliseconds - confirm where it is measured.
    runtime_ms: float

    def to_dict(self) -> dict[str, Any]:
        """Return all four fields as a plain dict keyed by field name."""
        return {
            "stdout": self.stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
            "runtime_ms": self.runtime_ms,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> RunOutput:
        """Build a run output from a dict shaped like ``to_dict`` output.

        Raises:
            KeyError: if any of the four keys is missing.
        """
        return cls(
            stdout=d["stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            runtime_ms=d["runtime_ms"],
        )
|
|
|
|
|
|
@dataclass
class TestResult:
    """Outcome of running one test case against a solution."""

    test_case_index: int
    passed: bool
    actual_stdout: str
    expected_stdout: str
    stderr: str
    exit_code: int
    # NOTE(review): presumably file path -> details of the mismatch; the inner
    # keys are not fixed anywhere in this class - confirm against the harness.
    file_mismatches: dict[str, dict[str, str]] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this result.

        The six scalar fields are always present; ``file_mismatches`` is
        emitted only when non-empty.
        """
        d: dict[str, Any] = {
            "test_case_index": self.test_case_index,
            "passed": self.passed,
            "actual_stdout": self.actual_stdout,
            "expected_stdout": self.expected_stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
        }
        if self.file_mismatches:
            d["file_mismatches"] = self.file_mismatches
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestResult:
        """Build a test result from a dict shaped like ``to_dict`` output.

        A missing (or explicitly null) ``file_mismatches`` becomes an empty
        dict.

        Raises:
            KeyError: if any of the six scalar keys is missing.
        """
        return cls(
            test_case_index=d["test_case_index"],
            passed=d["passed"],
            actual_stdout=d["actual_stdout"],
            expected_stdout=d["expected_stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            # Shallow copy so the outer mapping is not shared with the input;
            # ``or {}`` also tolerates an explicit null value.
            file_mismatches=dict(d.get("file_mismatches") or {}),
        )
|
|
|
|
|
|
@dataclass
class LanguageResult:
    """Results for one language: the solution, its test results and questionnaire."""

    language: str
    solution_code: str
    test_results: list[TestResult]
    all_passed: bool
    # NOTE(review): presumably the number of agent loop iterations used - confirm.
    agent_turns: int
    questionnaire: list[QuestionnaireResponse] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this result.

        Nested test results and questionnaire responses are serialized
        through their own ``to_dict``; ``questionnaire`` is always present,
        even when empty.
        """
        return {
            "language": self.language,
            "solution_code": self.solution_code,
            "test_results": [tr.to_dict() for tr in self.test_results],
            "all_passed": self.all_passed,
            "agent_turns": self.agent_turns,
            "questionnaire": [q.to_dict() for q in self.questionnaire],
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> LanguageResult:
        """Build a language result from a dict shaped like ``to_dict`` output.

        A missing (or explicitly null) ``questionnaire`` becomes an empty
        list.

        Raises:
            KeyError: if ``language``, ``solution_code``, ``test_results``,
                ``all_passed`` or ``agent_turns`` is missing.
        """
        return cls(
            language=d["language"],
            solution_code=d["solution_code"],
            test_results=[TestResult.from_dict(tr) for tr in d["test_results"]],
            all_passed=d["all_passed"],
            agent_turns=d["agent_turns"],
            # ``or []`` tolerates both an absent key and an explicit null.
            questionnaire=[
                QuestionnaireResponse.from_dict(q) for q in d.get("questionnaire") or []
            ],
        )
|
|
|
|
|
|
@dataclass
class QuestionnaireResponse:
    """One answered questionnaire item."""

    question: str
    # Either a string or an integer answer, per the annotation.
    selected: str | int
    # NOTE(review): presumably the options offered for a multiple-choice
    # question, None otherwise - confirm against the questionnaire code.
    choices: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this response.

        ``choices`` is emitted only when it is not ``None`` (an empty list is
        still emitted).
        """
        d: dict[str, Any] = {"question": self.question, "selected": self.selected}
        if self.choices is not None:
            d["choices"] = self.choices
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> QuestionnaireResponse:
        """Build a response from a dict shaped like ``to_dict`` output.

        A missing ``choices`` key becomes ``None``.

        Raises:
            KeyError: if ``question`` or ``selected`` is missing.
        """
        return cls(
            question=d["question"],
            selected=d["selected"],
            choices=d.get("choices"),
        )
|
|
|
|
|
|
@dataclass
class BenchmarkResult:
    """Top-level record of one benchmark run of a task.

    Holds the run metadata plus an optional per-language result for Bash and
    for Lush; either may be ``None``.
    """

    task_name: str
    category: str
    provider: str
    model: str
    timestamp: str
    bash_result: LanguageResult | None
    lush_result: LanguageResult | None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this benchmark result.

        ``bash_result`` and ``lush_result`` are always present as keys; each
        is the nested result's ``to_dict`` output, or ``None`` when unset.
        """
        return {
            "task_name": self.task_name,
            "category": self.category,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
            "bash_result": self.bash_result.to_dict() if self.bash_result is not None else None,
            "lush_result": self.lush_result.to_dict() if self.lush_result is not None else None,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> BenchmarkResult:
        """Build a benchmark result from a dict shaped like ``to_dict`` output.

        A missing, null or otherwise falsy ``bash_result`` / ``lush_result``
        becomes ``None``; anything else is rebuilt with
        ``LanguageResult.from_dict``.

        Raises:
            KeyError: if ``task_name``, ``category``, ``provider``, ``model``
                or ``timestamp`` is missing.
        """
        bash_raw = d.get("bash_result")
        lush_raw = d.get("lush_result")
        return cls(
            task_name=d["task_name"],
            category=d["category"],
            provider=d["provider"],
            model=d["model"],
            timestamp=d["timestamp"],
            bash_result=LanguageResult.from_dict(bash_raw) if bash_raw else None,
            lush_result=LanguageResult.from_dict(lush_raw) if lush_raw else None,
        )
|