from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class TestCase:
    """A single test case with plain-dict (de)serialization.

    ``stdin`` and ``expected_stdout`` are always serialized; ``env``,
    ``setup_files`` and ``expected_files`` are optional mappings that are
    omitted from the serialized form when empty.
    """

    # Deliberately unannotated so it is a plain class attribute and NOT a
    # dataclass field: it tells pytest not to treat this ``Test*``-named
    # class as a test class when it is imported into a test module.
    __test__ = False

    stdin: str
    expected_stdout: str
    env: dict[str, str] = field(default_factory=dict)
    setup_files: dict[str, str] = field(default_factory=dict)
    expected_files: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form; empty optional mappings are left out."""
        d: dict[str, Any] = {
            "stdin": self.stdin,
            "expected_stdout": self.expected_stdout,
        }
        if self.env:
            d["env"] = self.env
        if self.setup_files:
            d["setup_files"] = self.setup_files
        if self.expected_files:
            d["expected_files"] = self.expected_files
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestCase:
        """Build a ``TestCase`` from its dict form.

        Raises:
            KeyError: if ``stdin`` or ``expected_stdout`` is missing.
        """
        return cls(
            stdin=d["stdin"],
            expected_stdout=d["expected_stdout"],
            env=d.get("env", {}),
            setup_files=d.get("setup_files", {}),
            expected_files=d.get("expected_files", {}),
        )


@dataclass
class Task:
    """A named task with a description and its list of test cases."""

    name: str
    category: str  # "algorithm", "pipeline", "environment", "filesystem", "process"
    description: str
    test_cases: list[TestCase]
    mode: str = "solve"  # "solve" or "convert"
    bash_source: str | None = None  # convert mode only

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form; ``bash_source`` is included only when set."""
        d: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
            "mode": self.mode,
            "description": self.description,
            "test_cases": [tc.to_dict() for tc in self.test_cases],
        }
        if self.bash_source is not None:
            d["bash_source"] = self.bash_source
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Task:
        """Build a ``Task`` from its dict form.

        ``mode`` falls back to ``"solve"`` and ``bash_source`` to ``None``
        when absent.

        Raises:
            KeyError: if ``name``, ``category``, ``description`` or
                ``test_cases`` is missing.
        """
        return cls(
            name=d["name"],
            category=d["category"],
            description=d["description"],
            test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
            mode=d.get("mode", "solve"),
            bash_source=d.get("bash_source"),
        )


@dataclass
class RunOutput:
    """Captured stdout, stderr, exit code and runtime (ms) of one run."""

    stdout: str
    stderr: str
    exit_code: int
    runtime_ms: float

    def to_dict(self) -> dict[str, Any]:
        """Return all four fields as a dict."""
        return {
            "stdout": self.stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
            "runtime_ms": self.runtime_ms,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> RunOutput:
        """Build a ``RunOutput`` from its dict form.

        Raises:
            KeyError: if any of the four keys is missing.
        """
        return cls(
            stdout=d["stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            runtime_ms=d["runtime_ms"],
        )


@dataclass
class TestResult:
    """Outcome of one test case, identified by ``test_case_index``.

    ``file_mismatches`` is optional and omitted from the serialized form
    when empty.
    """

    # Unannotated class attribute (not a dataclass field): keeps pytest from
    # trying to collect this ``Test*``-named class as a test class.
    __test__ = False

    test_case_index: int
    passed: bool
    actual_stdout: str
    expected_stdout: str
    stderr: str
    exit_code: int
    file_mismatches: dict[str, dict[str, str]] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form; ``file_mismatches`` is left out when empty."""
        d: dict[str, Any] = {
            "test_case_index": self.test_case_index,
            "passed": self.passed,
            "actual_stdout": self.actual_stdout,
            "expected_stdout": self.expected_stdout,
            "stderr": self.stderr,
            "exit_code": self.exit_code,
        }
        if self.file_mismatches:
            d["file_mismatches"] = self.file_mismatches
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TestResult:
        """Build a ``TestResult`` from its dict form.

        Raises:
            KeyError: if any key other than ``file_mismatches`` is missing.
        """
        return cls(
            test_case_index=d["test_case_index"],
            passed=d["passed"],
            actual_stdout=d["actual_stdout"],
            expected_stdout=d["expected_stdout"],
            stderr=d["stderr"],
            exit_code=d["exit_code"],
            file_mismatches=d.get("file_mismatches", {}),
        )


@dataclass
class LanguageResult:
    """Per-language result: solution code, test results and questionnaire."""

    language: str
    solution_code: str
    test_results: list[TestResult]
    all_passed: bool
    agent_turns: int
    questionnaire: list[QuestionnaireResponse] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form with nested results/responses serialized."""
        return {
            "language": self.language,
            "solution_code": self.solution_code,
            "test_results": [tr.to_dict() for tr in self.test_results],
            "all_passed": self.all_passed,
            "agent_turns": self.agent_turns,
            "questionnaire": [q.to_dict() for q in self.questionnaire],
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> LanguageResult:
        """Build a ``LanguageResult`` from its dict form.

        A missing ``questionnaire`` key yields an empty list.

        Raises:
            KeyError: if any other key is missing.
        """
        return cls(
            language=d["language"],
            solution_code=d["solution_code"],
            test_results=[TestResult.from_dict(tr) for tr in d["test_results"]],
            all_passed=d["all_passed"],
            agent_turns=d["agent_turns"],
            questionnaire=[
                QuestionnaireResponse.from_dict(q)
                for q in d.get("questionnaire", [])
            ],
        )


@dataclass
class QuestionnaireResponse:
    """A question, the selected answer, and optionally the offered choices."""

    question: str
    selected: str | int
    choices: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form; ``choices`` is included only when not None."""
        d: dict[str, Any] = {"question": self.question, "selected": self.selected}
        if self.choices is not None:
            d["choices"] = self.choices
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> QuestionnaireResponse:
        """Build a ``QuestionnaireResponse`` from its dict form.

        Raises:
            KeyError: if ``question`` or ``selected`` is missing.
        """
        return cls(
            question=d["question"],
            selected=d["selected"],
            choices=d.get("choices"),
        )


@dataclass
class BenchmarkResult:
    """Top-level result for one task, with optional bash and lush results."""

    task_name: str
    category: str
    mode: str = "solve"  # "solve" or "convert"
    provider: str = ""
    model: str = ""
    timestamp: str = ""
    bash_result: LanguageResult | None = None
    lush_result: LanguageResult | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form; absent language results serialize as None."""
        return {
            "task_name": self.task_name,
            "category": self.category,
            "mode": self.mode,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
            "bash_result": (
                self.bash_result.to_dict() if self.bash_result is not None else None
            ),
            "lush_result": (
                self.lush_result.to_dict() if self.lush_result is not None else None
            ),
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> BenchmarkResult:
        """Build a ``BenchmarkResult`` from its dict form.

        Only ``task_name`` and ``category`` are required. Every other key
        falls back to the default declared on the dataclass, so payloads
        that omit ``provider``, ``model`` or ``timestamp`` still load.

        Raises:
            KeyError: if ``task_name`` or ``category`` is missing.
        """
        bash_raw = d.get("bash_result")
        lush_raw = d.get("lush_result")
        return cls(
            task_name=d["task_name"],
            category=d["category"],
            mode=d.get("mode", "solve"),
            # Optional on the dataclass, so optional on input too.
            provider=d.get("provider", ""),
            model=d.get("model", ""),
            timestamp=d.get("timestamp", ""),
            # A missing, None or empty nested payload means "no result".
            bash_result=LanguageResult.from_dict(bash_raw) if bash_raw else None,
            lush_result=LanguageResult.from_dict(lush_raw) if lush_raw else None,
        )