Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via the Anthropic Claude provider (retry loop sketched below)
- Test harness executing solutions in sandboxed subprocesses (sketched below)
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
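A minimal sketch of how the agent retry loop might be shaped. ask_model and run_tests are hypothetical stand-ins for the Anthropic provider call and the test harness; only the max_retries budget comes from the Config defined further down.

def solve_with_retries(ask_model, run_tests, cfg: "Config") -> str | None:
    # Ask the model for a solution, run the task's tests, and feed the
    # failure output back as context for the next attempt, up to
    # cfg.max_retries attempts. Returns None if every attempt fails.
    feedback = ""
    for _ in range(cfg.max_retries):
        solution = ask_model(feedback)
        ok, feedback = run_tests(solution)
        if ok:
            return solution
    return None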
40 lines · 1.3 KiB · Python
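And a sketch of the sandboxed execution step, again hedged: run_solution is a hypothetical helper, real sandboxing details (working directory, environment scrubbing) are omitted, and only timeout_seconds and normalize_whitespace are taken from Config.

import subprocess

def run_solution(argv: list[str], stdin_text: str, cfg: "Config") -> str:
    # Execute the candidate script in its own subprocess so a hang or
    # crash cannot take down the harness; subprocess.run raises
    # TimeoutExpired once cfg.timeout_seconds elapses.
    proc = subprocess.run(
        argv,
        input=stdin_text,
        capture_output=True,
        text=True,
        timeout=cfg.timeout_seconds,
    )
    out = proc.stdout
    if cfg.normalize_whitespace:
        # Loose comparison mode: drop trailing spaces and outer blank lines.
        out = "\n".join(line.rstrip() for line in out.splitlines()).strip()
    return out

Both sketches take their knobs from the Config dataclass below.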
from __future__ import annotations

import tomllib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any


@dataclass
class Config:
    """Benchmark settings loaded from config.toml."""

    lush_binary: Path
    max_retries: int = 3
    timeout_seconds: float = 10.0
    normalize_whitespace: bool = True
    output_dir: Path = Path("results")
    provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)

    @classmethod
    def load(cls, path: Path | None = None) -> Config:
        # Default to the config.toml at the repository root.
        if path is None:
            path = Path(__file__).parent.parent / "config.toml"
        raw = tomllib.loads(path.read_text())

        lush = raw.get("lush", {})
        agent = raw.get("agent", {})
        results = raw.get("results", {})

        # Collect provider configs: any top-level table not in the known sections.
        known_sections = {"lush", "agent", "results"}
        provider_configs = {
            k: v for k, v in raw.items()
            if k not in known_sections and isinstance(v, dict)
        }

        return cls(
            lush_binary=Path(lush["binary"]),
            max_retries=agent.get("max_retries", 3),
            timeout_seconds=agent.get("timeout_seconds", 10.0),
            normalize_whitespace=agent.get("normalize_whitespace", True),
            output_dir=Path(results.get("output_dir", "results")),
            provider_configs=provider_configs,
        )
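For reference, a plausible config.toml that this loader accepts. [lush].binary is the only required key (Config gives it no default), and any unrecognized top-level table (the [anthropic] name and its keys here are purely illustrative) lands in provider_configs.

import tempfile
from pathlib import Path

SAMPLE_TOML = """\
[lush]
binary = "/usr/local/bin/lush"   # required: Config has no default for it

[agent]
max_retries = 5
timeout_seconds = 30.0
normalize_whitespace = true

[results]
output_dir = "out"

[anthropic]   # hypothetical provider section; any unknown table works
model = "example-model-name"
"""

with tempfile.TemporaryDirectory() as tmp:
    cfg_path = Path(tmp) / "config.toml"
    cfg_path.write_text(SAMPLE_TOML)
    cfg = Config.load(cfg_path)
    assert cfg.max_retries == 5
    assert cfg.provider_configs == {"anthropic": {"model": "example-model-name"}}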