Initial commit: Lush vs Bash AI benchmarking framework

Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00
commit be8d657b24
33 changed files with 3302 additions and 0 deletions
--- a/lush_bench/providers/__init__.py
+++ b/lush_bench/providers/__init__.py
--- a/lush_bench/providers/anthropic.py
+++ b/lush_bench/providers/anthropic.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import os
+from typing import Any
+
+import anthropic
+
+from .base import Message
+
+
+class AnthropicProvider:
+    """LLM provider that sends chat messages through the Anthropic SDK client."""
+
+    def __init__(self, config: dict[str, Any]) -> None:
+        """Build the SDK client and read request defaults from ``config``.
+
+        Recognised keys (all optional):
+            api_key_env: name of the environment variable that holds the API
+                key (default ``"ANTHROPIC_API_KEY"``).
+            model: model identifier (default ``"claude-sonnet-4-20250514"``).
+            max_tokens: ``max_tokens`` value sent with every request
+                (default ``4096``).
+
+        Raises:
+            RuntimeError: if the named environment variable is unset or empty.
+        """
+        api_key_env = config.get("api_key_env", "ANTHROPIC_API_KEY")
+        api_key = os.environ.get(api_key_env)
+        if not api_key:
+            raise RuntimeError(f"Set {api_key_env} environment variable")
+        self._client = anthropic.Anthropic(api_key=api_key)
+        self._model = config.get("model", "claude-sonnet-4-20250514")
+        self._max_tokens = config.get("max_tokens", 4096)
+
+    def send(self, messages: list[Message], system: str = "") -> str:
+        """Send the conversation and return the reply text.
+
+        Args:
+            messages: conversation turns; each one's ``role`` and ``content``
+                are forwarded to the API unchanged.
+            system: optional system prompt; omitted from the request when empty.
+
+        Returns:
+            The concatenated text of every text block in the response, or an
+            empty string when the response contains no text block.
+        """
+        api_messages = [{"role": m.role, "content": m.content} for m in messages]
+        kwargs: dict[str, Any] = {
+            "model": self._model,
+            "max_tokens": self._max_tokens,
+            "messages": api_messages,
+        }
+        if system:
+            kwargs["system"] = system
+        response = self._client.messages.create(**kwargs)
+        # The response content is a list of typed blocks. Indexing [0].text
+        # fails on an empty list or a leading non-text block, and drops any
+        # later text blocks, so collect every text block instead.
+        return "".join(
+            block.text
+            for block in response.content
+            if getattr(block, "type", None) == "text"
+        )
+
+    @property
+    def model_name(self) -> str:
+        """Model identifier this provider sends with each request."""
+        return self._model
--- a/lush_bench/providers/base.py
+++ b/lush_bench/providers/base.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Protocol
+
+
+@dataclass
+class Message:
+    """One conversation turn exchanged with an LLM provider."""
+
+    # Speaker of the turn.
+    role: str  # "user" or "assistant"
+    # Body of the turn, passed along as the message content.
+    content: str
+
+
+class LLMProvider(Protocol):
+    """Structural interface that every LLM backend is expected to satisfy."""
+
+    def send(self, messages: list[Message], system: str = "") -> str:
+        """Return the reply text for ``messages``, given an optional ``system`` prompt."""
+
+    @property
+    def model_name(self) -> str:
+        """Name of the model the provider is using."""