Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.
- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
This commit is contained in:
0
lush_bench/providers/__init__.py
Normal file
0
lush_bench/providers/__init__.py
Normal file
35
lush_bench/providers/anthropic.py
Normal file
35
lush_bench/providers/anthropic.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import anthropic
|
||||
|
||||
from .base import Message
|
||||
|
||||
|
||||
class AnthropicProvider:
    """LLM provider that sends conversations through the Anthropic client.

    The client, model name, and token limit are fixed at construction time
    from a configuration mapping; ``send`` performs one request per call.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Create the Anthropic client from *config*.

        Recognized keys (all optional):
            api_key_env: Name of the environment variable holding the API
                key. Defaults to ``"ANTHROPIC_API_KEY"``.
            model: Model identifier passed on every request.
            max_tokens: ``max_tokens`` value passed on every request.
                Defaults to 4096.

        Raises:
            RuntimeError: If the selected environment variable is unset or
                empty.
        """
        api_key_env = config.get("api_key_env", "ANTHROPIC_API_KEY")
        api_key = os.environ.get(api_key_env)
        if not api_key:
            raise RuntimeError(f"Set {api_key_env} environment variable")
        self._client = anthropic.Anthropic(api_key=api_key)
        self._model = config.get("model", "claude-sonnet-4-20250514")
        self._max_tokens = config.get("max_tokens", 4096)

    def send(self, messages: list[Message], system: str = "") -> str:
        """Send *messages* to the model and return the reply text.

        Args:
            messages: Conversation so far; each item's ``role`` and
                ``content`` are forwarded as-is.
            system: Optional system prompt. It is only included in the
                request when non-empty.

        Returns:
            The concatenated text of every text block in the response, or
            an empty string when the response contains no text blocks.
        """
        api_messages = [{"role": m.role, "content": m.content} for m in messages]
        kwargs: dict[str, Any] = {
            "model": self._model,
            "max_tokens": self._max_tokens,
            "messages": api_messages,
        }
        if system:
            kwargs["system"] = system
        response = self._client.messages.create(**kwargs)
        # The content list may be empty or may start with a non-text block,
        # so indexing [0].text is unsafe; collect every text block instead.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

    @property
    def model_name(self) -> str:
        """Return the model identifier used for requests."""
        return self._model
|
||||
17
lush_bench/providers/base.py
Normal file
17
lush_bench/providers/base.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
@dataclass
class Message:
    """A single conversation turn exchanged with an LLM provider."""

    # Speaker of the turn: "user" or "assistant".
    role: str
    # Text of the turn.
    content: str
|
||||
|
||||
|
||||
class LLMProvider(Protocol):
    """Structural interface for LLM backends.

    Any object with a matching ``send`` method and ``model_name`` property
    satisfies this protocol; no inheritance is required.
    """

    def send(self, messages: list[Message], system: str = "") -> str:
        """Send *messages* (with an optional *system* prompt) and return the reply text."""
        ...

    @property
    def model_name(self) -> str:
        """Return the identifier of the model this provider uses."""
        ...
|
||||
Reference in New Issue
Block a user