Files
lush_grading/lush_bench/providers/anthropic.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00

36 lines
1.1 KiB
Python

from __future__ import annotations
import os
from typing import Any
import anthropic
from .base import Message
class AnthropicProvider:
    """LLM provider backed by the Anthropic Messages API.

    Reads the API key from an environment variable (name configurable via
    ``api_key_env`` in the provider config) and exposes a minimal ``send()``
    that maps the framework's ``Message`` objects onto Anthropic's wire
    format.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Build the Anthropic client from a provider config dict.

        Args:
            config: Recognized keys: ``api_key_env`` (env var holding the
                API key, default ``ANTHROPIC_API_KEY``), ``model`` (default
                ``claude-sonnet-4-20250514``), ``max_tokens`` (default 4096).

        Raises:
            RuntimeError: If the API-key environment variable is unset or
                empty.
        """
        api_key_env = config.get("api_key_env", "ANTHROPIC_API_KEY")
        api_key = os.environ.get(api_key_env)
        if not api_key:
            raise RuntimeError(f"Set {api_key_env} environment variable")
        self._client = anthropic.Anthropic(api_key=api_key)
        self._model = config.get("model", "claude-sonnet-4-20250514")
        self._max_tokens = config.get("max_tokens", 4096)

    def send(self, messages: list[Message], system: str = "") -> str:
        """Send a conversation to the model and return its text reply.

        Args:
            messages: Ordered conversation turns (each with ``role`` and
                ``content`` attributes).
            system: Optional system prompt; omitted from the request when
                empty, since the API rejects an empty system string.

        Returns:
            The concatenated text of every text content block in the
            response.  Reading ``response.content[0].text`` directly would
            raise IndexError on an empty response and AttributeError when
            the first block is not a text block (e.g. tool use), so we
            filter and join instead.
        """
        api_messages = [{"role": m.role, "content": m.content} for m in messages]
        kwargs: dict[str, Any] = {
            "model": self._model,
            "max_tokens": self._max_tokens,
            "messages": api_messages,
        }
        if system:
            kwargs["system"] = system
        response = self._client.messages.create(**kwargs)
        # Non-text blocks (tool_use, etc.) carry no .text; skip them.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

    @property
    def model_name(self) -> str:
        """Name of the configured Anthropic model."""
        return self._model