Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via an Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code-quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
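For orientation, a rough sketch of how the agent loop in `lush_bench/agent.py` (below) is meant to be driven. Only `solve_task`, the `LanguageResult` fields, and the `Task`/`TestCase`/`Config` field names come from the diff; the dataclass-style constructors, the `AnthropicProvider` class, and its module path are assumptions for illustration.

```python
# Hypothetical driver; Task/TestCase/Config kwargs and AnthropicProvider are assumed.
from lush_bench.agent import solve_task
from lush_bench.config import Config
from lush_bench.models import Task, TestCase
from lush_bench.providers.anthropic import AnthropicProvider  # module/class name assumed

task = Task(
    name="sum-two-ints",
    description="Read two integers from stdin and print their sum.",
    category="a",  # Category A: write from scratch in both languages
    test_cases=[TestCase(stdin="2 3\n", expected_stdout="5\n")],
)
config = Config(max_retries=2)          # other Config fields omitted
provider = AnthropicProvider()          # constructor args (model, API key) assumed

# Solve the same task in both languages and compare outcomes.
for language in ("bash", "lush"):
    result = solve_task(provider, task, language, config)
    status = "passed" if result.all_passed else "failed"
    print(f"{language}: {status} in {result.agent_turns} turn(s)")
```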
lush_bench/agent.py (new file, 164 lines)
@@ -0,0 +1,164 @@
from __future__ import annotations

import re
from pathlib import Path

from .config import Config
from .harness import evaluate
from .models import LanguageResult, Task, TestCase
from .providers.base import LLMProvider, Message
from .questionnaire import run_questionnaire

LUSH_REFERENCE_PATH = Path(__file__).parent.parent / "lush_reference.md"


def load_lush_reference() -> str:
    return LUSH_REFERENCE_PATH.read_text()


def extract_code(response: str, language: str) -> str | None:
    """Extract the last fenced code block from the response."""
    if language == "bash":
        patterns = [r"```(?:bash|sh)\n(.*?)```", r"```\n(.*?)```"]
    else:
        patterns = [r"```(?:lua|lush)\n(.*?)```", r"```\n(.*?)```"]

    for pattern in patterns:
        matches = re.findall(pattern, response, re.DOTALL)
        if matches:
            return matches[-1].strip()
    return None
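
# Example: the labeled pattern is tried first; the bare ``` pattern is only a
# fallback for unlabeled blocks.
#   extract_code("Sure:\n```bash\necho hi\n```", "bash")  -> "echo hi"
#   extract_code("```\necho hi\n```", "bash")             -> "echo hi"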
def build_system_prompt(language: str) -> str:
    base = (
        "You are a skilled programmer. Write solutions that read from stdin and write to stdout. "
        "Output ONLY the code in a single fenced code block. No explanations."
    )
    if language == "lush":
        ref = load_lush_reference()
        return f"{base}\n\nYou are writing in lush, a Lua-based shell language. Here is the language reference:\n\n{ref}"
    return f"{base}\n\nYou are writing in bash."


def _describe_test_case(tc: TestCase, index: int) -> str:
    """Build a human-readable description of a test case for the agent."""
    parts = [f"Test case {index}:"]
    if tc.stdin:
        parts.append(f" Input (stdin):\n{tc.stdin}")
    if tc.env:
        parts.append(f" Environment variables: {tc.env}")
    if tc.setup_files:
        for fname, content in tc.setup_files.items():
            parts.append(f" File in working directory ({fname}):\n{content}")
    if tc.expected_stdout:
        parts.append(f" Expected stdout:\n{tc.expected_stdout}")
    if tc.expected_files:
        for fname, content in tc.expected_files.items():
            parts.append(f" Expected file ({fname}):\n{content}")
    return "\n".join(parts)

def build_task_prompt(task: Task, language: str) -> str:
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += "Your script runs in an isolated working directory. "
    prompt += "Any files listed as setup files will exist in that directory before your script runs.\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):  # Show the first two as examples
        prompt += "\n" + _describe_test_case(tc, i) + "\n"
    # Lush is Lua-based, so lush solutions are requested in a ```lua fence;
    # extract_code accepts both ```lua and ```lush.
    lang_label = "bash" if language == "bash" else "lua"
    prompt += f"\nWrite the solution in a ```{lang_label} code block."
    return prompt


def build_conversion_prompt(task: Task) -> str:
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += f"Here is the bash source to convert to lush (Lua-based shell):\n\n```bash\n{task.bash_source}\n```\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):
        prompt += f"\nInput:\n{tc.stdin}\nExpected output:\n{tc.expected_stdout}\n"
    prompt += "\nConvert this to lush. Write the solution in a ```lua code block."
    return prompt


def build_failure_feedback(test_results: list, task: Task) -> str:
    lines = ["Your solution failed some test cases:\n"]
    for tr in test_results:
        if not tr.passed:
            tc = task.test_cases[tr.test_case_index]
            lines.append(f"Test case {tr.test_case_index}:")
            lines.append(f" Input: {tc.stdin!r}")
            lines.append(f" Expected stdout: {tc.expected_stdout!r}")
            lines.append(f" Got stdout: {tr.actual_stdout!r}")
            if tr.stderr:
                lines.append(f" Stderr: {tr.stderr!r}")
            if tc.env:
                lines.append(f" Environment vars: {tc.env}")
            if tc.setup_files:
                lines.append(f" Files in working directory: {list(tc.setup_files.keys())}")
            for fname, mismatch in tr.file_mismatches.items():
                lines.append(f" File {fname!r}: expected {mismatch['expected']!r}, got {mismatch['actual']!r}")
            lines.append("")
    lines.append("Please fix your solution. Output ONLY the corrected code in a fenced code block.")
    return "\n".join(lines)

def solve_task(
    provider: LLMProvider,
    task: Task,
    language: str,
    config: Config,
) -> LanguageResult:
    """Run the agent loop: prompt -> code -> test -> retry."""
    system = build_system_prompt(language)

    if task.category == "b" and language == "lush":
        user_prompt = build_conversion_prompt(task)
    else:
        user_prompt = build_task_prompt(task, language)

    messages: list[Message] = [Message(role="user", content=user_prompt)]
    turns = 0
    # Initialize so the fallback return below cannot raise NameError.
    code: str | None = None
    test_results: list = []

    for attempt in range(1 + config.max_retries):
        turns += 1
        response = provider.send(messages, system=system)
        messages.append(Message(role="assistant", content=response))

        code = extract_code(response, language)
        if code is None:
            if attempt < config.max_retries:
                feedback = "I couldn't find a code block in your response. Please provide your solution in a fenced code block."
                messages.append(Message(role="user", content=feedback))
                continue
            return LanguageResult(
                language=language,
                solution_code="",
                test_results=[],
                all_passed=False,
                agent_turns=turns,
            )

        test_results = evaluate(task, code, language, config)
        all_passed = all(tr.passed for tr in test_results)

        if all_passed or attempt == config.max_retries:
            return LanguageResult(
                language=language,
                solution_code=code,
                test_results=test_results,
                all_passed=all_passed,
                agent_turns=turns,
            )

        feedback = build_failure_feedback(test_results, task)
        messages.append(Message(role="user", content=feedback))

    # Unreachable in practice: every path through the final attempt returns above.
    return LanguageResult(
        language=language,
        solution_code=code if code else "",
        test_results=test_results if test_results else [],
        all_passed=False,
        agent_turns=turns,
    )