Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via an Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code-quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
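For orientation, a rough sketch of how the agent loop in `lush_bench/agent.py` (below) is meant to be driven. Only `solve_task`, the `LanguageResult` fields, and the `Task`/`TestCase`/`Config` field names come from the diff; the dataclass-style constructors, the `AnthropicProvider` class, and its module path are assumptions for illustration.

```python
# Hypothetical driver; Task/TestCase/Config kwargs and AnthropicProvider are assumed.
from lush_bench.agent import solve_task
from lush_bench.config import Config
from lush_bench.models import Task, TestCase
from lush_bench.providers.anthropic import AnthropicProvider  # module/class name assumed

task = Task(
    name="sum-two-ints",
    description="Read two integers from stdin and print their sum.",
    category="a",  # Category A: write from scratch in both languages
    test_cases=[TestCase(stdin="2 3\n", expected_stdout="5\n")],
)
config = Config(max_retries=2)          # other Config fields omitted
provider = AnthropicProvider()          # constructor args (model, API key) assumed

# Solve the same task in both languages and compare outcomes.
for language in ("bash", "lush"):
    result = solve_task(provider, task, language, config)
    status = "passed" if result.all_passed else "failed"
    print(f"{language}: {status} in {result.agent_turns} turn(s)")
```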
lush_bench/agent.py (new file, 164 lines)
@@ -0,0 +1,164 @@
from __future__ import annotations

import re
from pathlib import Path

from .config import Config
from .harness import evaluate
from .models import LanguageResult, Task, TestCase
from .providers.base import LLMProvider, Message
from .questionnaire import run_questionnaire

LUSH_REFERENCE_PATH = Path(__file__).parent.parent / "lush_reference.md"


def load_lush_reference() -> str:
    return LUSH_REFERENCE_PATH.read_text()


def extract_code(response: str, language: str) -> str | None:
    """Extract the last fenced code block from the response."""
    if language == "bash":
        patterns = [r"```(?:bash|sh)\n(.*?)```", r"```\n(.*?)```"]
    else:
        patterns = [r"```(?:lua|lush)\n(.*?)```", r"```\n(.*?)```"]

    for pattern in patterns:
        matches = re.findall(pattern, response, re.DOTALL)
        if matches:
            return matches[-1].strip()
    return None
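
# Example: the labeled pattern is tried first; the bare ``` pattern is only a
# fallback for unlabeled blocks.
#   extract_code("Sure:\n```bash\necho hi\n```", "bash")  -> "echo hi"
#   extract_code("```\necho hi\n```", "bash")             -> "echo hi"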
def build_system_prompt(language: str) -> str:
    base = (
        "You are a skilled programmer. Write solutions that read from stdin and write to stdout. "
        "Output ONLY the code in a single fenced code block. No explanations."
    )
    if language == "lush":
        ref = load_lush_reference()
        return f"{base}\n\nYou are writing in lush, a Lua-based shell language. Here is the language reference:\n\n{ref}"
    return f"{base}\n\nYou are writing in bash."


def _describe_test_case(tc: TestCase, index: int) -> str:
    """Build a human-readable description of a test case for the agent."""
    parts = [f"Test case {index}:"]
    if tc.stdin:
        parts.append(f" Input (stdin):\n{tc.stdin}")
    if tc.env:
        parts.append(f" Environment variables: {tc.env}")
    if tc.setup_files:
        for fname, content in tc.setup_files.items():
            parts.append(f" File in working directory ({fname}):\n{content}")
    if tc.expected_stdout:
        parts.append(f" Expected stdout:\n{tc.expected_stdout}")
    if tc.expected_files:
        for fname, content in tc.expected_files.items():
            parts.append(f" Expected file ({fname}):\n{content}")
    return "\n".join(parts)

def build_task_prompt(task: Task, language: str) -> str:
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += "Your script runs in an isolated working directory. "
    prompt += "Any files listed as setup files will exist in that directory before your script runs.\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):  # Show the first two as examples
        prompt += "\n" + _describe_test_case(tc, i) + "\n"
    # Lush is Lua-based, so lush solutions are requested in a ```lua fence;
    # extract_code accepts both ```lua and ```lush.
    lang_label = "bash" if language == "bash" else "lua"
    prompt += f"\nWrite the solution in a ```{lang_label} code block."
    return prompt


def build_conversion_prompt(task: Task) -> str:
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += f"Here is the bash source to convert to lush (Lua-based shell):\n\n```bash\n{task.bash_source}\n```\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):
        prompt += f"\nInput:\n{tc.stdin}\nExpected output:\n{tc.expected_stdout}\n"
    prompt += "\nConvert this to lush. Write the solution in a ```lua code block."
    return prompt


def build_failure_feedback(test_results: list, task: Task) -> str:
    lines = ["Your solution failed some test cases:\n"]
    for tr in test_results:
        if not tr.passed:
            tc = task.test_cases[tr.test_case_index]
            lines.append(f"Test case {tr.test_case_index}:")
            lines.append(f" Input: {tc.stdin!r}")
            lines.append(f" Expected stdout: {tc.expected_stdout!r}")
            lines.append(f" Got stdout: {tr.actual_stdout!r}")
            if tr.stderr:
                lines.append(f" Stderr: {tr.stderr!r}")
            if tc.env:
                lines.append(f" Environment vars: {tc.env}")
            if tc.setup_files:
                lines.append(f" Files in working directory: {list(tc.setup_files.keys())}")
            for fname, mismatch in tr.file_mismatches.items():
                lines.append(f" File {fname!r}: expected {mismatch['expected']!r}, got {mismatch['actual']!r}")
            lines.append("")
    lines.append("Please fix your solution. Output ONLY the corrected code in a fenced code block.")
    return "\n".join(lines)

def solve_task(
    provider: LLMProvider,
    task: Task,
    language: str,
    config: Config,
) -> LanguageResult:
    """Run the agent loop: prompt -> code -> test -> retry."""
    system = build_system_prompt(language)

    if task.category == "b" and language == "lush":
        user_prompt = build_conversion_prompt(task)
    else:
        user_prompt = build_task_prompt(task, language)

    messages: list[Message] = [Message(role="user", content=user_prompt)]
    turns = 0
    # Initialize so the fallback return below cannot raise NameError.
    code: str | None = None
    test_results: list = []

    for attempt in range(1 + config.max_retries):
        turns += 1
        response = provider.send(messages, system=system)
        messages.append(Message(role="assistant", content=response))

        code = extract_code(response, language)
        if code is None:
            if attempt < config.max_retries:
                feedback = "I couldn't find a code block in your response. Please provide your solution in a fenced code block."
                messages.append(Message(role="user", content=feedback))
                continue
            return LanguageResult(
                language=language,
                solution_code="",
                test_results=[],
                all_passed=False,
                agent_turns=turns,
            )

        test_results = evaluate(task, code, language, config)
        all_passed = all(tr.passed for tr in test_results)

        if all_passed or attempt == config.max_retries:
            return LanguageResult(
                language=language,
                solution_code=code,
                test_results=test_results,
                all_passed=all_passed,
                agent_turns=turns,
            )

        feedback = build_failure_feedback(test_results, task)
        messages.append(Message(role="user", content=feedback))

    # Unreachable in practice: every path through the final attempt returns above.
    return LanguageResult(
        language=language,
        solution_code=code if code else "",
        test_results=test_results if test_results else [],
        all_passed=False,
        agent_turns=turns,
    )