Files
lush_grading/lush_bench/agent.py
Cormac Shannon 20e62f60f6 Reorganize task categories from opaque a/b to descriptive names
Replace category_a/category_b directories with algorithm, pipeline,
environment, filesystem, and process. Add separate mode field (solve/convert)
to decouple orchestration from capability grouping. Add per-category
summary and questionnaire breakdowns to both terminal report and HTML export.
2026-03-29 20:59:01 +01:00

165 lines
6.2 KiB
Python

from __future__ import annotations
import re
from pathlib import Path
from .config import Config
from .harness import evaluate
from .models import LanguageResult, Task, TestCase
from .providers.base import LLMProvider, Message
from .questionnaire import run_questionnaire
# Location of the lush language reference document: ``lush_reference.md`` in
# the parent of the directory that contains this module.
LUSH_REFERENCE_PATH = Path(__file__).parent.parent / "lush_reference.md"
def load_lush_reference() -> str:
    """Return the full text of the lush language reference.

    The file at ``LUSH_REFERENCE_PATH`` is decoded explicitly as UTF-8 so the
    result does not depend on the platform's locale encoding.

    Raises:
        OSError: If the reference file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    return LUSH_REFERENCE_PATH.read_text(encoding="utf-8")
# One fenced block: an opening ``` with an optional info string on the same
# line, the body, and the next ``` as the closing fence. Scanning with
# ``finditer`` consumes each closing fence, so it is never mistaken for the
# opening fence of a following block.
_FENCED_BLOCK_RE = re.compile(r"```([^\n`]*)\n(.*?)```", re.DOTALL)


def extract_code(response: str, language: str) -> str | None:
    """Extract the solution code from a fenced block in ``response``.

    Selection rules:

    * The last block tagged for the requested language wins
      (``bash``/``sh`` when ``language == "bash"``, otherwise ``lua``/``lush``).
    * If no such block exists, the last block without any tag is used.
    * Blocks tagged with some other language are ignored.

    The tag is the first word of the fence's info string, compared
    case-insensitively, so ``\\r\\n`` line endings and trailing spaces after
    the tag do not prevent a match.

    Args:
        response: Raw model output.
        language: Target language of the task ("bash" selects the bash tags;
            anything else selects the lua/lush tags).

    Returns:
        The chosen block's content with surrounding whitespace stripped, or
        ``None`` when no usable block is present.
    """
    wanted_tags = ("bash", "sh") if language == "bash" else ("lua", "lush")

    last_tagged: str | None = None
    last_untagged: str | None = None
    for fence in _FENCED_BLOCK_RE.finditer(response):
        info_words = fence.group(1).split()
        tag = info_words[0].lower() if info_words else ""
        if tag in wanted_tags:
            last_tagged = fence.group(2)
        elif not tag:
            last_untagged = fence.group(2)

    # A language-tagged block is preferred over an untagged one, even when the
    # untagged block appears later in the response.
    chosen = last_tagged if last_tagged is not None else last_untagged
    return None if chosen is None else chosen.strip()
def build_system_prompt(language: str) -> str:
    """Compose the system prompt for the given target language.

    Every prompt starts with the same general instructions. For ``"lush"`` the
    language reference returned by ``load_lush_reference()`` is appended; any
    other value yields the bash variant.
    """
    preamble = (
        "You are a skilled programmer. Write solutions that read from stdin and write to stdout. "
        "Output ONLY the code in a single fenced code block. No explanations."
    )
    if language != "lush":
        return f"{preamble}\n\nYou are writing in bash."

    reference_text = load_lush_reference()
    return (
        f"{preamble}\n\nYou are writing in lush, a Lua-based shell language. "
        f"Here is the language reference:\n\n{reference_text}"
    )
def _describe_test_case(tc: TestCase, index: int) -> str:
"""Build a human-readable description of a test case for the agent."""
parts = [f"Test case {index}:"]
if tc.stdin:
parts.append(f" Input (stdin):\n{tc.stdin}")
if tc.env:
parts.append(f" Environment variables: {tc.env}")
if tc.setup_files:
for fname, content in tc.setup_files.items():
parts.append(f" File in working directory ({fname}):\n{content}")
if tc.expected_stdout:
parts.append(f" Expected stdout:\n{tc.expected_stdout}")
if tc.expected_files:
for fname, content in tc.expected_files.items():
parts.append(f" Expected file ({fname}):\n{content}")
return "\n".join(parts)
def build_task_prompt(task: Task, language: str) -> str:
    """Assemble the initial user prompt for solving ``task`` from scratch.

    The prompt contains the task name and description, a note about the
    isolated working directory, up to two example test cases, and a closing
    instruction naming the fence tag to use (``bash`` for bash, ``lua`` for
    everything else).
    """
    fence_tag = "bash" if language == "bash" else "lua"

    pieces = [
        f"Task: {task.name}\n\n{task.description}\n\n",
        "Your script runs in an isolated working directory. ",
        "Any files listed as setup files will exist in that directory before your script runs.\n\n",
        "Example test cases:\n",
    ]
    # Only the first two test cases are shown to the model as examples.
    pieces.extend(
        "\n" + _describe_test_case(example, position) + "\n"
        for position, example in enumerate(task.test_cases[:2])
    )
    pieces.append(f"\nWrite the solution in a ```{fence_tag} code block.")

    return "".join(pieces)
def build_conversion_prompt(task: Task) -> str:
    """Assemble the initial user prompt for converting a bash script to lush.

    The prompt contains the task name and description, the bash source from
    ``task.bash_source`` in a ``bash`` fence, up to two example test cases,
    and the instruction to answer in a ``lua`` fence.

    Example test cases are rendered with ``_describe_test_case`` -- the same
    helper ``build_task_prompt`` uses -- so environment variables, setup files
    and expected output files are shown, not just stdin and stdout.
    """
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += (
        "Here is the bash source to convert to lush (Lua-based shell):\n\n"
        f"```bash\n{task.bash_source}\n```\n\n"
    )
    prompt += "Example test cases:\n"
    # Only the first two test cases are shown to the model as examples.
    for i, tc in enumerate(task.test_cases[:2]):
        prompt += "\n" + _describe_test_case(tc, i) + "\n"
    prompt += "\nConvert this to lush. Write the solution in a ```lua code block."
    return prompt
def build_failure_feedback(test_results: list, task: Task) -> str:
    """Build the retry message listing every failed test case.

    For each result whose ``passed`` flag is false, the report shows the
    input, expected and actual stdout, and -- when present -- stderr, the
    environment variables, the names of the setup files, and each file
    mismatch. A blank line follows each failed case. Passing results are
    skipped. The message ends with a request for corrected code.
    """
    report = ["Your solution failed some test cases:\n"]

    for result in test_results:
        if result.passed:
            continue

        case = task.test_cases[result.test_case_index]
        report += [
            f"Test case {result.test_case_index}:",
            f" Input: {case.stdin!r}",
            f" Expected stdout: {case.expected_stdout!r}",
            f" Got stdout: {result.actual_stdout!r}",
        ]
        if result.stderr:
            report.append(f" Stderr: {result.stderr!r}")
        if case.env:
            report.append(f" Environment vars: {case.env}")
        if case.setup_files:
            # Only the file names are listed here, not their contents.
            report.append(f" Files in working directory: {list(case.setup_files.keys())}")
        report.extend(
            f" File {path!r}: expected {diff['expected']!r}, got {diff['actual']!r}"
            for path, diff in result.file_mismatches.items()
        )
        report.append("")

    report.append("Please fix your solution. Output ONLY the corrected code in a fenced code block.")
    return "\n".join(report)
def solve_task(
    provider: LLMProvider,
    task: Task,
    language: str,
    config: Config,
) -> LanguageResult:
    """Run the agent loop: prompt -> code -> test -> retry.

    The conversation starts with a single user prompt: the conversion prompt
    when ``task.mode == "convert"`` and the language is lush, otherwise the
    regular task prompt. Each attempt sends the conversation to ``provider``,
    extracts a code block from the reply and runs it through ``evaluate``.
    If no code block is found or a test fails, a feedback message is appended
    and the loop retries until the attempts are exhausted.

    Args:
        provider: Object whose ``send(messages, system=...)`` returns the
            model's reply text.
        task: The task to solve.
        language: Target language; selects the prompts and the fence tags.
        config: Supplies ``max_retries`` and is passed on to ``evaluate``.

    Returns:
        A ``LanguageResult`` for the final attempt. If that attempt produced
        no code block, ``solution_code`` is empty and ``test_results`` is an
        empty list. ``agent_turns`` counts the provider calls made.
    """
    system = build_system_prompt(language)
    if task.mode == "convert" and language == "lush":
        user_prompt = build_conversion_prompt(task)
    else:
        user_prompt = build_task_prompt(task, language)

    messages: list[Message] = [Message(role="user", content=user_prompt)]

    # A negative max_retries is treated as "no retries" so that one attempt is
    # always made; previously the loop could run zero times and the fallback
    # below then failed on unbound locals.
    last_attempt = max(0, config.max_retries)

    code: str | None = None
    test_results: list = []
    turns = 0

    for attempt in range(last_attempt + 1):
        turns += 1
        response = provider.send(messages, system=system)
        messages.append(Message(role="assistant", content=response))

        code = extract_code(response, language)
        if code is None:
            if attempt < last_attempt:
                feedback = "I couldn't find a code block in your response. Please provide your solution in a fenced code block."
                messages.append(Message(role="user", content=feedback))
                continue
            # Out of attempts without ever receiving usable code.
            return LanguageResult(
                language=language,
                solution_code="",
                test_results=[],
                all_passed=False,
                agent_turns=turns,
            )

        test_results = evaluate(task, code, language, config)
        all_passed = all(tr.passed for tr in test_results)

        if all_passed or attempt == last_attempt:
            return LanguageResult(
                language=language,
                solution_code=code,
                test_results=test_results,
                all_passed=all_passed,
                agent_turns=turns,
            )

        feedback = build_failure_feedback(test_results, task)
        messages.append(Message(role="user", content=feedback))

    # Defensive fallback: the final loop iteration always returns, so this is
    # not expected to run. The locals are initialised above so it cannot fail.
    return LanguageResult(
        language=language,
        solution_code=code if code else "",
        test_results=test_results if test_results else [],
        all_passed=False,
        agent_turns=turns,
    )