Replace category_a/category_b directories with algorithm, pipeline, environment, filesystem, and process. Add separate mode field (solve/convert) to decouple orchestration from capability grouping. Add per-category summary and questionnaire breakdowns to both terminal report and HTML export.
165 lines
6.2 KiB
Python
165 lines
6.2 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
from .config import Config
|
|
from .harness import evaluate
|
|
from .models import LanguageResult, Task, TestCase
|
|
from .providers.base import LLMProvider, Message
|
|
from .questionnaire import run_questionnaire
|
|
|
|
# Path to the lush language reference document ("lush_reference.md"), located
# in the parent of the directory that contains this module. It is read by
# load_lush_reference().
LUSH_REFERENCE_PATH = Path(__file__).parent.parent / "lush_reference.md"
|
|
|
|
|
|
def load_lush_reference() -> str:
    """Return the full text of the lush language reference document.

    The file at ``LUSH_REFERENCE_PATH`` is decoded explicitly as UTF-8 so the
    result does not depend on the locale's default encoding.

    Raises:
        OSError: If the reference file is missing or cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    return LUSH_REFERENCE_PATH.read_text(encoding="utf-8")
|
|
|
|
|
|
def extract_code(response: str, language: str) -> str | None:
    """Extract the last usable fenced code block from a model response.

    Blocks tagged for the requested language take priority: ``bash``/``sh``
    when *language* is ``"bash"``, ``lua``/``lush`` for any other value.
    Untagged blocks are used only when no suitably tagged block exists, and
    blocks tagged with any other language are ignored.

    Fences are paired in document order, so the prose between one block's
    closing fence and the next block's opening fence is never mistaken for an
    untagged block. Tags are compared case-insensitively, and trailing
    spaces/tabs or a carriage return after the tag are tolerated.

    Args:
        response: Raw text returned by the model.
        language: Target language of the solution (``"bash"`` or ``"lush"``).

    Returns:
        The contents of the selected block with surrounding whitespace
        stripped, or ``None`` when no usable block is present.
    """
    accepted_tags = {"bash", "sh"} if language == "bash" else {"lua", "lush"}

    tagged: list[str] = []
    untagged: list[str] = []
    # finditer yields non-overlapping matches left to right, which pairs each
    # opening fence with the closing fence that follows it.
    for match in re.finditer(r"```([^\s`]*)[ \t]*\r?\n(.*?)```", response, re.DOTALL):
        tag = match.group(1).lower()
        if not tag:
            untagged.append(match.group(2))
        elif tag in accepted_tags:
            tagged.append(match.group(2))

    candidates = tagged or untagged
    return candidates[-1].strip() if candidates else None
|
|
|
|
|
|
def build_system_prompt(language: str) -> str:
    """Compose the system prompt sent to the model for *language*.

    Every prompt starts with the same stdin/stdout, code-only instructions.
    For ``"lush"`` the full language reference (from ``load_lush_reference``)
    is appended; any other value produces the bash variant.
    """
    instructions = (
        "You are a skilled programmer. Write solutions that read from stdin and write to stdout. "
        "Output ONLY the code in a single fenced code block. No explanations."
    )

    if language != "lush":
        return instructions + "\n\nYou are writing in bash."

    reference = load_lush_reference()
    return (
        instructions
        + "\n\nYou are writing in lush, a Lua-based shell language. Here is the language reference:\n\n"
        + reference
    )
|
|
|
|
|
|
def _describe_test_case(tc: TestCase, index: int) -> str:
|
|
"""Build a human-readable description of a test case for the agent."""
|
|
parts = [f"Test case {index}:"]
|
|
if tc.stdin:
|
|
parts.append(f" Input (stdin):\n{tc.stdin}")
|
|
if tc.env:
|
|
parts.append(f" Environment variables: {tc.env}")
|
|
if tc.setup_files:
|
|
for fname, content in tc.setup_files.items():
|
|
parts.append(f" File in working directory ({fname}):\n{content}")
|
|
if tc.expected_stdout:
|
|
parts.append(f" Expected stdout:\n{tc.expected_stdout}")
|
|
if tc.expected_files:
|
|
for fname, content in tc.expected_files.items():
|
|
parts.append(f" Expected file ({fname}):\n{content}")
|
|
return "\n".join(parts)
|
|
|
|
|
|
def build_task_prompt(task: Task, language: str) -> str:
    """Build the initial user prompt asking for a from-scratch solution.

    The prompt contains the task name and description, a note about the
    isolated working directory, up to the first two test cases rendered by
    ``_describe_test_case`` as examples, and a closing instruction naming the
    fence tag to use (``bash`` for bash, ``lua`` for anything else).
    """
    fence_tag = "lua" if language != "bash" else "bash"

    pieces = [
        f"Task: {task.name}\n\n{task.description}\n\n",
        "Your script runs in an isolated working directory. ",
        "Any files listed as setup files will exist in that directory before your script runs.\n\n",
        "Example test cases:\n",
    ]
    # Only the first two test cases are shown as examples.
    pieces.extend(
        "\n" + _describe_test_case(example, position) + "\n"
        for position, example in enumerate(task.test_cases[:2])
    )
    pieces.append(f"\nWrite the solution in a ```{fence_tag} code block.")

    return "".join(pieces)
|
|
|
|
|
|
def build_conversion_prompt(task: Task) -> str:
    """Build the initial user prompt for a bash-to-lush conversion task.

    The prompt contains the task name and description, the bash source to
    convert (``task.bash_source``), up to the first two test cases as
    examples, and a closing instruction to answer in a ``lua`` code block.

    Each example always shows stdin and expected stdout. Environment
    variables, setup files and expected output files are listed as well when
    the test case defines them, so tasks that depend on them are not shown
    without that context. For test cases that only use stdin/stdout the
    prompt text is unchanged.
    """
    pieces = [
        f"Task: {task.name}\n\n{task.description}\n\n",
        f"Here is the bash source to convert to lush (Lua-based shell):\n\n```bash\n{task.bash_source}\n```\n\n",
        "Example test cases:\n",
    ]

    # Only the first two test cases are shown as examples.
    for tc in task.test_cases[:2]:
        pieces.append(f"\nInput:\n{tc.stdin}\n")
        if tc.env:
            pieces.append(f"Environment variables: {tc.env}\n")
        if tc.setup_files:
            pieces.extend(
                f"File in working directory ({fname}):\n{content}\n"
                for fname, content in tc.setup_files.items()
            )
        pieces.append(f"Expected output:\n{tc.expected_stdout}\n")
        if tc.expected_files:
            pieces.extend(
                f"Expected file ({fname}):\n{content}\n"
                for fname, content in tc.expected_files.items()
            )

    pieces.append("\nConvert this to lush. Write the solution in a ```lua code block.")
    return "".join(pieces)
|
|
|
|
|
|
def build_failure_feedback(test_results: list, task: Task) -> str:
    """Summarise failing test results as a follow-up message for the model.

    For every result in *test_results* that did not pass, the report lists
    the test case index, its stdin, the expected and actual stdout, stderr
    (when non-empty), environment variables and setup file names (when the
    test case has any), and each file mismatch, followed by a blank line.
    Passing results are skipped. The message ends with a request for a
    corrected solution.
    """
    report = ["Your solution failed some test cases:\n"]

    for result in test_results:
        if result.passed:
            continue

        # Results point back at their test case by index.
        case = task.test_cases[result.test_case_index]
        report += [
            f"Test case {result.test_case_index}:",
            f" Input: {case.stdin!r}",
            f" Expected stdout: {case.expected_stdout!r}",
            f" Got stdout: {result.actual_stdout!r}",
        ]
        if result.stderr:
            report.append(f" Stderr: {result.stderr!r}")
        if case.env:
            report.append(f" Environment vars: {case.env}")
        if case.setup_files:
            report.append(f" Files in working directory: {list(case.setup_files.keys())}")
        report.extend(
            f" File {name!r}: expected {diff['expected']!r}, got {diff['actual']!r}"
            for name, diff in result.file_mismatches.items()
        )
        report.append("")

    report.append("Please fix your solution. Output ONLY the corrected code in a fenced code block.")
    return "\n".join(report)
|
|
|
|
|
|
def solve_task(
    provider: LLMProvider,
    task: Task,
    language: str,
    config: Config,
) -> LanguageResult:
    """Run the agent loop: prompt -> code -> test -> retry.

    The model is prompted once, then re-prompted with feedback after each
    failed attempt, for at most ``1 + config.max_retries`` turns. An attempt
    fails when the response has no extractable code block or when the
    extracted code does not pass every test returned by ``evaluate``.

    Args:
        provider: LLM backend; ``provider.send(messages, system=...)`` is
            called once per turn.
        task: Task to solve. When ``task.mode == "convert"`` and the target
            language is ``"lush"``, the conversion prompt is used; otherwise
            the regular task prompt is used.
        language: Target language of the solution.
        config: Run configuration; supplies ``max_retries`` and is forwarded
            to ``evaluate``.

    Returns:
        A ``LanguageResult`` describing the final attempt: its code (an empty
        string if the final response had no code block), its test results
        (empty in that same case), whether all tests passed, and the number
        of turns used.
    """
    system = build_system_prompt(language)

    if task.mode == "convert" and language == "lush":
        user_prompt = build_conversion_prompt(task)
    else:
        user_prompt = build_task_prompt(task, language)

    messages: list[Message] = [Message(role="user", content=user_prompt)]

    # A negative retry count would otherwise skip the loop entirely and leave
    # the result variables unbound; always make at least one attempt.
    max_retries = max(0, config.max_retries)

    code: str | None = None
    test_results: list = []
    all_passed = False
    turns = 0

    for attempt in range(1 + max_retries):
        turns += 1
        response = provider.send(messages, system=system)
        messages.append(Message(role="assistant", content=response))

        code = extract_code(response, language)
        if code is None:
            # Results from an earlier attempt do not describe this response.
            test_results = []
            all_passed = False
        else:
            test_results = evaluate(task, code, language, config)
            all_passed = all(tr.passed for tr in test_results)
            if all_passed:
                break

        if attempt == max_retries:
            # Out of retries: report the final attempt as-is.
            break

        if code is None:
            feedback = "I couldn't find a code block in your response. Please provide your solution in a fenced code block."
        else:
            feedback = build_failure_feedback(test_results, task)
        messages.append(Message(role="user", content=feedback))

    return LanguageResult(
        language=language,
        solution_code=code or "",
        test_results=test_results,
        all_passed=all_passed,
        agent_turns=turns,
    )
|