from __future__ import annotations

import re
from pathlib import Path

from .config import Config
from .harness import evaluate
from .models import LanguageResult, Task, TestCase
from .providers.base import LLMProvider, Message

LUSH_REFERENCE_PATH = Path(__file__).parent.parent / "lush_reference.md"


def load_lush_reference() -> str:
    """Return the contents of the bundled lush language reference."""
    return LUSH_REFERENCE_PATH.read_text()


def extract_code(response: str, language: str) -> str | None:
    """Extract the last fenced code block from the response.

    Language-tagged fences are tried first; an untagged fence is accepted as
    a fallback. The last match wins, since models sometimes emit scratch work
    before the final solution.
    """
    if language == "bash":
        patterns = [r"```(?:bash|sh)\n(.*?)```", r"```\n(.*?)```"]
    else:
        patterns = [r"```(?:lua|lush)\n(.*?)```", r"```\n(.*?)```"]
    for pattern in patterns:
        matches = re.findall(pattern, response, re.DOTALL)
        if matches:
            return matches[-1].strip()
    return None
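

# Illustration of extract_code's matching behavior (doctest-style, not run at
# import time; the sample responses are hypothetical):
#
#   >>> extract_code("Here:\n```bash\necho hi\n```", "bash")
#   'echo hi'
#   >>> extract_code("Untagged fence:\n```\necho hi\n```", "bash")
#   'echo hi'
#   >>> extract_code("No code at all.", "bash") is None
#   True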


def build_system_prompt(language: str) -> str:
    """Build the system prompt, embedding the lush reference when targeting lush."""
    base = (
        "You are a skilled programmer. Write solutions that read from stdin and write to stdout. "
        "Output ONLY the code in a single fenced code block. No explanations."
    )
    if language == "lush":
        ref = load_lush_reference()
        return f"{base}\n\nYou are writing in lush, a Lua-based shell language. Here is the language reference:\n\n{ref}"
    return f"{base}\n\nYou are writing in bash."


def _describe_test_case(tc: TestCase, index: int) -> str:
    """Build a human-readable description of a test case for the agent."""
    parts = [f"Test case {index}:"]
    if tc.stdin:
        parts.append(f" Input (stdin):\n{tc.stdin}")
    if tc.env:
        parts.append(f" Environment variables: {tc.env}")
    if tc.setup_files:
        for fname, content in tc.setup_files.items():
            parts.append(f" File in working directory ({fname}):\n{content}")
    if tc.expected_stdout:
        parts.append(f" Expected stdout:\n{tc.expected_stdout}")
    if tc.expected_files:
        for fname, content in tc.expected_files.items():
            parts.append(f" Expected file ({fname}):\n{content}")
    return "\n".join(parts)


def build_task_prompt(task: Task, language: str) -> str:
    """Build the user prompt for solving a task from its description."""
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += "Your script runs in an isolated working directory. "
    prompt += "Any files listed as setup files will exist in that directory before your script runs.\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):  # Show first 2 as examples
        prompt += "\n" + _describe_test_case(tc, i) + "\n"
    lang_label = "bash" if language == "bash" else "lua"
    prompt += f"\nWrite the solution in a ```{lang_label} code block."
    return prompt


def build_conversion_prompt(task: Task) -> str:
    """Build the user prompt for converting a known bash solution to lush."""
    prompt = f"Task: {task.name}\n\n{task.description}\n\n"
    prompt += f"Here is the bash source to convert to lush (Lua-based shell):\n\n```bash\n{task.bash_source}\n```\n\n"
    prompt += "Example test cases:\n"
    for i, tc in enumerate(task.test_cases[:2]):
        prompt += f"\nInput:\n{tc.stdin}\nExpected output:\n{tc.expected_stdout}\n"
    prompt += "\nConvert this to lush. Write the solution in a ```lua code block."
    return prompt


def build_failure_feedback(test_results: list, task: Task) -> str:
    """Summarize failing test cases so the agent can repair its solution."""
    lines = ["Your solution failed some test cases:\n"]
    for tr in test_results:
        if not tr.passed:
            tc = task.test_cases[tr.test_case_index]
            lines.append(f"Test case {tr.test_case_index}:")
            lines.append(f" Input: {tc.stdin!r}")
            lines.append(f" Expected stdout: {tc.expected_stdout!r}")
            lines.append(f" Got stdout: {tr.actual_stdout!r}")
            if tr.stderr:
                lines.append(f" Stderr: {tr.stderr!r}")
            if tc.env:
                lines.append(f" Environment vars: {tc.env}")
            if tc.setup_files:
                lines.append(f" Files in working directory: {list(tc.setup_files.keys())}")
            for fname, mismatch in tr.file_mismatches.items():
                lines.append(f" File {fname!r}: expected {mismatch['expected']!r}, got {mismatch['actual']!r}")
            lines.append("")
    lines.append("Please fix your solution. Output ONLY the corrected code in a fenced code block.")
    return "\n".join(lines)


def solve_task(
    provider: LLMProvider,
    task: Task,
    language: str,
    config: Config,
) -> LanguageResult:
    """Run the agent loop: prompt -> code -> test -> retry."""
    system = build_system_prompt(language)
    # Category "b" tasks targeting lush are bash-to-lush conversions.
    if task.category == "b" and language == "lush":
        user_prompt = build_conversion_prompt(task)
    else:
        user_prompt = build_task_prompt(task, language)
    messages: list[Message] = [Message(role="user", content=user_prompt)]
    turns = 0
    # Initialized here so the defensive fallback after the loop can never
    # raise on unbound locals.
    code: str | None = None
    test_results: list = []
    for attempt in range(1 + config.max_retries):
        turns += 1
        response = provider.send(messages, system=system)
        messages.append(Message(role="assistant", content=response))
        code = extract_code(response, language)
        if code is None:
            # No code block found: nudge the model, or give up on the last attempt.
            if attempt < config.max_retries:
                feedback = (
                    "I couldn't find a code block in your response. "
                    "Please provide your solution in a fenced code block."
                )
                messages.append(Message(role="user", content=feedback))
                continue
            return LanguageResult(
                language=language,
                solution_code="",
                test_results=[],
                all_passed=False,
                agent_turns=turns,
            )
        test_results = evaluate(task, code, language, config)
        all_passed = all(tr.passed for tr in test_results)
        if all_passed or attempt == config.max_retries:
            return LanguageResult(
                language=language,
                solution_code=code,
                test_results=test_results,
                all_passed=all_passed,
                agent_turns=turns,
            )
        feedback = build_failure_feedback(test_results, task)
        messages.append(Message(role="user", content=feedback))

    # Unreachable in practice (the final loop iteration always returns),
    # kept as a defensive fallback.
    return LanguageResult(
        language=language,
        solution_code=code or "",
        test_results=test_results,
        all_passed=False,
        agent_turns=turns,
    )
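

# Example wiring (a sketch, not executed here; `my_provider`, `my_task`, and
# `my_config` stand in for whatever the caller constructs):
#
#   result = solve_task(my_provider, my_task, language="lush", config=my_config)
#   if result.all_passed:
#       print(f"solved in {result.agent_turns} turn(s)")
#   else:
#       failed = [tr.test_case_index for tr in result.test_results if not tr.passed]
#       print(f"failed test cases: {failed}")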