Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with `run`, `run-all`, `list-tasks`, `report`, and `export` commands
- Agent loop with retry support via an Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code-quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
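Below is the harness's questionnaire module. It imports two internal interfaces that are not part of this listing; as a rough sketch, with the field and method names inferred from how the module uses them and everything else assumed, they look something like:

from __future__ import annotations
from dataclasses import dataclass

# Hypothetical sketch of .models.QuestionnaireResponse; the real definition
# lives in the harness's models module, and only these fields are exercised
# by the code below.
@dataclass
class QuestionnaireResponse:
    question: str
    selected: str
    choices: list[str] | None = None

# Hypothetical sketch of .providers.base; only role/content and send() are
# exercised below.
@dataclass
class Message:
    role: str      # e.g. "user"
    content: str

class LLMProvider:
    def send(self, messages: list[Message]) -> str:
        # Concrete providers (e.g. the Anthropic Claude provider) implement this.
        raise NotImplementedError

The module itself follows.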
from __future__ import annotations

import json
import re

from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
LIKERT_SCALE = [
    "1 - Strongly disagree",
    "2 - Disagree",
    "3 - Neutral",
    "4 - Agree",
    "5 - Strongly agree",
]

QUESTIONS = [
    {"question": "Readability: The solution is easy to read and understand", "choices": LIKERT_SCALE},
    {"question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally", "choices": LIKERT_SCALE},
    {"question": "Conciseness: The solution required minimal boilerplate", "choices": LIKERT_SCALE},
    {"question": "Error handling: Error handling was straightforward", "choices": LIKERT_SCALE},
    {"question": "Overall preference: I would prefer this language for similar tasks", "choices": LIKERT_SCALE},
    {"question": "Learning curve: An unfamiliar developer could understand the solution quickly", "choices": LIKERT_SCALE},
]
def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    # Render each question as one line of the JSON array the model must return.
    questions_text = ""
    for q in QUESTIONS:
        choices_str = ", ".join(f'"{c}"' for c in q["choices"])
        questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'

    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

```
{solution_code}
```

Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.

[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
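# Illustration (not part of the original module): for the first question above,
# the loop emits a prompt line of the form
#   {"question": "Readability: The solution is easy to read and understand",
#    "choices": ["1 - Strongly disagree", ..., "5 - Strongly agree"], "selected": <your choice>},
# (wrapped here for readability). The unquoted <your choice> placeholder is
# deliberately not valid JSON: the model must replace it with one of the quoted
# choice strings, which is why the parser below is defensive.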
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
    # Try to extract a JSON array from the response.
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        # No array at all: preserve the raw text so the report can still show it.
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    results = []
    for item in data:
        if not isinstance(item, dict):
            # Skip malformed entries rather than crashing on model output.
            continue
        results.append(
            QuestionnaireResponse(
                question=item.get("question", ""),
                selected=item.get("selected", ""),
                choices=item.get("choices"),
            )
        )
    return results
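# Illustration (not part of the original module): the parser degrades
# gracefully rather than raising on malformed output.
#
#   parse_questionnaire_response('[{"question": "Q1", "selected": "4 - Agree"}]')
#   # -> [QuestionnaireResponse(question="Q1", selected="4 - Agree", choices=None)]
#
#   parse_questionnaire_response("Sorry, I cannot answer that.")
#   # -> [QuestionnaireResponse(question="raw_response",
#   #                           selected="Sorry, I cannot answer that.")]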
def run_questionnaire(
    provider: LLMProvider,
    task_name: str,
    language: str,
    solution_code: str,
) -> list[QuestionnaireResponse]:
    # Ask the same agent that produced the solution to rate its experience.
    prompt = build_questionnaire_prompt(task_name, language, solution_code)
    response = provider.send([Message(role="user", content=prompt)])
    return parse_questionnaire_response(response)
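A minimal usage sketch, not part of the module: it assumes only the `LLMProvider.send` contract sketched above, and the stub provider, task name, and solution snippet are all hypothetical stand-ins.

class CannedProvider:
    # Stub provider (hypothetical) returning a fixed questionnaire answer.
    def send(self, messages: list[Message]) -> str:
        return '[{"question": "Readability: ...", "selected": "4 - Agree"}]'

responses = run_questionnaire(
    CannedProvider(),
    task_name="count-lines",            # hypothetical task name
    language="Lush",
    solution_code="read file | count",  # hypothetical solution
)
for r in responses:
    print(f"{r.question}: {r.selected}")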