Files
lush_grading/lush_bench/questionnaire.py
Cormac Shannon be8d657b24 Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks
in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00

92 lines
3.2 KiB
Python

from __future__ import annotations
import json
import re
from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
# The five-point Likert scale shared by every rating question below.
_LIKERT_SCALE = [
    "1 - Strongly disagree",
    "2 - Disagree",
    "3 - Neutral",
    "4 - Agree",
    "5 - Strongly agree",
]

# Rating questions posed to the agent after it solves a task.  Each entry
# gets its own copy of the scale so downstream mutation of one question's
# choices cannot affect the others.
QUESTIONS = [
    {"question": prompt, "choices": list(_LIKERT_SCALE)}
    for prompt in (
        "Readability: The solution is easy to read and understand",
        "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
        "Conciseness: The solution required minimal boilerplate",
        "Error handling: Error handling was straightforward",
        "Overall preference: I would prefer this language for similar tasks",
        "Learning curve: An unfamiliar developer could understand the solution quickly",
    )
]
def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    """Render the self-evaluation prompt shown to the agent after a solve.

    The prompt embeds the agent's own solution and a JSON skeleton of
    QUESTIONS for it to fill in, plus one trailing free-form item.
    """
    rendered = []
    for entry in QUESTIONS:
        quoted_choices = ", ".join('"{}"'.format(choice) for choice in entry["choices"])
        question = entry["question"]
        rendered.append(
            f' {{"question": "{question}", "choices": [{quoted_choices}], "selected": <your choice>}},\n'
        )
    questions_text = "".join(rendered)
    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:
```
{solution_code}
```
Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
    """Parse the LLM's questionnaire reply into QuestionnaireResponse objects.

    The model is asked to reply with a bare JSON array.  This extracts the
    first ``[...]`` span from *response* and decodes it.  Any failure — no
    array present, invalid JSON, or JSON of the wrong shape — degrades to a
    single ``raw_response`` entry carrying the unparsed text, so callers
    never see an exception.
    """
    fallback = [QuestionnaireResponse(question="raw_response", selected=response)]
    # Grab the outermost [...] span; DOTALL lets the array cross newlines.
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        return fallback
    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return fallback
    # Fix: the decoded value must be a list of objects.  Previously a list
    # of scalars (or a non-list) crashed with AttributeError on item.get,
    # defeating the graceful-fallback contract.
    if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
        return fallback
    return [
        QuestionnaireResponse(
            question=item.get("question", ""),
            selected=item.get("selected", ""),
            choices=item.get("choices"),
        )
        for item in data
    ]
def run_questionnaire(
    provider: LLMProvider,
    task_name: str,
    language: str,
    solution_code: str,
) -> list[QuestionnaireResponse]:
    """Ask *provider* to rate its own *solution_code* and return parsed answers."""
    reply = provider.send(
        [
            Message(
                role="user",
                content=build_questionnaire_prompt(task_name, language, solution_code),
            )
        ]
    )
    return parse_questionnaire_response(reply)