Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
This commit is contained in:
91
lush_bench/questionnaire.py
Normal file
91
lush_bench/questionnaire.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from .models import QuestionnaireResponse
|
||||
from .providers.base import LLMProvider, Message
|
||||
|
||||
# Five-point Likert scale shared by every rating question below.  Each
# question receives its own copy so mutating one entry's choices cannot
# silently affect the others.
_LIKERT_CHOICES = [
    "1 - Strongly disagree",
    "2 - Disagree",
    "3 - Neutral",
    "4 - Agree",
    "5 - Strongly agree",
]

# Statements the agent rates after solving a task.  Order is preserved in
# the generated questionnaire prompt.
_QUESTION_TEXTS = [
    "Readability: The solution is easy to read and understand",
    "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
    "Conciseness: The solution required minimal boilerplate",
    "Error handling: Error handling was straightforward",
    "Overall preference: I would prefer this language for similar tasks",
    "Learning curve: An unfamiliar developer could understand the solution quickly",
]

# Questionnaire presented to the agent: each entry pairs a statement with
# the shared Likert scale.
QUESTIONS = [
    {"question": text, "choices": list(_LIKERT_CHOICES)}
    for text in _QUESTION_TEXTS
]
|
||||
|
||||
|
||||
def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    """Build the self-evaluation prompt sent to the agent after a solve.

    Embeds the agent's own *solution_code* and renders every entry of
    ``QUESTIONS`` as a JSON object template, followed by one trailing
    free-form observation item, so the model answers by filling in each
    "selected" field and replying with a bare JSON array.
    """
    # Render each fixed question as one JSON-object line ending in ",\n";
    # the free-form item appended in the template closes the array cleanly.
    rendered_lines = []
    for entry in QUESTIONS:
        quoted_choices = ", ".join(f'"{choice}"' for choice in entry["choices"])
        rendered_lines.append(
            f' {{"question": "{entry["question"]}", "choices": [{quoted_choices}], "selected": <your choice>}},\n'
        )
    questions_text = "".join(rendered_lines)

    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

```
{solution_code}
```

Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.

[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
|
||||
|
||||
|
||||
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
    """Parse the LLM's questionnaire reply into structured responses.

    The model is asked to reply with a bare JSON array, but in practice
    replies may include surrounding prose, so the first ``[...]`` span is
    extracted before parsing.  Any reply that cannot be interpreted as a
    JSON array of objects is preserved verbatim under the ``raw_response``
    question so no data is lost.
    """
    # Grab the outermost bracketed span; DOTALL lets the array span lines.
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    # Valid JSON that is not an array (e.g. a bare object or string slipped
    # through the regex) is treated as unparseable rather than crashing.
    if not isinstance(data, list):
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    results = []
    for item in data:
        # Skip stray non-object entries (strings, numbers) instead of
        # raising AttributeError on item.get below.
        if not isinstance(item, dict):
            continue
        results.append(
            QuestionnaireResponse(
                question=item.get("question", ""),
                selected=item.get("selected", ""),
                choices=item.get("choices"),
            )
        )
    return results
|
||||
|
||||
|
||||
def run_questionnaire(
    provider: LLMProvider,
    task_name: str,
    language: str,
    solution_code: str,
) -> list[QuestionnaireResponse]:
    """Ask the agent to self-evaluate its solution and return parsed answers.

    Builds the questionnaire prompt for the given task and solution, sends
    it as a single user message through *provider*, and parses the reply
    into ``QuestionnaireResponse`` entries.
    """
    prompt = build_questionnaire_prompt(task_name, language, solution_code)
    raw_reply = provider.send([Message(role="user", content=prompt)])
    return parse_questionnaire_response(raw_reply)
|
||||
Reference in New Issue
Block a user