Initial commit: Lush vs Bash AI benchmarking framework
Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via Anthropic Claude provider
- Test harness executing solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write-from-scratch in both languages)
- 4 Category B tasks (verify provided Bash, convert to Lush)
- Lush language reference for agent context
This commit is contained in:
91
lush_bench/questionnaire.py
Normal file
91
lush_bench/questionnaire.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from .models import QuestionnaireResponse
|
||||
from .providers.base import LLMProvider, Message
|
||||
|
||||
# Five-point Likert scale shared by every rating question below.  Each
# question receives its own copy so mutating one entry's choices cannot
# silently affect the others.
_LIKERT_CHOICES = [
    "1 - Strongly disagree",
    "2 - Disagree",
    "3 - Neutral",
    "4 - Agree",
    "5 - Strongly agree",
]

# Statements the agent rates after solving a task.  Order is preserved in
# the generated questionnaire prompt.
_QUESTION_TEXTS = [
    "Readability: The solution is easy to read and understand",
    "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
    "Conciseness: The solution required minimal boilerplate",
    "Error handling: Error handling was straightforward",
    "Overall preference: I would prefer this language for similar tasks",
    "Learning curve: An unfamiliar developer could understand the solution quickly",
]

# Questionnaire presented to the agent: each entry pairs a statement with
# the shared Likert scale.
QUESTIONS = [
    {"question": text, "choices": list(_LIKERT_CHOICES)}
    for text in _QUESTION_TEXTS
]
|
||||
|
||||
|
||||
def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    """Build the self-evaluation prompt sent to the agent after a solve.

    Embeds the agent's own *solution_code* and renders every entry of
    ``QUESTIONS`` as a JSON object template, followed by one trailing
    free-form observation item, so the model answers by filling in each
    "selected" field and replying with a bare JSON array.
    """
    # Render each fixed question as one JSON-object line ending in ",\n";
    # the free-form item appended in the template closes the array cleanly.
    rendered_lines = []
    for entry in QUESTIONS:
        quoted_choices = ", ".join(f'"{choice}"' for choice in entry["choices"])
        rendered_lines.append(
            f' {{"question": "{entry["question"]}", "choices": [{quoted_choices}], "selected": <your choice>}},\n'
        )
    questions_text = "".join(rendered_lines)

    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

```
{solution_code}
```

Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.

[
{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
]"""
|
||||
|
||||
|
||||
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
    """Parse the LLM's questionnaire reply into structured responses.

    The model is asked to reply with a bare JSON array, but in practice
    replies may include surrounding prose, so the first ``[...]`` span is
    extracted before parsing.  Any reply that cannot be interpreted as a
    JSON array of objects is preserved verbatim under the ``raw_response``
    question so no data is lost.
    """
    # Grab the outermost bracketed span; DOTALL lets the array span lines.
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    # Valid JSON that is not an array (e.g. a bare object or string slipped
    # through the regex) is treated as unparseable rather than crashing.
    if not isinstance(data, list):
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    results = []
    for item in data:
        # Skip stray non-object entries (strings, numbers) instead of
        # raising AttributeError on item.get below.
        if not isinstance(item, dict):
            continue
        results.append(
            QuestionnaireResponse(
                question=item.get("question", ""),
                selected=item.get("selected", ""),
                choices=item.get("choices"),
            )
        )
    return results
|
||||
|
||||
|
||||
def run_questionnaire(
    provider: LLMProvider,
    task_name: str,
    language: str,
    solution_code: str,
) -> list[QuestionnaireResponse]:
    """Ask the agent to self-evaluate its solution and return parsed answers.

    Builds the questionnaire prompt for the given task and solution, sends
    it as a single user message through *provider*, and parses the reply
    into ``QuestionnaireResponse`` entries.
    """
    prompt = build_questionnaire_prompt(task_name, language, solution_code)
    raw_reply = provider.send([Message(role="user", content=prompt)])
    return parse_questionnaire_response(raw_reply)
|
||||
Reference in New Issue
Block a user