Revamp questionnaire, parallelize run-all, add new tasks

- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
Cormac Shannon
2026-04-07 19:07:21 +01:00
parent 20e62f60f6
commit 18ce7e57cf
13 changed files with 943 additions and 206 deletions

View File

@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message
# Atomic Likert statements, grouped by the dimension they probe. Each entry
# carries a stable "id" (the storage key for responses), a "dimension" label
# used for grouped rendering, and the statement the model rates on the single
# shared 5-point CHOICES scale below.
QUESTIONS = [
    # Syntax & Readability
    {"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
    {"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
    {"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
    # Expressiveness
    {"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
    {"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
    {"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
    # Data & I/O
    {"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
    {"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
    # Error Handling
    {"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
    {"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
    # Overall
    {"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
    {"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
]

# One shared 5-point Likert scale for every question (per-question "choices"
# lists were removed when the questionnaire was made atomic).
CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]


def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    """Build the self-evaluation prompt shown to the model after it solves
    *task_name* in *language*.

    The prompt embeds the model's own *solution_code* and asks it to rate the
    language itself (not the particular solution) on every QUESTIONS entry,
    responding with a bare JSON array whose "selected" values come from the
    shared CHOICES scale.
    """
    choices_str = ", ".join(f'"{c}"' for c in CHOICES)
    questions_text = ""
    for q in QUESTIONS:
        questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'
    # NOTE(review): blank-line spacing inside the prompt is reconstructed —
    # the diff rendering dropped blank lines; line content and order follow
    # the new side of the hunk.
    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

```
{solution_code}
```

Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.

Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}

[
{questions_text}]"""
def _extract_int(value: str) -> int | None:
"""Extract leading digit from a response like '4 - Agree'."""
s = value.strip()
if s and s[0].isdigit():
return int(s[0])
return None
def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
# NOTE(review): partial diff rendering — the hunk header below hides the top
# of the function body (the part that produces `data` from `response`), so
# this span is incomplete and not valid Python as shown.
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
results = []
for item in data:
# New-side lines: prefer the stable "id" key, falling back to the legacy
# "question" text so pre-revamp results still resolve.
question_id = item.get("id", item.get("question", ""))
raw_selected = item.get("selected", "")
# Normalize to int
if isinstance(raw_selected, int):
selected: int | str = raw_selected
else:
# _extract_int pulls the leading digit from strings like '4 - Agree';
# a non-numeric answer is kept verbatim as a string.
parsed = _extract_int(str(raw_selected))
selected = parsed if parsed is not None else raw_selected
results.append(
QuestionnaireResponse(
# NOTE(review): the next three keyword arguments are the REMOVED
# (old-side) diff lines; the two after them are their replacements.
# As rendered they are duplicate keywords — only question=question_id
# and selected=selected survive in the committed version.
question=item.get("question", ""),
selected=item.get("selected", ""),
choices=item.get("choices"),
question=question_id,
selected=selected,
)
)
return results