lush_grading/lush_bench/questionnaire.py

from __future__ import annotations

import json
import re

from .models import QuestionnaireResponse
from .providers.base import LLMProvider, Message

QUESTIONS = [
    # Syntax & Readability
    {"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
    {"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
    {"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
    # Expressiveness
    {"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
    {"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
    {"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
    # Data & I/O
    {"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
    {"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
    # Error Handling
    {"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
    {"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
    # Overall
    {"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
    {"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
]

CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]


def build_questionnaire_prompt(
    task_name: str,
    language: str,
    solution_code: str,
) -> str:
    choices_str = ", ".join(f'"{c}"' for c in CHOICES)

    questions_text = ""
    for q in QUESTIONS:
        questions_text += f'  {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'

    return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

```
{solution_code}
```

Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.

Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}

[
{questions_text}]"""


def _extract_int(value: str) -> int | None:
    """Extract leading digit from a response like '4 - Agree'."""
    s = value.strip()
    if s and s[0].isdigit():
        return int(s[0])
    return None


def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
    # Try to extract JSON array from response
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return [QuestionnaireResponse(question="raw_response", selected=response)]

    results = []
    for item in data:
        question_id = item.get("id", item.get("question", ""))
        raw_selected = item.get("selected", "")

        # Normalize to int
        if isinstance(raw_selected, int):
            selected: int | str = raw_selected
        else:
            parsed = _extract_int(str(raw_selected))
            selected = parsed if parsed is not None else raw_selected

        results.append(
            QuestionnaireResponse(
                question=question_id,
                selected=selected,
            )
        )
    return results


def run_questionnaire(
    provider: LLMProvider,
    task_name: str,
    language: str,
    solution_code: str,
) -> list[QuestionnaireResponse]:
    prompt = build_questionnaire_prompt(task_name, language, solution_code)
    response = provider.send([Message(role="user", content=prompt)])
    return parse_questionnaire_response(response)