Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop the free-form question. Responses are now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider (sketched below).
- Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
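The run-all change itself is not among the hunks below (only the new max_workers config field and the provider-side rate limiter are), so the following is a minimal, self-contained sketch of the dispatch pattern the message describes; run_all and the run_task callable here are stand-ins for illustration, not the project's actual runner API.

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_all(tasks, run_task, max_workers=4):
    """Run each benchmark task via run_task(task) on a bounded thread pool."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit every task up front; the pool caps concurrency at max_workers.
        futures = {pool.submit(run_task, task): task for task in tasks}
        for future in as_completed(futures):
            # Collect results as workers finish; provider-side rate limiting
            # (the min-request-interval lock added below) spaces out API calls.
            results.append(future.result())
    return results

if __name__ == "__main__":
    # Toy demonstration with a stand-in task function.
    print(run_all(["path_normalizer", "todo_manager"], lambda name: f"{name}: ok", max_workers=2))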
@@ -13,6 +13,7 @@ class Config:
     timeout_seconds: float = 10.0
     normalize_whitespace: bool = True
     output_dir: Path = Path("results")
+    max_workers: int = 4
     provider_configs: dict[str, dict[str, Any]] = field(default_factory=dict)

     @classmethod
@@ -35,5 +36,6 @@ class Config:
             timeout_seconds=agent.get("timeout_seconds", 10.0),
             normalize_whitespace=agent.get("normalize_whitespace", True),
             output_dir=Path(results.get("output_dir", "results")),
+            max_workers=agent.get("max_workers", 4),
             provider_configs=provider_configs,
         )

@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
 from .models import BenchmarkResult
 from .report import (
     LIKERT_QUESTIONS,
-    _get_freeform,
     _get_likert_scores,
     _parse_likert,
     load_latest_results,
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
 def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
     """Return {question_key: {bash: avg, lush: avg}}."""
     agg: dict[str, dict[str, list[float]]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
         agg[key] = {"bash": [], "lush": []}
     for r in results:
         scores = _get_likert_scores(r)
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
 def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
     """Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
     avgs = _aggregate_likert(results)
-    labels = [label for _, label in LIKERT_QUESTIONS]
-    bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
-    lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]
+    bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
+    lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]

-    fig, ax = plt.subplots(figsize=(8, 4.5))
+    fig, ax = plt.subplots(figsize=(8, 7))
     y = range(len(labels))
     bar_h = 0.35
     bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:

 def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
     """Heatmap showing lush-minus-bash score diff per task and metric."""
-    labels = [label for _, label in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]
     tasks = [r.task_name for r in results]

     data: list[list[float]] = []
     for r in results:
         scores = _get_likert_scores(r)
         row = []
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
             b = scores[key]["bash"]
             l = scores[key]["lush"]
             if b is not None and l is not None:
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
                 row.append(0.0)
         data.append(row)

-    fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
+    fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
     im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)

     ax.set_xticks(range(len(labels)))
-    ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
+    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
     ax.set_yticks(range(len(tasks)))
     ax.set_yticklabels(tasks, fontsize=8)

@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
         for j in range(len(labels)):
             val = data[i][j]
             text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
-            ax.text(j, i, text, ha="center", va="center", fontsize=8,
+            ax.text(j, i, text, ha="center", va="center", fontsize=7,
                     color="white" if abs(val) >= 2 else "black")

     ax.set_title("Score Difference (Lush - Bash)")
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:


 def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
-    """Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
+    """Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
     import numpy as np
     from collections import defaultdict

@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
         by_cat[r.category].append(r)

     charts: list[tuple[str, str]] = []
-    labels = [label for _, label in LIKERT_QUESTIONS]
+    labels = [label for _, label, _ in LIKERT_QUESTIONS]

     for cat in sorted(by_cat):
         cat_results = by_cat[cat]
         agg: dict[str, dict[str, list[float]]] = {}
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
             agg[key] = {"bash": [], "lush": []}
         for r in cat_results:
             scores = _get_likert_scores(r)
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
                 if val is not None:
                     agg[key][lang].append(val)

-        bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
-        lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
+        bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
+        lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]

-        fig, ax = plt.subplots(figsize=(6, 3.5))
+        fig, ax = plt.subplots(figsize=(7, 5))
         y = range(len(labels))
         bar_h = 0.35
         ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:

         scores = _get_likert_scores(r)
         score_rows = []
-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
             b_val = scores[key]["bash"]
             l_val = scores[key]["lush"]
             b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
                 f'<td>{b_str}</td><td>{l_str}</td>'
                 f'<td class="{d_cls}">{d_str}</td></tr>')

-        obs = _get_freeform(r)
-        obs_html = ""
-        for lang, text in obs.items():
-            obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
-
         sections.append(f"""
 <div class="task-detail">
   <h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
   <thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
   <tbody>{"".join(score_rows)}</tbody>
   </table>
-  <div class="observations">{obs_html}</div>
 </div>""")

     return "\n".join(sections)
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
     .scores {{ width: auto; }}
     .scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
     .scores th:nth-child(n+2) {{ text-align: center; }}
-    .observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
-    .observations p {{ margin-bottom: 6px; }}
     </style>
     </head>
     <body>

@@ -1,6 +1,8 @@
 from __future__ import annotations

 import os
+import threading
+import time
 from typing import Any

 import anthropic
@@ -17,8 +19,17 @@ class AnthropicProvider:
         self._client = anthropic.Anthropic(api_key=api_key)
         self._model = config.get("model", "claude-sonnet-4-20250514")
         self._max_tokens = config.get("max_tokens", 4096)
+        self._min_request_interval = config.get("min_request_interval", 0.1)
+        self._last_request_time = 0.0
+        self._lock = threading.Lock()

     def send(self, messages: list[Message], system: str = "") -> str:
+        with self._lock:
+            elapsed = time.monotonic() - self._last_request_time
+            if elapsed < self._min_request_interval:
+                time.sleep(self._min_request_interval - elapsed)
+            self._last_request_time = time.monotonic()
+
         api_messages = [{"role": m.role, "content": m.content} for m in messages]
         kwargs: dict[str, Any] = {
             "model": self._model,

@@ -7,42 +7,38 @@ from .models import QuestionnaireResponse
 from .providers.base import LLMProvider, Message

 QUESTIONS = [
-    {
-        "question": "Readability: The solution is easy to read and understand",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
-    {
-        "question": "Expressiveness: The language provided sufficient constructs to solve the problem naturally",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
-    {
-        "question": "Conciseness: The solution required minimal boilerplate",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
-    {
-        "question": "Error handling: Error handling was straightforward",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
-    {
-        "question": "Overall preference: I would prefer this language for similar tasks",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
-    {
-        "question": "Learning curve: An unfamiliar developer could understand the solution quickly",
-        "choices": ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"],
-    },
+    # Syntax & Readability
+    {"id": "syntax_clarity", "dimension": "Syntax & Readability", "question": "The language's syntax makes the intent of operations visually obvious"},
+    {"id": "signal_to_noise", "dimension": "Syntax & Readability", "question": "The language keeps boilerplate low — most characters serve the task, not the language"},
+    {"id": "familiar_conventions", "dimension": "Syntax & Readability", "question": "The language follows conventions that developers from other languages would recognize"},
+    # Expressiveness
+    {"id": "builtin_ops", "dimension": "Expressiveness", "question": "The language provides built-in operations for the core task requirements (no workarounds needed)"},
+    {"id": "string_ops", "dimension": "Expressiveness", "question": "The language's string manipulation capabilities are convenient for this task"},
+    {"id": "composition", "dimension": "Expressiveness", "question": "The language makes it easy to compose operations (piping, chaining, nesting)"},
+    # Data & I/O
+    {"id": "io_ergonomics", "dimension": "Data & I/O", "question": "Reading input and producing output is straightforward in this language"},
+    {"id": "data_structures", "dimension": "Data & I/O", "question": "The language's data structures (arrays, maps, variables) are well-suited to this task"},
+    # Error Handling
+    {"id": "error_model", "dimension": "Error Handling", "question": "The language's error handling model is clear and predictable"},
+    {"id": "edge_case_support", "dimension": "Error Handling", "question": "The language makes it easy to handle edge cases (empty input, missing data, type mismatches)"},
+    # Overall
+    {"id": "learnability", "dimension": "Overall", "question": "A developer unfamiliar with this language could learn enough to solve this task quickly"},
+    {"id": "fitness", "dimension": "Overall", "question": "This language is a good fit for this type of task"},
 ]

+CHOICES = ["1 - Strongly disagree", "2 - Disagree", "3 - Neutral", "4 - Agree", "5 - Strongly agree"]
+

 def build_questionnaire_prompt(
     task_name: str,
     language: str,
     solution_code: str,
 ) -> str:
+    choices_str = ", ".join(f'"{c}"' for c in CHOICES)
+
     questions_text = ""
-    for i, q in enumerate(QUESTIONS, 1):
-        choices_str = ", ".join(f'"{c}"' for c in q["choices"])
-        questions_text += f' {{"question": "{q["question"]}", "choices": [{choices_str}], "selected": <your choice>}},\n'
+    for q in QUESTIONS:
+        questions_text += f' {{"id": "{q["id"]}", "question": "{q["question"]}", "selected": <your choice>}},\n'

     return f"""You just solved the task "{task_name}" in {language}. Here is your solution:

@@ -50,11 +46,20 @@ def build_questionnaire_prompt(
 {solution_code}
 ```

-Please evaluate your experience by answering the following questionnaire. Respond with ONLY a JSON array — no other text.
+Rate the **language itself** on each aspect below, not the quality of this particular solution. Consider what the language's design and built-in features afford for this type of task.
+
+Respond with ONLY a JSON array — no other text. For "selected", use one of: {choices_str}

 [
-{questions_text} {{"question": "Free-form observation about using {language} for this task", "selected": "<your observation>"}}
-]"""
+{questions_text}]"""


+def _extract_int(value: str) -> int | None:
+    """Extract leading digit from a response like '4 - Agree'."""
+    s = value.strip()
+    if s and s[0].isdigit():
+        return int(s[0])
+    return None
+
+
 def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:
@@ -70,11 +75,20 @@ def parse_questionnaire_response(response: str) -> list[QuestionnaireResponse]:

     results = []
     for item in data:
+        question_id = item.get("id", item.get("question", ""))
+        raw_selected = item.get("selected", "")
+
+        # Normalize to int
+        if isinstance(raw_selected, int):
+            selected: int | str = raw_selected
+        else:
+            parsed = _extract_int(str(raw_selected))
+            selected = parsed if parsed is not None else raw_selected
+
         results.append(
             QuestionnaireResponse(
-                question=item.get("question", ""),
-                selected=item.get("selected", ""),
-                choices=item.get("choices"),
+                question=question_id,
+                selected=selected,
             )
         )
     return results

@@ -5,16 +5,32 @@ from pathlib import Path

 from .models import BenchmarkResult

-# Likert questions in order (must match questionnaire.py QUESTIONS)
+# New 12-item question list: (key, label, dimension)
 LIKERT_QUESTIONS = [
-    ("Readability", "Readability"),
-    ("Expressiveness", "Expressiveness"),
-    ("Conciseness", "Conciseness"),
-    ("Error handling", "Error handling"),
-    ("Overall preference", "Overall preference"),
-    ("Learning curve", "Learning curve"),
+    ("syntax_clarity", "Syntax clarity", "Syntax & Readability"),
+    ("signal_to_noise", "Signal-to-noise", "Syntax & Readability"),
+    ("familiar_conventions", "Familiar conventions", "Syntax & Readability"),
+    ("builtin_ops", "Built-in operations", "Expressiveness"),
+    ("string_ops", "String operations", "Expressiveness"),
+    ("composition", "Composition", "Expressiveness"),
+    ("io_ergonomics", "I/O ergonomics", "Data & I/O"),
+    ("data_structures", "Data structures", "Data & I/O"),
+    ("error_model", "Error model", "Error Handling"),
+    ("edge_case_support", "Edge case support", "Error Handling"),
+    ("learnability", "Learnability", "Overall"),
+    ("fitness", "Fitness for task", "Overall"),
 ]

+# Map old 6 legacy keys to new keys for back-compat with existing results
+LEGACY_KEY_MAP = {
+    "Readability": ["syntax_clarity", "signal_to_noise", "familiar_conventions"],
+    "Expressiveness": ["builtin_ops", "string_ops", "composition"],
+    "Conciseness": ["signal_to_noise"],
+    "Error handling": ["error_model", "edge_case_support"],
+    "Overall preference": ["fitness"],
+    "Learning curve": ["learnability"],
+}
+

 def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:
     """Load results, keeping only the latest run per task name."""
@@ -30,7 +46,7 @@ def load_latest_results(results_dir: Path) -> list[BenchmarkResult]:


 def _parse_likert(selected: str | int) -> int | None:
-    """Extract numeric value from a likert response like '4 - Agree'."""
+    """Extract numeric value from a likert response. Handles int directly or string like '4 - Agree'."""
     if isinstance(selected, int):
         return selected
     s = str(selected).strip()
@@ -40,20 +56,34 @@ def _parse_likert(selected: str | int) -> int | None:


 def _get_likert_scores(result: BenchmarkResult) -> dict[str, dict[str, float | None]]:
-    """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}."""
+    """Extract likert scores per language. Returns {question_key: {bash: N, lush: N}}.
+
+    Handles both new-format results (exact id match) and legacy results (startswith match
+    mapped to new keys).
+    """
     scores: dict[str, dict[str, float | None]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
         scores[key] = {"bash": None, "lush": None}

     for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
         if not lang_result:
             continue
         for q in lang_result.questionnaire:
-            for key, _ in LIKERT_QUESTIONS:
-                if q.question.startswith(key):
+            # Try exact match on new question ids
+            if q.question in scores:
                 val = _parse_likert(q.selected)
                 if val is not None:
+                    scores[q.question][lang_name] = float(val)
+                continue
+
+            # Legacy: map old key to new keys (spread the score)
+            for legacy_prefix, new_keys in LEGACY_KEY_MAP.items():
+                if q.question.startswith(legacy_prefix):
+                    val = _parse_likert(q.selected)
+                    if val is not None:
-                        scores[key][lang_name] = float(val)
+                        for nk in new_keys:
+                            if scores[nk][lang_name] is None:
+                                scores[nk][lang_name] = float(val)
                     break
     return scores

@@ -64,19 +94,6 @@ def _bar(value: float, max_val: float = 5.0, width: int = 20) -> str:
     return "\u2588" * filled + "\u2591" * (width - filled)


-def _get_freeform(result: BenchmarkResult) -> dict[str, str]:
-    """Extract free-form observations per language."""
-    obs: dict[str, str] = {}
-    for lang_name, lang_result in [("bash", result.bash_result), ("lush", result.lush_result)]:
-        if not lang_result:
-            continue
-        for q in lang_result.questionnaire:
-            if q.question.startswith("Free-form"):
-                obs[lang_name] = str(q.selected)
-                break
-    return obs
-
-
 def render_summary_table(results: list[BenchmarkResult]) -> str:
     """Render the pass/fail + turns overview table."""
     lines: list[str] = []
@@ -123,7 +140,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:


 def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
-    """Render aggregated questionnaire scores with bar charts."""
+    """Render aggregated questionnaire scores with bar charts, grouped by dimension."""
     lines: list[str] = []
     lines.append("=" * 78)
     lines.append(" QUESTIONNAIRE SCORES (1-5 Likert, higher = better)")
@@ -132,7 +149,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:

     # Aggregate scores across all tasks
     agg: dict[str, dict[str, list[float]]] = {}
-    for key, _ in LIKERT_QUESTIONS:
+    for key, _, _ in LIKERT_QUESTIONS:
         agg[key] = {"bash": [], "lush": []}

     for r in results:
@@ -143,7 +160,15 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
             if val is not None:
                 agg[key][lang].append(val)

-    for key, label in LIKERT_QUESTIONS:
+    # Group by dimension
+    current_dim = None
+    for key, label, dimension in LIKERT_QUESTIONS:
+        if dimension != current_dim:
+            if current_dim is not None:
+                lines.append("")
+            lines.append(f" [{dimension}]")
+            current_dim = dimension
+
         b_vals = agg[key]["bash"]
         l_vals = agg[key]["lush"]
         b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -151,10 +176,9 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
         diff = l_avg - b_avg
         diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"

-        lines.append(f" {label}")
-        lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
-        lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")
-        lines.append("")
+        lines.append(f" {label}")
+        lines.append(f" bash {_bar(b_avg)} {b_avg:.1f}")
+        lines.append(f" lush {_bar(l_avg)} {l_avg:.1f} ({diff_str})")

     # Overall average
     all_bash = [v for key in agg for v in agg[key]["bash"]]
@@ -164,6 +188,7 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
     diff = l_overall - b_overall
     diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"

+    lines.append("")
     lines.append(" " + "-" * 50)
     lines.append(f" Overall average")
     lines.append(f" bash {_bar(b_overall)} {b_overall:.1f}")
@@ -244,7 +269,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
         lines.append(f" {cat}")

         agg: dict[str, dict[str, list[float]]] = {}
-        for key, _ in LIKERT_QUESTIONS:
+        for key, _, _ in LIKERT_QUESTIONS:
             agg[key] = {"bash": [], "lush": []}
         for r in cat_results:
             scores = _get_likert_scores(r)
@@ -254,7 +279,7 @@ def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
                 if val is not None:
                     agg[key][lang].append(val)

-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
             b_vals = agg[key]["bash"]
             l_vals = agg[key]["lush"]
             b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
@@ -284,7 +309,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
         scores = _get_likert_scores(r)
         lines.append(f" {'Metric':<22s} {'Bash':>4s} {'Lush':>4s} {'Diff':>5s}")
         lines.append(" " + "-" * 40)
-        for key, label in LIKERT_QUESTIONS:
+        for key, label, _ in LIKERT_QUESTIONS:
             b_val = scores[key]["bash"]
             l_val = scores[key]["lush"]
             b_str = f"{b_val:.0f}" if b_val is not None else "-"
@@ -296,15 +321,6 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
                 d_str = "-"
             lines.append(f" {label:<22s} {b_str:>4s} {l_str:>4s} {d_str:>5s}")

-        # Free-form observations
-        obs = _get_freeform(r)
-        if obs:
-            lines.append("")
-            for lang, text in obs.items():
-                # Wrap long text
-                wrapped = text[:120] + ("..." if len(text) > 120 else "")
-                lines.append(f" {lang}: {wrapped}")
-
         lines.append("")

     return "\n".join(lines)