Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by dimension (syntax, expressiveness, data/IO, errors, overall); drop the free-form question. Responses are now stored as ints, not strings.
- Add a back-compat layer that maps legacy keys to the new dimensions so existing results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers) and add a thread-safe min-request-interval rate limiter to the Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter, locale_weather_url, network_info_parser, url_normalizer.
This commit is contained in:
@@ -13,7 +13,6 @@ import matplotlib.ticker as ticker
|
||||
from .models import BenchmarkResult
|
||||
from .report import (
|
||||
LIKERT_QUESTIONS,
|
||||
_get_freeform,
|
||||
_get_likert_scores,
|
||||
_parse_likert,
|
||||
load_latest_results,
|
||||
@@ -35,7 +34,7 @@ def _fig_to_base64(fig: plt.Figure) -> str:
|
||||
def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, float]]:
|
||||
"""Return {question_key: {bash: avg, lush: avg}}."""
|
||||
agg: dict[str, dict[str, list[float]]] = {}
|
||||
for key, _ in LIKERT_QUESTIONS:
|
||||
for key, _, _ in LIKERT_QUESTIONS:
|
||||
agg[key] = {"bash": [], "lush": []}
|
||||
for r in results:
|
||||
scores = _get_likert_scores(r)
|
||||
@@ -56,11 +55,11 @@ def _aggregate_likert(results: list[BenchmarkResult]) -> dict[str, dict[str, flo
|
||||
def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||
"""Grouped horizontal bar chart comparing bash vs lush on each Likert metric."""
|
||||
avgs = _aggregate_likert(results)
|
||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
||||
bash_vals = [avgs[key]["bash"] for key, _ in LIKERT_QUESTIONS]
|
||||
lush_vals = [avgs[key]["lush"] for key, _ in LIKERT_QUESTIONS]
|
||||
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||
bash_vals = [avgs[key]["bash"] for key, _, _ in LIKERT_QUESTIONS]
|
||||
lush_vals = [avgs[key]["lush"] for key, _, _ in LIKERT_QUESTIONS]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(8, 4.5))
|
||||
fig, ax = plt.subplots(figsize=(8, 7))
|
||||
y = range(len(labels))
|
||||
bar_h = 0.35
|
||||
bars_bash = ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||
@@ -112,14 +111,14 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
|
||||
|
||||
def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||
"""Heatmap showing lush-minus-bash score diff per task and metric."""
|
||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
||||
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||
tasks = [r.task_name for r in results]
|
||||
|
||||
data: list[list[float]] = []
|
||||
for r in results:
|
||||
scores = _get_likert_scores(r)
|
||||
row = []
|
||||
for key, _ in LIKERT_QUESTIONS:
|
||||
for key, _, _ in LIKERT_QUESTIONS:
|
||||
b = scores[key]["bash"]
|
||||
l = scores[key]["lush"]
|
||||
if b is not None and l is not None:
|
||||
@@ -128,11 +127,11 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||
row.append(0.0)
|
||||
data.append(row)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(8, max(4, len(tasks) * 0.45 + 1)))
|
||||
fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45 + 1)))
|
||||
im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=-3, vmax=3)
|
||||
|
||||
ax.set_xticks(range(len(labels)))
|
||||
ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
|
||||
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
|
||||
ax.set_yticks(range(len(tasks)))
|
||||
ax.set_yticklabels(tasks, fontsize=8)
|
||||
|
||||
@@ -140,7 +139,7 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||
for j in range(len(labels)):
|
||||
val = data[i][j]
|
||||
text = f"+{val:.0f}" if val > 0 else f"{val:.0f}" if val < 0 else "0"
|
||||
ax.text(j, i, text, ha="center", va="center", fontsize=8,
|
||||
ax.text(j, i, text, ha="center", va="center", fontsize=7,
|
||||
color="white" if abs(val) >= 2 else "black")
|
||||
|
||||
ax.set_title("Score Difference (Lush - Bash)")
|
||||
@@ -197,7 +196,7 @@ def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
|
||||
|
||||
|
||||
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
|
||||
"""Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush."""
|
||||
"""Small-multiples bar charts: one per category showing 12 Likert dimensions for bash vs lush."""
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
@@ -206,12 +205,12 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
||||
by_cat[r.category].append(r)
|
||||
|
||||
charts: list[tuple[str, str]] = []
|
||||
labels = [label for _, label in LIKERT_QUESTIONS]
|
||||
labels = [label for _, label, _ in LIKERT_QUESTIONS]
|
||||
|
||||
for cat in sorted(by_cat):
|
||||
cat_results = by_cat[cat]
|
||||
agg: dict[str, dict[str, list[float]]] = {}
|
||||
for key, _ in LIKERT_QUESTIONS:
|
||||
for key, _, _ in LIKERT_QUESTIONS:
|
||||
agg[key] = {"bash": [], "lush": []}
|
||||
for r in cat_results:
|
||||
scores = _get_likert_scores(r)
|
||||
@@ -221,10 +220,10 @@ def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str,
|
||||
if val is not None:
|
||||
agg[key][lang].append(val)
|
||||
|
||||
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
||||
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]
|
||||
bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||
lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _, _ in LIKERT_QUESTIONS]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(6, 3.5))
|
||||
fig, ax = plt.subplots(figsize=(7, 5))
|
||||
y = range(len(labels))
|
||||
bar_h = 0.35
|
||||
ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
|
||||
@@ -337,7 +336,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
||||
|
||||
scores = _get_likert_scores(r)
|
||||
score_rows = []
|
||||
for key, label in LIKERT_QUESTIONS:
|
||||
for key, label, _ in LIKERT_QUESTIONS:
|
||||
b_val = scores[key]["bash"]
|
||||
l_val = scores[key]["lush"]
|
||||
b_str = f"{b_val:.0f}" if b_val is not None else "-"
|
||||
@@ -353,11 +352,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
||||
f'<td>{b_str}</td><td>{l_str}</td>'
|
||||
f'<td class="{d_cls}">{d_str}</td></tr>')
|
||||
|
||||
obs = _get_freeform(r)
|
||||
obs_html = ""
|
||||
for lang, text in obs.items():
|
||||
obs_html += f'<p><strong>{lang}:</strong> {html.escape(text)}</p>\n'
|
||||
|
||||
sections.append(f"""
|
||||
<div class="task-detail">
|
||||
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
||||
@@ -368,7 +362,6 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
||||
<thead><tr><th>Metric</th><th>Bash</th><th>Lush</th><th>Diff</th></tr></thead>
|
||||
<tbody>{"".join(score_rows)}</tbody>
|
||||
</table>
|
||||
<div class="observations">{obs_html}</div>
|
||||
</div>""")
|
||||
|
||||
return "\n".join(sections)
|
||||
@@ -424,8 +417,6 @@ def export_html(results_dir: Path, output_path: Path) -> None:
|
||||
.scores {{ width: auto; }}
|
||||
.scores td:nth-child(n+2) {{ text-align: center; min-width: 50px; }}
|
||||
.scores th:nth-child(n+2) {{ text-align: center; }}
|
||||
.observations {{ margin-top: 12px; font-size: 0.85rem; color: #444; }}
|
||||
.observations p {{ margin-bottom: 6px; }}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
Reference in New Issue
Block a user