Reorganize task categories from opaque a/b to descriptive names
Replace category_a/category_b directories with algorithm, pipeline, environment, filesystem, and process. Add separate mode field (solve/convert) to decouple orchestration from capability grouping. Add per-category summary and questionnaire breakdowns to both terminal report and HTML export.
This commit is contained in:
@@ -112,7 +112,7 @@ def solve_task(
|
||||
"""Run the agent loop: prompt -> code -> test -> retry."""
|
||||
system = build_system_prompt(language)
|
||||
|
||||
if task.category == "b" and language == "lush":
|
||||
if task.mode == "convert" and language == "lush":
|
||||
user_prompt = build_conversion_prompt(task)
|
||||
else:
|
||||
user_prompt = build_task_prompt(task, language)
|
||||
|
||||
@@ -88,11 +88,11 @@ def chart_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||
|
||||
def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
|
||||
"""Bar chart of agent turns per task for bash vs lush."""
|
||||
# Only include tasks where the agent actually solved (turns > 0)
|
||||
cat_a = [r for r in results if r.category == "a"]
|
||||
names = [r.task_name for r in cat_a]
|
||||
bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in cat_a]
|
||||
lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in cat_a]
|
||||
# Only include tasks where the agent actually solved (solve mode)
|
||||
solve = [r for r in results if r.mode == "solve"]
|
||||
names = [r.task_name for r in solve]
|
||||
bash_turns = [r.bash_result.agent_turns if r.bash_result else 0 for r in solve]
|
||||
lush_turns = [r.lush_result.agent_turns if r.lush_result else 0 for r in solve]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(8, 4))
|
||||
x = range(len(names))
|
||||
@@ -103,7 +103,7 @@ def chart_turns_comparison(results: list[BenchmarkResult]) -> str:
|
||||
ax.set_xticks(list(x))
|
||||
ax.set_xticklabels(names, rotation=35, ha="right", fontsize=8)
|
||||
ax.set_ylabel("Agent Turns")
|
||||
ax.set_title("Agent Turns to Solve (Category A)")
|
||||
ax.set_title("Agent Turns to Solve (Solve Mode)")
|
||||
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
|
||||
ax.legend()
|
||||
ax.grid(axis="y", alpha=0.3)
|
||||
@@ -148,6 +148,151 @@ def chart_per_task_heatmap(results: list[BenchmarkResult]) -> str:
|
||||
return _fig_to_base64(fig)
|
||||
|
||||
|
||||
def chart_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
    """Grouped bar chart: one cluster per category, bars for bash/lush avg scores."""
    from collections import defaultdict

    # Bucket the results by their task category.
    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for result in results:
        grouped[result.category].append(result)

    categories = sorted(grouped)

    def _pooled_avg(cat: str, lang: str) -> float:
        # Pool every available Likert answer for this category/language and average it.
        vals: list[float] = []
        for result in grouped[cat]:
            likert = _get_likert_scores(result)
            for key in likert:
                if likert[key][lang] is not None:
                    vals.append(likert[key][lang])
        return sum(vals) / len(vals) if vals else 0.0

    bash_avgs = [_pooled_avg(cat, "bash") for cat in categories]
    lush_avgs = [_pooled_avg(cat, "lush") for cat in categories]

    fig, ax = plt.subplots(figsize=(8, 4))
    positions = range(len(categories))
    width = 0.35
    bash_bars = ax.bar([p - width / 2 for p in positions], bash_avgs, width, label="bash", color=BASH_COLOR)
    lush_bars = ax.bar([p + width / 2 for p in positions], lush_avgs, width, label="lush", color=LUSH_COLOR)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(categories, fontsize=9)
    ax.set_ylim(0, 5.5)
    ax.set_ylabel("Avg Score (1-5)")
    ax.set_title("Questionnaire Scores by Category")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    # Annotate each bar with its value, slightly above the bar top.
    for bar in list(bash_bars) + list(lush_bars):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.08,
            f"{bar.get_height():.1f}",
            ha="center",
            va="bottom",
            fontsize=8,
        )

    return _fig_to_base64(fig)
|
||||
|
||||
|
||||
def chart_per_category_radar(results: list[BenchmarkResult]) -> list[tuple[str, str]]:
    """Small-multiples bar charts: one per category showing 6 Likert dimensions for bash vs lush.

    Returns (category, base64-encoded PNG) pairs ordered by category name.
    """
    # NOTE: removed an unused `import numpy as np` — nothing in this function used it.
    from collections import defaultdict

    by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)

    charts: list[tuple[str, str]] = []
    labels = [label for _, label in LIKERT_QUESTIONS]

    for cat in sorted(by_cat):
        cat_results = by_cat[cat]
        # Collect every non-None score per question key, split by language.
        agg: dict[str, dict[str, list[float]]] = {}
        for key, _ in LIKERT_QUESTIONS:
            agg[key] = {"bash": [], "lush": []}
        for r in cat_results:
            scores = _get_likert_scores(r)
            for key in scores:
                for lang in ("bash", "lush"):
                    val = scores[key][lang]
                    if val is not None:
                        agg[key][lang].append(val)

        # Per-question averages; 0.0 when a question received no answers.
        bash_vals = [sum(agg[k]["bash"]) / len(agg[k]["bash"]) if agg[k]["bash"] else 0.0 for k, _ in LIKERT_QUESTIONS]
        lush_vals = [sum(agg[k]["lush"]) / len(agg[k]["lush"]) if agg[k]["lush"] else 0.0 for k, _ in LIKERT_QUESTIONS]

        fig, ax = plt.subplots(figsize=(6, 3.5))
        y = range(len(labels))
        bar_h = 0.35
        # Paired horizontal bars per question; invert_yaxis below puts the
        # first question at the top of the chart.
        ax.barh([i + bar_h / 2 for i in y], bash_vals, bar_h, label="bash", color=BASH_COLOR)
        ax.barh([i - bar_h / 2 for i in y], lush_vals, bar_h, label="lush", color=LUSH_COLOR)
        ax.set_yticks(list(y))
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlim(0, 5.5)
        ax.set_title(f"{cat}", fontsize=10)
        ax.legend(fontsize=8, loc="lower right")
        ax.invert_yaxis()
        ax.grid(axis="x", alpha=0.3)

        charts.append((cat, _fig_to_base64(fig)))

    return charts
|
||||
|
||||
|
||||
def _build_per_category_summary_html(results: list[BenchmarkResult]) -> str:
    """HTML table: rows=categories, columns=bash/lush pass rate, turns, scores."""
    from collections import defaultdict

    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for result in results:
        grouped[result.category].append(result)

    row_html: list[str] = []
    for cat in sorted(grouped):
        members = grouped[cat]

        # Pass counts over the results that actually ran each language.
        bash_done = [r for r in members if r.bash_result]
        lush_done = [r for r in members if r.lush_result]
        bash_passed = sum(1 for r in bash_done if r.bash_result.all_passed)
        lush_passed = sum(1 for r in lush_done if r.lush_result.all_passed)

        # Average agent turns, counting only runs that took at least one turn.
        bash_turns = [r.bash_result.agent_turns for r in bash_done if r.bash_result.agent_turns > 0]
        lush_turns = [r.lush_result.agent_turns for r in lush_done if r.lush_result.agent_turns > 0]
        avg_bash_turns = sum(bash_turns) / len(bash_turns) if bash_turns else 0.0
        avg_lush_turns = sum(lush_turns) / len(lush_turns) if lush_turns else 0.0

        # Pool every Likert answer in the category, per language.
        bash_scores: list[float] = []
        lush_scores: list[float] = []
        for result in members:
            likert = _get_likert_scores(result)
            for key in likert:
                if likert[key]["bash"] is not None:
                    bash_scores.append(likert[key]["bash"])
                if likert[key]["lush"] is not None:
                    lush_scores.append(likert[key]["lush"])
        avg_bash_score = sum(bash_scores) / len(bash_scores) if bash_scores else 0.0
        avg_lush_score = sum(lush_scores) / len(lush_scores) if lush_scores else 0.0

        row_html.append(f"""<tr>
<td>{html.escape(cat)}</td>
<td>{bash_passed}/{len(bash_done)}</td><td>{lush_passed}/{len(lush_done)}</td>
<td>{avg_bash_turns:.1f}</td><td>{avg_lush_turns:.1f}</td>
<td>{avg_bash_score:.1f}</td><td>{avg_lush_score:.1f}</td>
</tr>""")

    return f"""<table>
<thead><tr>
<th>Category</th>
<th>Bash Pass</th><th>Lush Pass</th>
<th>Bash Avg Turns</th><th>Lush Avg Turns</th>
<th>Bash Avg Score</th><th>Lush Avg Score</th>
</tr></thead>
<tbody>{"".join(row_html)}</tbody>
</table>"""
|
||||
|
||||
|
||||
def _build_summary_html(results: list[BenchmarkResult]) -> str:
|
||||
rows = []
|
||||
for r in results:
|
||||
@@ -160,7 +305,7 @@ def _build_summary_html(results: list[BenchmarkResult]) -> str:
|
||||
b_turns = str(b.agent_turns) if b else "-"
|
||||
l_turns = str(l.agent_turns) if l else "-"
|
||||
rows.append(f"""<tr>
|
||||
<td>{html.escape(r.task_name)}</td><td>{r.category.upper()}</td>
|
||||
<td>{html.escape(r.task_name)}</td><td>{html.escape(r.category)}</td>
|
||||
<td class="{b_cls}">{b_pass}</td><td>{b_turns}</td>
|
||||
<td class="{l_cls}">{l_pass}</td><td>{l_turns}</td>
|
||||
</tr>""")
|
||||
@@ -215,7 +360,7 @@ def _build_detail_html(results: list[BenchmarkResult]) -> str:
|
||||
|
||||
sections.append(f"""
|
||||
<div class="task-detail">
|
||||
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}]</span>
|
||||
<h3>{html.escape(r.task_name)} <span class="cat">[{r.category}/{r.mode}]</span>
|
||||
<span class="{"pass" if b_status == "PASS" else "fail"}">bash={b_status}</span>
|
||||
<span class="{"pass" if l_status == "PASS" else "fail"}">lush={l_status}</span>
|
||||
</h3>
|
||||
@@ -238,7 +383,10 @@ def export_html(results_dir: Path, output_path: Path) -> None:
|
||||
chart_questionnaire = chart_questionnaire_comparison(results)
|
||||
chart_turns = chart_turns_comparison(results)
|
||||
chart_heatmap = chart_per_task_heatmap(results)
|
||||
chart_cat_quest = chart_per_category_questionnaire(results)
|
||||
cat_radar_charts = chart_per_category_radar(results)
|
||||
summary_table = _build_summary_html(results)
|
||||
cat_summary_table = _build_per_category_summary_html(results)
|
||||
detail_html = _build_detail_html(results)
|
||||
|
||||
model = results[0].model if results else "unknown"
|
||||
@@ -288,15 +436,24 @@ def export_html(results_dir: Path, output_path: Path) -> None:
|
||||
<h2>Summary</h2>
|
||||
{summary_table}
|
||||
|
||||
<h2>Per-Category Summary</h2>
|
||||
{cat_summary_table}
|
||||
|
||||
<h2>Questionnaire Scores</h2>
|
||||
<div class="chart"><img src="data:image/png;base64,{chart_questionnaire}" alt="Questionnaire comparison"></div>
|
||||
|
||||
<h2>Agent Turns (Category A)</h2>
|
||||
<h2>Questionnaire Scores by Category</h2>
|
||||
<div class="chart"><img src="data:image/png;base64,{chart_cat_quest}" alt="Per-category questionnaire"></div>
|
||||
|
||||
<h2>Agent Turns (Solve Mode)</h2>
|
||||
<div class="chart"><img src="data:image/png;base64,{chart_turns}" alt="Turns comparison"></div>
|
||||
|
||||
<h2>Score Difference Heatmap (Lush - Bash)</h2>
|
||||
<div class="chart"><img src="data:image/png;base64,{chart_heatmap}" alt="Score heatmap"></div>
|
||||
|
||||
<h2>Per-Category Breakdown</h2>
|
||||
{"".join(f'<h3>{cat}</h3><div class="chart"><img src="data:image/png;base64,{img}" alt="{cat} breakdown"></div>' for cat, img in cat_radar_charts)}
|
||||
|
||||
<h2>Per-Task Detail</h2>
|
||||
{detail_html}
|
||||
|
||||
|
||||
@@ -36,15 +36,17 @@ class TestCase:
|
||||
@dataclass
|
||||
class Task:
|
||||
name: str
|
||||
category: str # "a" or "b"
|
||||
category: str # "algorithm", "pipeline", "environment", "filesystem", "process"
|
||||
description: str
|
||||
test_cases: list[TestCase]
|
||||
bash_source: str | None = None # category B only
|
||||
mode: str = "solve" # "solve" or "convert"
|
||||
bash_source: str | None = None # convert mode only
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
d: dict[str, Any] = {
|
||||
"name": self.name,
|
||||
"category": self.category,
|
||||
"mode": self.mode,
|
||||
"description": self.description,
|
||||
"test_cases": [tc.to_dict() for tc in self.test_cases],
|
||||
}
|
||||
@@ -59,6 +61,7 @@ class Task:
|
||||
category=d["category"],
|
||||
description=d["description"],
|
||||
test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
|
||||
mode=d.get("mode", "solve"),
|
||||
bash_source=d.get("bash_source"),
|
||||
)
|
||||
|
||||
@@ -180,16 +183,18 @@ class QuestionnaireResponse:
|
||||
class BenchmarkResult:
|
||||
task_name: str
|
||||
category: str
|
||||
provider: str
|
||||
model: str
|
||||
timestamp: str
|
||||
bash_result: LanguageResult | None
|
||||
lush_result: LanguageResult | None
|
||||
mode: str = "solve" # "solve" or "convert"
|
||||
provider: str = ""
|
||||
model: str = ""
|
||||
timestamp: str = ""
|
||||
bash_result: LanguageResult | None = None
|
||||
lush_result: LanguageResult | None = None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"task_name": self.task_name,
|
||||
"category": self.category,
|
||||
"mode": self.mode,
|
||||
"provider": self.provider,
|
||||
"model": self.model,
|
||||
"timestamp": self.timestamp,
|
||||
@@ -202,6 +207,7 @@ class BenchmarkResult:
|
||||
return cls(
|
||||
task_name=d["task_name"],
|
||||
category=d["category"],
|
||||
mode=d.get("mode", "solve"),
|
||||
provider=d["provider"],
|
||||
model=d["model"],
|
||||
timestamp=d["timestamp"],
|
||||
|
||||
@@ -86,11 +86,11 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
|
||||
lines.append("=" * 78)
|
||||
lines.append("")
|
||||
|
||||
header = f" {'Task':<22s} {'Cat':>3s} {'Bash':^14s} {'Lush':^14s}"
|
||||
header = f" {'Task':<22s} {'Category':<12s} {'Mode':<8s} {'Bash':^14s} {'Lush':^14s}"
|
||||
lines.append(header)
|
||||
sub = f" {'':<22s} {'':>3s} {'pass turns':^14s} {'pass turns':^14s}"
|
||||
sub = f" {'':<22s} {'':<12s} {'':<8s} {'pass turns':^14s} {'pass turns':^14s}"
|
||||
lines.append(sub)
|
||||
lines.append(" " + "-" * 60)
|
||||
lines.append(" " + "-" * 74)
|
||||
|
||||
for r in results:
|
||||
b = r.bash_result
|
||||
@@ -99,7 +99,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
|
||||
l_pass = "PASS" if l and l.all_passed else "FAIL" if l else "-"
|
||||
b_turns = str(b.agent_turns) if b else "-"
|
||||
l_turns = str(l.agent_turns) if l else "-"
|
||||
lines.append(f" {r.task_name:<22s} [{r.category}] {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
|
||||
lines.append(f" {r.task_name:<22s} {r.category:<12s} {r.mode:<8s} {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
|
||||
|
||||
# Totals
|
||||
b_passed = sum(1 for r in results if r.bash_result and r.bash_result.all_passed)
|
||||
@@ -115,9 +115,9 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
|
||||
if l_turn_counts:
|
||||
l_turns_avg = sum(l_turn_counts) / len(l_turn_counts)
|
||||
|
||||
lines.append(" " + "-" * 60)
|
||||
lines.append(f" {'TOTAL':<22s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
|
||||
lines.append(f" {'':27s}{'pass avg turns':^14s} {'pass avg turns':^14s}")
|
||||
lines.append(" " + "-" * 74)
|
||||
lines.append(f" {'TOTAL':<22s} {'':<12s} {'':<8s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
|
||||
lines.append(f" {'':<22s} {'':<12s} {'':<8s} {'pass avg turns':^14s} {'pass avg turns':^14s}")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
@@ -172,6 +172,101 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def render_per_category_summary(results: list[BenchmarkResult]) -> str:
    """Render per-category breakdown: pass rates, avg turns, avg questionnaire scores."""
    from collections import defaultdict

    out: list[str] = []
    out.append("=" * 78)
    out.append(" PER-CATEGORY SUMMARY")
    out.append("=" * 78)
    out.append("")

    # Bucket results by task category.
    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for result in results:
        grouped[result.category].append(result)

    out.append(
        f" {'Category':<12s} {'Bash pass':>9s} {'Lush pass':>9s} "
        f"{'B turns':>7s} {'L turns':>7s} {'B score':>7s} {'L score':>7s}"
    )
    out.append(" " + "-" * 70)

    for cat in sorted(grouped):
        members = grouped[cat]

        # Pass counts over the results that actually ran each language.
        bash_passed = sum(1 for r in members if r.bash_result and r.bash_result.all_passed)
        lush_passed = sum(1 for r in members if r.lush_result and r.lush_result.all_passed)
        bash_total = sum(1 for r in members if r.bash_result)
        lush_total = sum(1 for r in members if r.lush_result)

        # Average agent turns, counting only runs that took at least one turn.
        bash_turns = [r.bash_result.agent_turns for r in members if r.bash_result and r.bash_result.agent_turns > 0]
        lush_turns = [r.lush_result.agent_turns for r in members if r.lush_result and r.lush_result.agent_turns > 0]
        avg_bash_turns = sum(bash_turns) / len(bash_turns) if bash_turns else 0.0
        avg_lush_turns = sum(lush_turns) / len(lush_turns) if lush_turns else 0.0

        # Pool every Likert answer in the category, per language.
        bash_scores: list[float] = []
        lush_scores: list[float] = []
        for result in members:
            likert = _get_likert_scores(result)
            for key in likert:
                if likert[key]["bash"] is not None:
                    bash_scores.append(likert[key]["bash"])
                if likert[key]["lush"] is not None:
                    lush_scores.append(likert[key]["lush"])
        avg_bash_score = sum(bash_scores) / len(bash_scores) if bash_scores else 0.0
        avg_lush_score = sum(lush_scores) / len(lush_scores) if lush_scores else 0.0

        out.append(
            f" {cat:<12s} {bash_passed}/{bash_total:>2d} {lush_passed}/{lush_total:>2d} "
            f"{avg_bash_turns:>5.1f} {avg_lush_turns:>5.1f} {avg_bash_score:>5.1f} {avg_lush_score:>5.1f}"
        )

    out.append("")
    return "\n".join(out)
|
||||
|
||||
|
||||
def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
    """Render per-category Likert averages."""
    from collections import defaultdict

    out: list[str] = ["=" * 78, " PER-CATEGORY QUESTIONNAIRE AVERAGES", "=" * 78, ""]

    grouped: dict[str, list[BenchmarkResult]] = defaultdict(list)
    for result in results:
        grouped[result.category].append(result)

    for cat in sorted(grouped):
        out.append(f" {cat}")

        # Accumulate raw scores per question key, split by language.
        agg: dict[str, dict[str, list[float]]] = {
            key: {"bash": [], "lush": []} for key, _ in LIKERT_QUESTIONS
        }
        for result in grouped[cat]:
            likert = _get_likert_scores(result)
            for key in likert:
                for lang in ("bash", "lush"):
                    val = likert[key][lang]
                    if val is not None:
                        agg[key][lang].append(val)

        # One line per question: bash avg, lush avg, and signed difference.
        for key, label in LIKERT_QUESTIONS:
            b_vals = agg[key]["bash"]
            l_vals = agg[key]["lush"]
            b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
            l_avg = sum(l_vals) / len(l_vals) if l_vals else 0.0
            delta = l_avg - b_avg
            if delta > 0:
                delta_str = f"+{delta:.1f}"
            elif delta < 0:
                delta_str = f"{delta:.1f}"
            else:
                delta_str = " 0.0"
            out.append(f" {label:<22s} bash={b_avg:.1f} lush={l_avg:.1f} ({delta_str})")
        out.append("")

    return "\n".join(out)
|
||||
|
||||
|
||||
def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
||||
"""Render per-task questionnaire breakdown."""
|
||||
lines: list[str] = []
|
||||
@@ -183,7 +278,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
|
||||
lines.append("")
|
||||
b_status = "PASS" if r.bash_result and r.bash_result.all_passed else "FAIL"
|
||||
l_status = "PASS" if r.lush_result and r.lush_result.all_passed else "FAIL"
|
||||
lines.append(f" {r.task_name} [{r.category}] bash={b_status} lush={l_status}")
|
||||
lines.append(f" {r.task_name} [{r.category}/{r.mode}] bash={b_status} lush={l_status}")
|
||||
lines.append("")
|
||||
|
||||
scores = _get_likert_scores(r)
|
||||
@@ -222,7 +317,9 @@ def render_report(results_dir: Path) -> str:
|
||||
|
||||
parts = [
|
||||
render_summary_table(results),
|
||||
render_per_category_summary(results),
|
||||
render_questionnaire_comparison(results),
|
||||
render_per_category_questionnaire(results),
|
||||
render_per_task_detail(results),
|
||||
]
|
||||
return "\n".join(parts)
|
||||
|
||||
34
main.py
34
main.py
@@ -39,32 +39,35 @@ def load_task(path: Path) -> Task:
|
||||
category=raw["category"],
|
||||
description=raw["description"],
|
||||
test_cases=test_cases,
|
||||
mode=raw.get("mode", "solve"),
|
||||
bash_source=raw.get("bash_source"),
|
||||
)
|
||||
|
||||
|
||||
def find_tasks(category: str | None = None) -> list[Path]:
|
||||
def find_tasks(category: str | None = None, mode: str | None = None) -> list[Path]:
|
||||
tasks_dir = Path(__file__).parent / "tasks"
|
||||
paths = []
|
||||
if category:
|
||||
cat_dir = tasks_dir / f"category_{category}"
|
||||
cat_dir = tasks_dir / category
|
||||
if cat_dir.exists():
|
||||
paths = sorted(cat_dir.glob("*.toml"))
|
||||
else:
|
||||
for cat_dir in sorted(tasks_dir.iterdir()):
|
||||
if cat_dir.is_dir():
|
||||
paths.extend(sorted(cat_dir.glob("*.toml")))
|
||||
if mode:
|
||||
paths = [p for p in paths if load_task(p).mode == mode]
|
||||
return paths
|
||||
|
||||
|
||||
def cmd_list_tasks(args: argparse.Namespace) -> None:
|
||||
paths = find_tasks(args.category)
|
||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||
if not paths:
|
||||
print("No tasks found.")
|
||||
return
|
||||
for p in paths:
|
||||
task = load_task(p)
|
||||
print(f" [{task.category}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||
print(f" [{task.category:<12s} {task.mode:<7s}] {task.name:20s} {p.relative_to(Path.cwd())}")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
|
||||
@@ -81,13 +84,13 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
provider = PROVIDERS[provider_name](provider_config)
|
||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
print(f"Running task: {task.name} (category {task.category}) with {provider.model_name}")
|
||||
print(f"Running task: {task.name} ({task.category}/{task.mode}) with {provider.model_name}")
|
||||
|
||||
bash_result = None
|
||||
lush_result = None
|
||||
|
||||
if task.category == "a":
|
||||
# Category A: solve in both languages
|
||||
if task.mode == "solve":
|
||||
# Solve mode: agent writes code in both languages
|
||||
print(" Solving in bash...")
|
||||
bash_result = solve_task(provider, task, "bash", config)
|
||||
print(f" Bash: {'PASS' if bash_result.all_passed else 'FAIL'} ({bash_result.agent_turns} turns)")
|
||||
@@ -96,9 +99,9 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
lush_result = solve_task(provider, task, "lush", config)
|
||||
print(f" Lush: {'PASS' if lush_result.all_passed else 'FAIL'} ({lush_result.agent_turns} turns)")
|
||||
|
||||
elif task.category == "b":
|
||||
# Category B: verify provided bash source directly, then convert to lush
|
||||
assert task.bash_source, f"Category B task {task.name} missing bash_source"
|
||||
elif task.mode == "convert":
|
||||
# Convert mode: verify provided bash source directly, then convert to lush
|
||||
assert task.bash_source, f"Convert-mode task {task.name} missing bash_source"
|
||||
print(" Verifying provided bash source...")
|
||||
test_results = evaluate(task, task.bash_source, "bash", config)
|
||||
all_passed = all(tr.passed for tr in test_results)
|
||||
@@ -124,6 +127,7 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
benchmark = BenchmarkResult(
|
||||
task_name=task.name,
|
||||
category=task.category,
|
||||
mode=task.mode,
|
||||
provider=provider_name,
|
||||
model=provider.model_name,
|
||||
timestamp=timestamp,
|
||||
@@ -136,7 +140,7 @@ def cmd_run(args: argparse.Namespace) -> None:
|
||||
|
||||
|
||||
def cmd_run_all(args: argparse.Namespace) -> None:
|
||||
paths = find_tasks(args.category)
|
||||
paths = find_tasks(args.category, getattr(args, "mode", None))
|
||||
if not paths:
|
||||
print("No tasks found.")
|
||||
return
|
||||
@@ -164,7 +168,8 @@ def main() -> None:
|
||||
|
||||
# list-tasks
|
||||
ls = sub.add_parser("list-tasks", help="List available tasks")
|
||||
ls.add_argument("--category", choices=["a", "b"], help="Filter by category")
|
||||
ls.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||
ls.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||
ls.set_defaults(func=cmd_list_tasks)
|
||||
|
||||
# run
|
||||
@@ -174,8 +179,9 @@ def main() -> None:
|
||||
run.set_defaults(func=cmd_run)
|
||||
|
||||
# run-all
|
||||
ra = sub.add_parser("run-all", help="Run all tasks in a category")
|
||||
ra.add_argument("--category", choices=["a", "b"], help="Category to run")
|
||||
ra = sub.add_parser("run-all", help="Run all tasks (optionally filtered)")
|
||||
ra.add_argument("--category", choices=["algorithm", "pipeline", "environment", "filesystem", "process"], help="Filter by category")
|
||||
ra.add_argument("--mode", choices=["solve", "convert"], help="Filter by mode")
|
||||
ra.add_argument("--provider", default="anthropic", help="LLM provider")
|
||||
ra.set_defaults(func=cmd_run_all)
|
||||
|
||||
|
||||
274
report.html
274
report.html
File diff suppressed because one or more lines are too long
@@ -1,5 +1,6 @@
|
||||
name = "fizzbuzz"
|
||||
category = "a"
|
||||
category = "algorithm"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read a single integer N from stdin. Print numbers from 1 to N, one per line.
|
||||
For multiples of 3, print "Fizz" instead of the number.
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "reverse_string"
|
||||
category = "a"
|
||||
category = "algorithm"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read a single line from stdin and print it reversed to stdout.
|
||||
"""
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "two_sum"
|
||||
category = "a"
|
||||
category = "algorithm"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read input from stdin. The first line contains a target integer.
|
||||
The second line contains space-separated integers (the array).
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "env_config"
|
||||
category = "a"
|
||||
category = "environment"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read a config format from stdin where each line is "KEY=VALUE".
|
||||
For each line, set an environment variable with that key and value.
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "env_path_builder"
|
||||
category = "b"
|
||||
category = "environment"
|
||||
mode = "convert"
|
||||
description = """
|
||||
Read directory paths from stdin, one per line.
|
||||
Append each to the MYPATH environment variable (colon-separated), skipping duplicates.
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "file_organizer"
|
||||
category = "a"
|
||||
category = "filesystem"
|
||||
mode = "solve"
|
||||
description = """
|
||||
You are given a working directory containing several files with extensions.
|
||||
Read a list of extension-to-directory mappings from stdin, one per line, in the format:
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "multi_file_search"
|
||||
category = "a"
|
||||
category = "filesystem"
|
||||
mode = "solve"
|
||||
description = """
|
||||
You are given a working directory containing several text files.
|
||||
Read a search pattern (a simple string, not regex) from stdin.
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "csv_transform"
|
||||
category = "b"
|
||||
category = "pipeline"
|
||||
mode = "convert"
|
||||
description = """
|
||||
Read CSV data from stdin. The first line is a header.
|
||||
Each subsequent line has fields: name,age,city
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "log_parser"
|
||||
category = "b"
|
||||
category = "pipeline"
|
||||
mode = "convert"
|
||||
description = """
|
||||
Read log lines from stdin. Each line has the format: "LEVEL: message"
|
||||
where LEVEL is one of ERROR, WARN, INFO.
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "pipeline_transform"
|
||||
category = "a"
|
||||
category = "pipeline"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read lines from stdin. Build a pipeline that:
|
||||
1. Filters to only lines containing the word "error" (case-insensitive)
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "pipeline_word_freq"
|
||||
category = "b"
|
||||
category = "pipeline"
|
||||
mode = "convert"
|
||||
description = """
|
||||
Read text from stdin. Count the frequency of each word (case-insensitive, only alphabetic characters count as words).
|
||||
Print the top 5 most frequent words in descending order of frequency, in the format:
|
||||
@@ -1,5 +1,6 @@
|
||||
name = "process_exit_codes"
|
||||
category = "a"
|
||||
category = "process"
|
||||
mode = "solve"
|
||||
description = """
|
||||
Read commands from stdin, one per line. Execute each command as a subprocess.
|
||||
For each command, print: "command: exit_code" where command is the original command text
|
||||
Reference in New Issue
Block a user