Reorganize task categories from opaque a/b to descriptive names
Replace the category_a/category_b directories with algorithm, pipeline, environment, filesystem, and process. Add a separate mode field (solve/convert) so orchestration is decoupled from capability grouping. Add per-category summary and questionnaire breakdowns to both the terminal report and the HTML export.
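The diff below reads task_name, category, mode, bash_result, and lush_result off each result. As a reading aid, here is a minimal sketch of the shape those accesses imply — the RunResult name, the dataclass framing, and the comments are assumptions, not the repo's actual definitions:

from __future__ import annotations

from dataclasses import dataclass

@dataclass
class RunResult:  # hypothetical name; the diff only touches it via bash_result/lush_result
    all_passed: bool   # True when every check for the task passed
    agent_turns: int   # turns the agent took; the diff treats 0 as "not recorded"

@dataclass
class BenchmarkResult:
    task_name: str
    category: str                  # algorithm | pipeline | environment | filesystem | process
    mode: str                      # solve | convert (orchestration, decoupled from category)
    bash_result: RunResult | None  # None when that run is missing
    lush_result: RunResult | None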
@@ -86,11 +86,11 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
     lines.append("=" * 78)
     lines.append("")
 
-    header = f" {'Task':<22s} {'Cat':>3s} {'Bash':^14s} {'Lush':^14s}"
+    header = f" {'Task':<22s} {'Category':<12s} {'Mode':<8s} {'Bash':^14s} {'Lush':^14s}"
     lines.append(header)
-    sub = f" {'':<22s} {'':>3s} {'pass turns':^14s} {'pass turns':^14s}"
+    sub = f" {'':<22s} {'':<12s} {'':<8s} {'pass turns':^14s} {'pass turns':^14s}"
     lines.append(sub)
-    lines.append(" " + "-" * 60)
+    lines.append(" " + "-" * 74)
 
     for r in results:
         b = r.bash_result
@@ -99,7 +99,7 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
         l_pass = "PASS" if l and l.all_passed else "FAIL" if l else "-"
         b_turns = str(b.agent_turns) if b else "-"
         l_turns = str(l.agent_turns) if l else "-"
-        lines.append(f" {r.task_name:<22s} [{r.category}] {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
+        lines.append(f" {r.task_name:<22s} {r.category:<12s} {r.mode:<8s} {b_pass:>4s} {b_turns:>5s} {l_pass:>4s} {l_turns:>5s}")
 
     # Totals
     b_passed = sum(1 for r in results if r.bash_result and r.bash_result.all_passed)
@@ -115,9 +115,9 @@ def render_summary_table(results: list[BenchmarkResult]) -> str:
     if l_turn_counts:
         l_turns_avg = sum(l_turn_counts) / len(l_turn_counts)
 
-    lines.append(" " + "-" * 60)
-    lines.append(f" {'TOTAL':<22s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
-    lines.append(f" {'':27s}{'pass avg turns':^14s} {'pass avg turns':^14s}")
+    lines.append(" " + "-" * 74)
+    lines.append(f" {'TOTAL':<22s} {'':<12s} {'':<8s} {b_passed}/{b_total:>2d} {b_turns_avg:>5.1f} {l_passed}/{l_total:>2d} {l_turns_avg:>5.1f}")
+    lines.append(f" {'':<22s} {'':<12s} {'':<8s} {'pass avg turns':^14s} {'pass avg turns':^14s}")
     lines.append("")
     return "\n".join(lines)
 
@@ -172,6 +172,101 @@ def render_questionnaire_comparison(results: list[BenchmarkResult]) -> str:
     return "\n".join(lines)
 
 
+def render_per_category_summary(results: list[BenchmarkResult]) -> str:
+    """Render per-category breakdown: pass rates, avg turns, avg questionnaire scores."""
+    from collections import defaultdict
+
+    lines: list[str] = []
+    lines.append("=" * 78)
+    lines.append(" PER-CATEGORY SUMMARY")
+    lines.append("=" * 78)
+    lines.append("")
+
+    # Group by category
+    by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+    for r in results:
+        by_cat[r.category].append(r)
+
+    header = f" {'Category':<12s} {'Bash pass':>9s} {'Lush pass':>9s} {'B turns':>7s} {'L turns':>7s} {'B score':>7s} {'L score':>7s}"
+    lines.append(header)
+    lines.append(" " + "-" * 70)
+
+    for cat in sorted(by_cat):
+        cat_results = by_cat[cat]
+        b_passed = sum(1 for r in cat_results if r.bash_result and r.bash_result.all_passed)
+        l_passed = sum(1 for r in cat_results if r.lush_result and r.lush_result.all_passed)
+        b_total = sum(1 for r in cat_results if r.bash_result)
+        l_total = sum(1 for r in cat_results if r.lush_result)
+
+        b_turn_vals = [r.bash_result.agent_turns for r in cat_results if r.bash_result and r.bash_result.agent_turns > 0]
+        l_turn_vals = [r.lush_result.agent_turns for r in cat_results if r.lush_result and r.lush_result.agent_turns > 0]
+        b_turns_avg = sum(b_turn_vals) / len(b_turn_vals) if b_turn_vals else 0.0
+        l_turns_avg = sum(l_turn_vals) / len(l_turn_vals) if l_turn_vals else 0.0
+
+        # Avg questionnaire scores
+        b_scores: list[float] = []
+        l_scores: list[float] = []
+        for r in cat_results:
+            scores = _get_likert_scores(r)
+            for key in scores:
+                if scores[key]["bash"] is not None:
+                    b_scores.append(scores[key]["bash"])
+                if scores[key]["lush"] is not None:
+                    l_scores.append(scores[key]["lush"])
+        b_avg_score = sum(b_scores) / len(b_scores) if b_scores else 0.0
+        l_avg_score = sum(l_scores) / len(l_scores) if l_scores else 0.0
+
+        lines.append(
+            f" {cat:<12s} {b_passed}/{b_total:>2d} {l_passed}/{l_total:>2d} "
+            f"{b_turns_avg:>5.1f} {l_turns_avg:>5.1f} {b_avg_score:>5.1f} {l_avg_score:>5.1f}"
+        )
+
+    lines.append("")
+    return "\n".join(lines)
+
+
+def render_per_category_questionnaire(results: list[BenchmarkResult]) -> str:
+    """Render per-category Likert averages."""
+    from collections import defaultdict
+
+    lines: list[str] = []
+    lines.append("=" * 78)
+    lines.append(" PER-CATEGORY QUESTIONNAIRE AVERAGES")
+    lines.append("=" * 78)
+    lines.append("")
+
+    by_cat: dict[str, list[BenchmarkResult]] = defaultdict(list)
+    for r in results:
+        by_cat[r.category].append(r)
+
+    for cat in sorted(by_cat):
+        cat_results = by_cat[cat]
+        lines.append(f" {cat}")
+
+        agg: dict[str, dict[str, list[float]]] = {}
+        for key, _ in LIKERT_QUESTIONS:
+            agg[key] = {"bash": [], "lush": []}
+        for r in cat_results:
+            scores = _get_likert_scores(r)
+            for key in scores:
+                for lang in ("bash", "lush"):
+                    val = scores[key][lang]
+                    if val is not None:
+                        agg[key][lang].append(val)
+
+        for key, label in LIKERT_QUESTIONS:
+            b_vals = agg[key]["bash"]
+            l_vals = agg[key]["lush"]
+            b_avg = sum(b_vals) / len(b_vals) if b_vals else 0.0
+            l_avg = sum(l_vals) / len(l_vals) if l_vals else 0.0
+            diff = l_avg - b_avg
+            diff_str = f"+{diff:.1f}" if diff > 0 else f"{diff:.1f}" if diff < 0 else " 0.0"
+            lines.append(f" {label:<22s} bash={b_avg:.1f} lush={l_avg:.1f} ({diff_str})")
+        lines.append("")
+
+    return "\n".join(lines)
+
+
 def render_per_task_detail(results: list[BenchmarkResult]) -> str:
     """Render per-task questionnaire breakdown."""
     lines: list[str] = []
@@ -183,7 +278,7 @@ def render_per_task_detail(results: list[BenchmarkResult]) -> str:
         lines.append("")
         b_status = "PASS" if r.bash_result and r.bash_result.all_passed else "FAIL"
         l_status = "PASS" if r.lush_result and r.lush_result.all_passed else "FAIL"
-        lines.append(f" {r.task_name} [{r.category}] bash={b_status} lush={l_status}")
+        lines.append(f" {r.task_name} [{r.category}/{r.mode}] bash={b_status} lush={l_status}")
        lines.append("")
 
        scores = _get_likert_scores(r)
@@ -222,7 +317,9 @@ def render_report(results_dir: Path) -> str:
 
     parts = [
         render_summary_table(results),
+        render_per_category_summary(results),
         render_questionnaire_comparison(results),
+        render_per_category_questionnaire(results),
         render_per_task_detail(results),
     ]
     return "\n".join(parts)
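Because mode is stored separately from category, report slices can be taken along either axis. A hypothetical usage snippet — not part of the commit, and assuming the BenchmarkResult sketch above plus an already-loaded results list:

# Slice by mode, then reuse the per-category renderer on each slice.
solve = [r for r in results if r.mode == "solve"]
convert = [r for r in results if r.mode == "convert"]
print(render_per_category_summary(solve))    # per-category table for solve tasks only
print(render_per_category_summary(convert))  # ...and the same table for convert tasks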