Reorganize task categories from opaque a/b to descriptive names

Replace the category_a/category_b directories with descriptively named ones: algorithm, pipeline, environment, filesystem, and process. Add a separate mode field (solve/convert) to decouple orchestration from capability grouping. Add per-category summary and questionnaire breakdowns to both the terminal report and the HTML export.
2026-03-29 20:59:01 +01:00
parent be8d657b24
commit 20e62f60f6
18 changed files with 487 additions and 167 deletions
--- a/lush_bench/models.py
+++ b/lush_bench/models.py
@@ -36,15 +36,17 @@ class TestCase:
@dataclass
 class Task:
    name: str
-    category: str  # "a" or "b"
+    category: str  # "algorithm", "pipeline", "environment", "filesystem", "process"
    description: str
    test_cases: list[TestCase]
-    bash_source: str | None = None  # category B only
+    mode: str = "solve"  # "solve" or "convert"
+    bash_source: str | None = None  # convert mode only

    def to_dict(self) -> dict[str, Any]:
        d: dict[str, Any] = {
            "name": self.name,
            "category": self.category,
+            "mode": self.mode,
            "description": self.description,
            "test_cases": [tc.to_dict() for tc in self.test_cases],
        }
@@ -59,6 +61,7 @@ class Task:
            category=d["category"],
            description=d["description"],
            test_cases=[TestCase.from_dict(tc) for tc in d["test_cases"]],
+            mode=d.get("mode", "solve"),
            bash_source=d.get("bash_source"),
        )

@@ -180,16 +183,18 @@ class QuestionnaireResponse:
 class BenchmarkResult:
    task_name: str
    category: str
-    provider: str
-    model: str
-    timestamp: str
-    bash_result: LanguageResult | None
-    lush_result: LanguageResult | None
+    mode: str = "solve"  # "solve" or "convert"
+    provider: str = ""
+    model: str = ""
+    timestamp: str = ""
+    bash_result: LanguageResult | None = None
+    lush_result: LanguageResult | None = None

    def to_dict(self) -> dict[str, Any]:
        return {
            "task_name": self.task_name,
            "category": self.category,
+            "mode": self.mode,
            "provider": self.provider,
            "model": self.model,
            "timestamp": self.timestamp,
@@ -202,6 +207,7 @@ class BenchmarkResult:
        return cls(
            task_name=d["task_name"],
            category=d["category"],
+            mode=d.get("mode", "solve"),
            provider=d["provider"],
            model=d["model"],
            timestamp=d["timestamp"],