Initial commit: Lush vs Bash AI benchmarking framework

Benchmark harness that uses LLM agents to solve shell scripting tasks in both Bash and Lush, then compares correctness and code quality.

- CLI with run, run-all, list-tasks, report, and export commands
- Agent loop with retry support via the Anthropic Claude provider
- Test harness that executes solutions in sandboxed subprocesses
- LLM-driven questionnaire for subjective code quality evaluation
- HTML report export with charts (matplotlib)
- 8 Category A tasks (write from scratch in both languages)
- 4 Category B tasks (verify the provided Bash, then convert it to Lush)
- Lush language reference for agent context
2026-03-29 17:56:30 +01:00
commit be8d657b24
33 changed files with 3302 additions and 0 deletions
--- a/tasks/category_b/log_parser.toml
+++ b/tasks/category_b/log_parser.toml
@@ -0,0 +1,33 @@
+name = "log_parser"
+category = "b"  # Category B task: verify the provided Bash, then convert it to Lush
+description = """
+Read log lines from stdin. Each line has the format: "LEVEL: message"
+where LEVEL is one of ERROR, WARN, INFO.
+Count occurrences of each level and print a summary sorted by level name.
+Format: "LEVEL: count"
+"""
+
+bash_source = """
+#!/bin/bash
+while IFS= read -r line || [[ -n "$line" ]]; do
+    echo "${line%%:*}"
+done | sort | uniq -c | while read -r count level; do
+    echo "$level: $count"
+done
+"""  # ${line%%:*} keeps the text before the first ':'; sort | uniq -c then counts each level in sorted order
+
+[[test_cases]]  # mixed levels; stdin lacks a trailing newline, so the last line relies on the [[ -n "$line" ]] guard
+stdin = """ERROR: disk full
+INFO: started
+WARN: low memory
+ERROR: timeout
+INFO: completed"""
+expected_stdout = """ERROR: 2
+INFO: 2
+WARN: 1"""
+
+[[test_cases]]  # a single level repeated on every line
+stdin = """INFO: boot
+INFO: ready
+INFO: shutdown"""
+expected_stdout = "INFO: 3"