# Benchmark harness configuration: LLM agents solve shell scripting tasks in
# both Bash and Lush, then correctness and code quality are compared.
#
# Harness features:
# - CLI with run, run-all, list-tasks, report, and export commands
# - Agent loop with retry support via Anthropic Claude provider
# - Test harness executing solutions in sandboxed subprocesses
# - LLM-driven questionnaire for subjective code quality evaluation
# - HTML report export with charts (matplotlib)
# - 8 Category A tasks (write-from-scratch in both languages)
# - 4 Category B tasks (verify provided Bash, convert to Lush)
# - Lush language reference for agent context
[lush]
# Absolute path to the Lush interpreter binary used to execute solutions.
# NOTE(review): machine-specific path — adjust per checkout/host.
binary = "/Users/nik/Code/20251000_lush/lush"
[agent]
# Maximum retry attempts per task when the agent's solution fails its tests.
max_retries = 3

# Wall-clock limit for each sandboxed solution run, in seconds.
timeout_seconds = 10

# Collapse whitespace differences when comparing solution output against
# expected output (presumably trailing/inter-token whitespace — verify in
# the test harness).
normalize_whitespace = true
[results]
# Directory where benchmark run results are written (relative to the
# working directory).
output_dir = "results"
[anthropic]
# Name of the environment variable holding the Anthropic API key; the key
# itself is never stored in this file.
api_key_env = "ANTHROPIC_API_KEY"

# Model identifier passed to the Anthropic API.
model = "claude-sonnet-4-20250514"

# Upper bound on tokens generated per model response.
max_tokens = 4096