Files
lush_grading/tasks/pipeline/url_normalizer.toml
Cormac Shannon 18ce7e57cf Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
2026-04-07 19:07:21 +01:00

80 lines
2.0 KiB
TOML

name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
it must have a protocol (http:// or https://), followed by at least
one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.
Skip empty lines silently.
"""
bash_source = '''
#!/bin/bash
# Normalize URLs read from stdin, one per line:
#   - keep http:// and https:// URLs as-is
#   - prepend http:// to anything else
#   - print "INVALID: <original>" when the result lacks a dotted domain
# Empty / whitespace-only lines are skipped silently.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Trim leading/trailing whitespace with parameter expansion
  # (avoids spawning a sed subshell per input line).
  url="${line#"${line%%[![:space:]]*}"}"
  url="${url%"${url##*[![:space:]]}"}"
  # Skip lines that are empty after trimming.
  [[ -z "$url" ]] && continue
  original="$url"
  # Prefix checks via [[ ]] glob match instead of echo|cut subshells.
  if [[ "$url" == https://* || "$url" == http://* ]]; then
    normalized="$url"
  else
    normalized="http://$url"
  fi
  # Validate: protocol followed by <something>.<something> before any slash,
  # using bash's builtin regex match instead of echo|grep.
  if [[ "$normalized" =~ ^https?://[^/]+\.[^/]+ ]]; then
    printf '%s\n' "$normalized"
  else
    printf '%s\n' "INVALID: $original"
  fi
done
'''
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path
api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""