Files
lush_grading/tasks/pipeline/url_normalizer.toml
Cormac Shannon 18ce7e57cf Revamp questionnaire, parallelize run-all, add new tasks
- Replace 6 compound Likert questions with 12 atomic ones grouped by
  dimension (syntax, expressiveness, data/IO, errors, overall); drop
  free-form question. Responses now stored as ints, not strings.
- Back-compat layer maps legacy keys to new dimensions so existing
  results still render.
- Parallelize run-all with ThreadPoolExecutor (configurable workers)
  and add a thread-safe min-request-interval rate limiter to the
  Anthropic provider.
- Add new tasks: path_normalizer, todo_manager, currency_converter,
  locale_weather_url, network_info_parser, url_normalizer.
2026-04-07 19:07:21 +01:00

80 lines
2.0 KiB
TOML

name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
it must have a protocol (http:// or https://), followed by at least
one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: <original>" for invalid entries.
Skip empty lines silently.
"""
bash_source = '''
#!/bin/bash
# Normalize URLs read from stdin, one per line:
#   - keep http:// and https:// URLs as-is
#   - prepend http:// to anything else
#   - print "INVALID: <original>" when the result lacks a dotted domain
# Empty / whitespace-only lines are skipped silently.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Trim leading/trailing whitespace with parameter expansion
  # (avoids spawning a sed subshell per input line).
  url="${line#"${line%%[![:space:]]*}"}"
  url="${url%"${url##*[![:space:]]}"}"
  # Skip lines that are empty after trimming.
  [[ -z "$url" ]] && continue
  original="$url"
  # Prefix checks via [[ ]] glob match instead of echo|cut subshells.
  if [[ "$url" == https://* || "$url" == http://* ]]; then
    normalized="$url"
  else
    normalized="http://$url"
  fi
  # Validate: protocol followed by <something>.<something> before any slash,
  # using bash's builtin regex match instead of echo|grep.
  if [[ "$normalized" =~ ^https?://[^/]+\.[^/]+ ]]; then
    printf '%s\n' "$normalized"
  else
    printf '%s\n' "INVALID: $original"
  fi
done
'''
[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""
[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""
[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path
api.service.io:8080
http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""