# Task definition: normalize URLs read from stdin (see description below).
name = "url_normalizer"
category = "pipeline"
mode = "convert"
description = """
Read URLs from stdin, one per line. Normalize each URL:
1. If the URL already starts with "https://", keep it as-is.
2. If it starts with "http://", keep it as-is.
3. Otherwise, prepend "http://" to it.
4. After normalization, validate that the URL matches a basic pattern:
   it must have a protocol (http:// or https://), followed by at least
   one character, a dot, and at least one more character for the domain.
5. Output the normalized URL, or "INVALID: " for invalid entries.
Skip empty lines silently.
"""

# Multi-line literal string: bash source is kept verbatim (no TOML escaping).
bash_source = '''
#!/bin/bash
# The `|| [[ -n "$line" ]]` clause processes a final line that lacks a
# trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines
    [[ -z "$line" ]] && continue

    # Trim leading/trailing whitespace
    url=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    [[ -z "$url" ]] && continue

    # Keep the pre-normalization form for the INVALID message.
    original="$url"

    # Keep URLs that already carry a protocol; otherwise default to http://.
    # Native [[ ... == pattern* ]] glob matching replaces the former
    # echo | cut prefix probing (same behavior, no subprocesses).
    if [[ "$url" == https://* || "$url" == http://* ]]; then
        normalized="$url"
    else
        normalized="http://$url"
    fi

    # Validate: protocol + something.something (at least one dot in the
    # authority part, before any path slash).
    if echo "$normalized" | grep -qE '^https?://[^/]+\.[^/]+'; then
        echo "$normalized"
    else
        echo "INVALID: $original"
    fi
done
'''

[[test_cases]]
description = "URLs with and without protocol"
stdin = """example.com
http://example.com
https://example.com
www.google.com/search?q=test"""
expected_stdout = """http://example.com
http://example.com
https://example.com
http://www.google.com/search?q=test"""

[[test_cases]]
description = "Invalid entries"
stdin = """notaurl
https://valid.example.com
just-a-word"""
expected_stdout = """INVALID: notaurl
https://valid.example.com
INVALID: just-a-word"""

[[test_cases]]
description = "Mixed valid and empty lines"
stdin = """https://secure.site.org/path

api.service.io:8080

http://old.site.net"""
expected_stdout = """https://secure.site.org/path
http://api.service.io:8080
http://old.site.net"""