#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
set -euo pipefail
# check-docs-urls.sh
# Extract external URLs from docs/ using git grep, clean them, de-duplicate,
# and check HTTP status codes with curl. Writes results to a status file.
OUT_LIST=${1:-/tmp/docs-urls.txt}
OUT_STATUS=${2:-/tmp/docs-url-status.txt}
usage() {
cat <<EOF
Usage: $0 [URL_LIST_OUT] [STATUS_OUT]
Extract external URLs from docs/, dedupe and clean them, then check each URL
with curl. Defaults:
URL_LIST_OUT = /tmp/docs-urls.txt
STATUS_OUT = /tmp/docs-url-status.txt
Examples:
$0
$0 /tmp/my-urls.txt /tmp/my-status.txt
EOF
}
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
usage
exit 0
fi
command -v curl >/dev/null 2>&1 || { echo "ERROR: curl not found in PATH" >&2; exit 2; }
# Extract likely URLs. Pattern stops at whitespace, angle bracket or quote/paren to avoid trailing HTML tags.
# Then strip trailing punctuation like ,.;:)\"' and any accidental trailing angle brackets.
git grep 'https*://' docs \
| sed -e 's|^.*http|http|; s/["`'"'"')< ].*$//' \
| sort -u > "$OUT_LIST"
echo "Found $(wc -l < "$OUT_LIST") unique urls (written to $OUT_LIST)"
# Check each URL with curl (follows redirects). Output: HTTP_CODE URL
: > "$OUT_STATUS"
while read -r url; do
[[ -z "$url" ]] && continue
# Use a reasonable timeout and follow redirects
code=$(curl -sS -L -o /dev/null -w "%{http_code}" --max-time 3 "$url" || echo "000")
printf "%s %s\n" "$code" "$url" >> "$OUT_STATUS"
done < "$OUT_LIST"
echo "Wrote status results to $OUT_STATUS"
# Show non-2xx/3xx entries
echo "Non-OK results (not 2xx/3xx):"
grep -E "^[^23]" "$OUT_STATUS" || true
exit 0
|