File: check-docs-urls.sh

package info (click to toggle)
systemd-udeb 260-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 114,360 kB
  • sloc: ansic: 741,727; xml: 122,306; python: 35,714; sh: 35,154; cpp: 947; awk: 126; makefile: 89; lisp: 13; sed: 1
file content (58 lines) | stat: -rwxr-xr-x 1,705 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later

set -euo pipefail

# check-docs-urls.sh
# Extract external URLs from docs/ using git grep, clean them, de-duplicate,
# and check HTTP status codes with curl. Writes results to a status file.

# Destination files: each positional argument, when given and non-empty,
# overrides the corresponding default under /tmp.
OUT_LIST="${1:-/tmp/docs-urls.txt}"            # $1: where the URL list is written
OUT_STATUS="${2:-/tmp/docs-url-status.txt}"    # $2: where the per-URL status lines are written

# Print the command-line help text to stdout.
# The script's own invocation name ($0) is interpolated into the synopsis and
# the examples.
usage() {
    local self=$0
    printf '%s\n' \
        "Usage: $self [URL_LIST_OUT] [STATUS_OUT]" \
        '' \
        'Extract external URLs from docs/, dedupe and clean them, then check each URL' \
        'with curl. Defaults:' \
        '  URL_LIST_OUT = /tmp/docs-urls.txt' \
        '  STATUS_OUT   = /tmp/docs-url-status.txt' \
        '' \
        'Examples:' \
        "  $self" \
        "  $self /tmp/my-urls.txt /tmp/my-status.txt"
}

# Help request: only the first argument is inspected.
case "${1:-}" in
    -h | --help)
        usage
        exit 0
        ;;
esac

# curl does the actual URL checking, so bail out early (status 2) without it.
if ! command -v curl >/dev/null 2>&1; then
    echo "ERROR: curl not found in PATH" >&2
    exit 2
fi

# Print every http(s) URL found in the text on stdin, one per line, in input order.
# A URL ends at whitespace, a quote or backtick, ")" or an angle bracket, so
# Markdown links, <autolinks> and HTML attributes yield the bare URL. Sentence
# punctuation (. , ; :) left at the very end of a match is then dropped.
# Every URL on a line is reported, not just the last one.
docs_urls_from_text() {
    local url_re="https?://[^[:space:]\"\`')<>]+"
    # grep exits 1 when nothing matches; that only means "no URLs here".
    { grep -oE -- "$url_re" || (( $? == 1 )); } \
        | sed -e 's/[.,;:]*$//'
}

# Collect the unique URLs referenced under docs/.
# -I skips binary files and -h drops the "file:" prefix. git grep's "no match"
# status (1) is tolerated so an empty result does not abort the script under
# `set -e -o pipefail`; any other git failure still does.
{ git grep -I -h -E 'https?://' -- docs || (( $? == 1 )); } \
    | docs_urls_from_text \
    | sort -u > "$OUT_LIST"

echo "Found $(wc -l < "$OUT_LIST") unique urls (written to $OUT_LIST)"

# Print the HTTP status code curl ends up with for URL $1 after following
# redirects. curl emits the -w text even when the transfer fails (as "000" when
# no response was received), so its output is used as-is; "000" is substituted
# only when curl printed nothing at all. Appending a second "000" on failure
# would turn the code into "000000".
fetch_http_code() {
    local code
    # --globoff: pass {} and [] in a URL through literally instead of treating
    #            them as curl glob ranges.
    # --max-time 3: give up on a URL after three seconds.
    code=$(curl -sS -L --globoff -o /dev/null -w '%{http_code}' --max-time 3 "$1") || true
    printf '%s\n' "${code:-000}"
}

# Check each URL with curl (follows redirects). Output: HTTP_CODE URL
# The status file is opened (and truncated) once for the whole loop.
while read -r url; do
    [[ -n "$url" ]] || continue
    printf '%s %s\n' "$(fetch_http_code "$url")" "$url"
done < "$OUT_LIST" > "$OUT_STATUS"

printf '%s\n' "Wrote status results to $OUT_STATUS"

# List every status line whose first character is neither 2 nor 3.
printf '%s\n' "Non-OK results (not 2xx/3xx):"
# grep's non-zero status when no line matches must not fail the script.
grep -E '^[^23]' "$OUT_STATUS" || :

exit 0