File: _check.py

from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Cache external pages for the duration of the runtime,
# so that we don't request them multiple times needlessly
sess = requests.Session()
external_site_ids = {}


def check_link_targets(root: Path) -> int:
    """Validate links in HTML site at root, return number of links found."""
    site_docs = {
        p: BeautifulSoup(p.read_text("utf-8"), "html.parser")
        for p in root.glob("**/*.html")
    }
    site_ids = {k: gather_ids(v) for k, v in site_docs.items()}

    total = 0
    for doc, soup in site_docs.items():
        for link in soup.find_all("a", attrs={"class": "sphinx-codeautolink-a"}):
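            # Autolink hrefs are expected to contain exactly one "#fragment";
            # split it into the target page and the element ID to look up.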
            base, id_ = link["href"].split("#")
            if any(base.startswith(s) for s in ("http://", "https://")):
                if base not in external_site_ids:
                    sub_soup = BeautifulSoup(sess.get(base).text, "html.parser")
                    external_site_ids[base] = gather_ids(sub_soup)
                ids = external_site_ids[base]
            else:
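                # Relative link: resolve the target page against the directory
                # of the document that contains the link.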
                target_path = (doc.parent / base).resolve()
                if target_path.is_dir():
                    target_path /= "index.html"
                assert target_path.exists(), (
                    f"Target path {target_path!s} not found while validating"
                    f" link for `{link.string}` in {doc.relative_to(root)!s}!"
                )
                ids = site_ids[target_path]

            assert id_ in ids, (
                f"ID {id_} not found in {base} while validating link"
                f" for `{link.string}` in {doc.relative_to(root)!s}!"
            )
            total += 1
    return total


def check_reference_targets_exist(root: Path):
    """Validate that internal reference link targets in the HTML site at root exist."""
    site_docs = {
        p: BeautifulSoup(p.read_text("utf-8"), "html.parser")
        for p in root.glob("**/*.html")
    }
    for doc, soup in site_docs.items():
        for link in soup.find_all("a", attrs={"class": "reference internal"}):
            base = link["href"].split("#")[0]
            if any(base.startswith(s) for s in ("http://", "https://")):
                continue
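            # An empty base means the href was only a fragment, i.e. a
            # same-page link; otherwise resolve the target relative to the
            # linking document's directory.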
            target_path = doc if base == "" else (doc.parent / base).resolve()
            if target_path.is_dir():
                target_path /= "index.html"
            assert target_path.exists(), (
                f"Target path {target_path!s} not found while validating"
                f" link for `{link.string}` in {doc.relative_to(root)!s}!"
            )


def gather_ids(soup: BeautifulSoup) -> set:
    """Gather all HTML IDs from a given page."""
    return {tag["id"] for tag in soup.find_all(id=True)}
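

if __name__ == "__main__":
    # Hypothetical usage sketch (not part of the upstream module): run both
    # checks against a built HTML site. The default build directory below is
    # an assumption; point it at wherever sphinx-build wrote the site.
    import sys

    build_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("docs/_build/html")
    links = check_link_targets(build_dir)
    check_reference_targets_exist(build_dir)
    print(f"Validated {links} code autolink targets in {build_dir}.")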