1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
"""
Module for processing Sitemaps.
Note: The main purpose of this module is to provide support for the
SitemapSpider, its API is subject to change without notice.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from urllib.parse import urljoin
import lxml.etree
if TYPE_CHECKING:
from collections.abc import Iterable, Iterator
class Sitemap:
"""Class to parse Sitemap (type=urlset) and Sitemap Index
(type=sitemapindex) files"""
def __init__(self, xmltext: str | bytes):
xmlp = lxml.etree.XMLParser(
recover=True, remove_comments=True, resolve_entities=False
)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp) # noqa: S320
rt = self._root.tag
assert isinstance(rt, str)
self.type = rt.split("}", 1)[1] if "}" in rt else rt
def __iter__(self) -> Iterator[dict[str, Any]]:
for elem in self._root.getchildren():
d: dict[str, Any] = {}
for el in elem.getchildren():
tag = el.tag
assert isinstance(tag, str)
name = tag.split("}", 1)[1] if "}" in tag else tag
if name == "link":
if "href" in el.attrib:
d.setdefault("alternate", []).append(el.get("href"))
else:
d[name] = el.text.strip() if el.text else ""
if "loc" in d:
yield d
def sitemap_urls_from_robots(
robots_text: str, base_url: str | None = None
) -> Iterable[str]:
"""Return an iterator over all sitemap urls contained in the given
robots.txt file
"""
for line in robots_text.splitlines():
if line.lstrip().lower().startswith("sitemap:"):
url = line.split(":", 1)[1].strip()
yield urljoin(base_url or "", url)
|