# File: example.py -- from Debian package python-url-matcher 0.6.0-2
"""
Example of usage of the URLMatcher library
"""

from __future__ import annotations

import dataclasses
import random
import time
from collections import Counter

from url_matcher import Patterns, URLMatcher
from url_matcher.matcher import IncludePatternsWithoutDomainError

matcher = URLMatcher()

# Rule for books.toscrape.com product pages: everything under /catalogue/
# except the category listings.
book_product = Patterns(include=["books.toscrape.com/catalogue/"], exclude=["/catalogue/category/"])
matcher.add_or_update("books product", book_product)

# Rule for the product listings: the category tree, the site root and the
# index page (the trailing "|" presumably anchors the pattern at the end of
# the URL -- confirm against the url_matcher docs).
book_listing = Patterns(
    include=["books.toscrape.com/catalogue/category/", "books.toscrape.com/|", "books.toscrape.com/index.html|"]
)
matcher.add_or_update("books productList", book_listing)


# Smoke-test the two rules above.

probe = "https://books.toscrape.com/catalogue/soumission_998/index.html"
assert matcher.match(probe) == "books product"

probe = "https://books.toscrape.com/catalogue/category/books/fiction_10/index.html"
assert matcher.match(probe) == "books productList"

# An unrelated domain matches nothing.
assert not matcher.match("https://amazon.com")

# Include patterns without a domain are rejected.

try:
    matcher.add_or_update("won't work", Patterns(["/path"]))
except IncludePatternsWithoutDomainError:
    pass
else:
    raise AssertionError

# The empty pattern is special: it matches any URL at all.

assert URLMatcher({"Anything": Patterns([""])}).match("http://anything")

# Priorities break ties; they only come into play when several rules
# match the same URL.

tie_breaker = Patterns(["priority.com"])
matcher.add_or_update("low priority", dataclasses.replace(tie_breaker, priority=200))
matcher.add_or_update("high priority", dataclasses.replace(tie_breaker, priority=300))
assert matcher.match("http://priority.com") == "high priority"

# Swap the priorities around: the rule *named* "low priority" now wins.

matcher.add_or_update("low priority", dataclasses.replace(tie_breaker, priority=300))
matcher.add_or_update("high priority", dataclasses.replace(tie_breaker, priority=200))
assert matcher.match("http://priority.com") == "low priority"

# Let's check the speed creating patterns for many domains and matching
# urls for these domains.


def add_patterns(domain: int) -> None:
    """Register a product rule and a productList rule for a numeric fake domain.

    Mirrors the books.toscrape.com rules above, with ``domain`` standing in
    for the host name. Mutates the module-level ``matcher``.
    """
    product = Patterns(
        include=[f"{domain}/catalogue/?param=book"],
        exclude=["/catalogue/category/"],
    )
    matcher.add_or_update(f"{domain} product", product)

    product_list = Patterns(
        include=[
            f"{domain}/catalogue/category/?param=book_list",
            f"{domain}/",
            f"{domain}/index.html",
        ]
    )
    matcher.add_or_update(f"{domain} productList", product_list)


N_DOMAINS = 500
N_URLS = 300

# Template URLs; "books.toscrape.com" gets substituted with a numeric fake
# domain below.
URLS = [
    "https://books.toscrape.com/catalogue/soumission_998/index.html?param=book&p1=23&p2=45",
    "https://books.toscrape.com/catalogue/category/books/fiction_10/index.html?param=book_list&p5=23&p6=45",
]

# Register the two rules for each of the N_DOMAINS fake domains.
for idx in range(N_DOMAINS):
    add_patterns(idx)

# Build a random sample of (domain, url) pairs to benchmark the matcher with.
urls = []
for _ in range(N_URLS):
    url = random.choice(URLS)
    domain = random.randint(0, N_DOMAINS - 1)
    url = url.replace("books.toscrape.com", f"{domain}")
    urls.append((domain, url))

# Time the matching. Every URL must resolve to a rule belonging to its own
# domain. (The original also tallied matches in an unused Counter; that dead
# bookkeeping has been dropped to keep the timed loop clean.)
start = time.perf_counter()
for domain, url in urls:
    match = matcher.match(url)
    assert match and f"{domain}" in match
end = time.perf_counter()

# It took in my machine ~ 0.04 millis per URL
print(f"{((end - start) / N_URLS) * 1000:.3f} milliseconds per URL. Total {end - start} seconds to match {N_URLS} URLs")

print("Everything worked fine!")