1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
|
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from __future__ import annotations
import html
import logging
import os
import re
import subprocess
import sys
import time
import traceback
from urllib import parse
import lxml
from . import AppName, http
from .configuration import App, SupportUrl
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
# The character set used to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
# encode in UTF-8 when no encoding is specified by the HTTP headers,
# else they use the page encoding for followed links. See also
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
UrlEncoding = "utf-8"
def get_system_uid():
    """Return a (probably) unique ID identifying this system.

    Used to differentiate votes. Windows and macOS use a platform specific
    lookup; every other platform, and any failure of the platform specific
    lookup, falls back to the MAC address based ID.
    """
    if os.name == 'nt':
        platform_lookup = get_nt_system_uid
    elif sys.platform == 'darwin':
        platform_lookup = get_osx_system_uid
    else:
        # No dedicated lookup for this platform.
        return get_mac_uid()
    try:
        return platform_lookup()
    except Exception:
        # Best effort: any error in the platform lookup means "use the MAC".
        return get_mac_uid()
def get_nt_system_uid():
    r"""Read the MachineGuid value stored under
    HKEY_LOCAL_MACHINE\Software\Microsoft\Cryptography\MachineGuid

    Both the registry connection and the opened key are closed again,
    whether or not the lookup succeeds.
    """
    import winreg
    registry = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
    try:
        crypto_key = winreg.OpenKey(registry, r"Software\Microsoft\Cryptography")
        try:
            # QueryValueEx returns a sequence; its first item is the value.
            machine_guid = winreg.QueryValueEx(crypto_key, "MachineGuid")[0]
        finally:
            crypto_key.Close()
    finally:
        registry.Close()
    return machine_guid
def get_osx_system_uid():
    """Get the OSX system ID.

    Runs ``system_profiler`` and scans its output for the serial number
    line, equivalent to:

    $ system_profiler |grep "r (system)"
          Serial Number (system): C24E1322XYZ

    @return: the text after the first colon of the matching line, stripped
    @rtype: string
    @raise: ValueError if no usable line containing "r (system)" is found
    """
    # The executable is named "system_profiler"; the previous spelling
    # "system_profile" does not exist, so the lookup could never succeed.
    res = backtick(["system_profiler"]).splitlines()
    for line in res:
        if "r (system)" not in line:
            continue
        _label, colon, serial = line.partition(':')
        # Skip a matching line without "label: value" shape instead of
        # failing with an IndexError.
        if colon:
            return serial.strip()
    raise ValueError("Could not find system number in %r" % res)
def get_mac_uid():
    """Return the hardware (MAC) address of the system as a decimal string."""
    from uuid import getnode
    node = getnode()
    return "%d" % node
def backtick(cmd, encoding='utf-8'):
    """Run a command and return its standard output as decoded text.

    @param cmd: the command to run, as accepted by subprocess.Popen
    @param encoding: the encoding used to decode the captured output
    @ptype encoding: string
    @return: the decoded standard output of the command
    @rtype: string
    """
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # Only stdout is piped, so the stderr half of the result is unused.
    captured, _unused_stderr = process.communicate()
    return captured.decode(encoding)
def unicode_safe(text, encoding=UrlEncoding, errors='ignore'):
    """Return text as a str, decoding it first if it is not one already.

    @param text: the value to convert
    @param encoding: encoding used for decoding non-str input
    @param errors: error handling scheme passed to decode()
    @return: text unchanged if it is a str, else the decoded text
    """
    return text if isinstance(text, str) else text.decode(encoding, errors)
def tagre(tag, attribute, value, quote='"', before="", after=""):
    """Build a regular expression that matches an HTML tag carrying the
    given attribute and value. Tag and attribute names are matched case
    insensitive; arbitrary whitespace and leading HTML attributes are
    skipped. The "<>" at the start and end of the HTML tag is also matched.
    @param tag: the tag name
    @ptype tag: string
    @param attribute: the attribute name
    @ptype attribute: string
    @param value: the attribute value, inserted into the pattern verbatim
    @ptype value: string
    @param quote: the attribute quote (default ")
    @ptype quote: string
    @param before: match inside the tag before the attribute
    @ptype before: string
    @param after: match after attribute value but before end
    @ptype after: string
    @return: the generated regular expression suitable for re.compile()
    @rtype: string
    """
    # Without "before", any leading attributes are optional; with it, the
    # given pattern has to show up ahead of the attribute.
    leading = r"[^>]*%s[^>]*\s+" % before if before else r"(?:[^>]*\s+)?"
    template = (
        r'<\s*%(tag)s\s+%(prefix)s'
        r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s'
        r'[^>]*%(after)s[^>]*>'
    )
    return template % {
        'tag': case_insensitive_re(tag),
        'attribute': case_insensitive_re(attribute),
        'value': value,
        'quote': quote,
        'prefix': leading,
        'after': after,
    }
def case_insensitive_re(name):
    """Turn the given name into a regular expression string that matches it
    regardless of case, without relying on re.IGNORECASE. This allows only
    selected parts of a larger pattern to be case insensitive.
    @param name: the name to make case insensitive
    @ptype name: string
    @return: the case insensitive regex
    @rtype: string
    """
    char_classes = []
    for char in name:
        # One character class per character: lower case first, then upper.
        char_classes.append("[" + char.lower() + char.upper() + "]")
    return "".join(char_classes)
def get_page(url, session, **kwargs):
    """Fetch the given URL and return the response object.

    The robots.txt check runs first; the download is limited to
    MaxContentBytes. Extra keyword arguments are passed on to urlopen().
    """
    http.check_robotstxt(url, session)
    # read page data
    response = urlopen(
        url, session, max_content_bytes=MaxContentBytes, **kwargs)
    logger.trace("Got page content %r", response.content)
    return response
def makeSequence(item):
    """Return item unchanged when it is a list or tuple; otherwise wrap it
    in a one-element tuple."""
    return item if isinstance(item, (list, tuple)) else (item,)
def prettyMatcherList(things):
    """Build a nicely-formatted string for one matcher or a list of
    matchers. Objects with a "pattern" attribute (such as compiled regular
    expressions) are shown by that pattern, anything else as is."""
    patterns = [
        getattr(matcher, 'pattern', matcher)
        for matcher in makeSequence(things)
    ]
    return "('%s')" % "', '".join(patterns)
def normaliseURL(url):
    """Normalise an URL. Normalising
     - strips any leading or trailing whitespace,
     - replaces HTML entities and character references,
     - removes leading empty and ".." path segments,
     - removes a leading "&" from the query,
     - removes the anchor (fragment).
    """
    # XXX: brutal hack
    cleaned = html.unescape(unicode_safe(url).strip())
    scheme, netloc, path, params, query, _fragment = parse.urlparse(cleaned)
    segments = path.split('/')
    while segments and segments[0] in ('', '..'):
        segments.pop(0)
    path = '/' + '/'.join(segments)
    # remove leading '&' from query
    if query.startswith('&'):
        query = query[1:]
    # the anchor is dropped by passing an empty fragment
    return parse.urlunparse((scheme, netloc, path, params, query, ""))
def urlopen(url, session, referrer=None, max_content_bytes=None,
            allow_errors=(), **kwargs):
    """Request the given URL through the session and return the response.

    A POST request is sent when "data" is among the keyword arguments,
    a GET request otherwise. The remaining keyword arguments are handed to
    session.request().

    @param referrer: if set, sent as the "Referer" header
    @param max_content_bytes: if set, limit for the announced content length
    @param allow_errors: status codes that must not raise an error
    @return: the response object
    """
    logger.debug('Open URL %r', url)
    headers = kwargs.setdefault('headers', {})
    if referrer:
        headers['Referer'] = referrer
    logger.trace('Sending headers %r', headers)
    logger.debug('Sending cookies %r', session.cookies)
    if 'data' in kwargs:
        method = 'POST'
        logger.trace('Sending POST data %r', kwargs['data'])
    else:
        method = 'GET'
    response = session.request(method, url, **kwargs)
    logger.debug('Response cookies: %r', response.cookies)
    # The size check comes before the status check, as in the request order.
    check_content_size(url, response.headers, max_content_bytes)
    if response.status_code not in allow_errors:
        response.raise_for_status()
    return response
def check_content_size(url, headers, max_content_bytes):
    """Verify that the content length announced in the response headers
    stays within the given maximum number of bytes.

    Nothing is checked when no maximum is given or the headers carry no
    'content-length' entry.

    @raise: IOError if the announced size exceeds max_content_bytes
    """
    if not max_content_bytes or 'content-length' not in headers:
        return
    announced = int(headers['content-length'])
    if announced <= max_content_bytes:
        return
    raise IOError(
        'URL content of %s with %d bytes exceeds %d bytes.' %
        (url, announced, max_content_bytes))
def splitpath(path):
    """Split a path into the list of its components."""
    components = []
    remainder, leaf = os.path.split(path)
    while leaf:
        # Components are collected last-to-first and reversed at the end.
        components.append(leaf)
        remainder, leaf = os.path.split(remainder)
    components.reverse()
    return components
def getRelativePath(basepath, path):
    """Get a path that is relative to the given base path.

    Both arguments are made absolute first, so relative input is resolved
    against the current working directory.

    @param basepath: the directory the result is relative to
    @ptype basepath: string
    @param path: the path to express relative to basepath
    @ptype path: string
    @return: the relative path; os.curdir if both paths are identical
    @rtype: string
    """
    basepath = os.path.abspath(basepath)
    path = os.path.abspath(path)
    try:
        # os.path.relpath also covers the cases the former hand-written
        # component walk crashed on: path equal to basepath, and path being
        # an ancestor of basepath.
        return os.path.relpath(path, basepath)
    except ValueError:
        # Raised when no relative path exists (e.g. different drives on
        # Windows); the absolute path is the only usable answer then.
        return path
def getQueryParams(url):
    """Parse the query string of an URL.

    @return: the result of parse_qs() on the query part of the URL
    """
    query_string = parse.urlsplit(url).query
    logger.debug(
        'Extracting query parameters from %r (%r)...', url, query_string)
    return parse.parse_qs(query_string)
def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
    """Print an internal error report (output defaults to stderr).

    The report consists of a bug report request, the exception with its
    traceback, and application, proxy and locale information.

    @param out: the file object to print to
    @param etype: exception type, defaults to the one from sys.exc_info()
    @param evalue: exception value, defaults to the one from sys.exc_info()
    @param tb: traceback, defaults to the one from sys.exc_info()
    """
    print(os.linesep, file=out)
    print(f"""********** Oops, I did it again. *************
You have found an internal error in {AppName}. Please write a bug report
at {SupportUrl} and include at least the information below:
Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""", file=out)
    # Fill every missing piece from the exception currently being handled.
    current_type, current_value, current_tb = sys.exc_info()
    if etype is None:
        etype = current_type
    if evalue is None:
        evalue = current_value
    if tb is None:
        tb = current_tb
    print(etype, evalue, file=out)
    traceback.print_exception(etype, evalue, tb, None, out)
    for print_info in (print_app_info, print_proxy_info, print_locale_info):
        print_info(out=out)
    print(os.linesep,
          f"******** {AppName} internal error, over and out ********",
          file=out)
def print_env_info(key, out=sys.stderr):
    """Print "key = value" for the given environment variable, or nothing
    at all if the variable is not defined."""
    value = os.environ.get(key)
    if value is None:
        return
    print(key, "=", repr(value), file=out)
def print_proxy_info(out=sys.stderr):
    """Print the proxy setting taken from the environment, if defined."""
    proxy_key = "http_proxy"
    print_env_info(proxy_key, out=out)
def print_locale_info(out=sys.stderr):
    """Print the locale related environment variables that are defined."""
    locale_keys = ("LANGUAGE", "LC_ALL", "LC_CTYPE", "LANG")
    for locale_key in locale_keys:
        print_env_info(locale_key, out=out)
def print_app_info(out=sys.stderr):
    """Print system and application info (output defaults to stderr).

    Printed are the application identification, the Python version and
    platform, the libxml2 version used by lxml, the local time and the
    command line arguments.

    @param out: the file object to print to
    """
    print("System info:", file=out)
    print(App, file=out)
    print("Python %(version)s on %(platform)s" %
          {"version": sys.version, "platform": sys.platform}, file=out)
    # A plain "import lxml" does not load the etree submodule, so
    # "lxml.etree" only exists if some other module imported it before.
    # Import it explicitly; this function runs while reporting errors and
    # must not depend on import order.
    from lxml import etree
    print("libxml2 version: %i.%i.%i" % etree.LIBXML_VERSION, file=out)
    stime = strtime(time.time())
    print("Local time:", stime, file=out)
    print("sys.argv", sys.argv, file=out)
def strtime(t):
    """Format the given timestamp as local date and time, followed by the
    timezone info from strtimezone().

    @param t: seconds since the epoch
    @return: the formatted time
    @rtype: string
    """
    local_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
    return local_stamp + strtimezone()
def strtimezone():
    """Return the current local UTC offset as +HHMM or -HHMM, which is what
    %z gives on some platforms, but %z is not supported on all.

    @return: the UTC offset of the local timezone, e.g. "+0100"
    @rtype: string
    """
    # time.daylight only tells that a DST zone is defined; whether DST is
    # in effect right now is reported by tm_isdst of the current local time.
    if time.daylight and time.localtime().tm_isdst > 0:
        seconds_west = time.altzone
    else:
        seconds_west = time.timezone
    # time.timezone/altzone count seconds west of UTC, hence the negation.
    offset = -seconds_west
    sign = "-" if offset < 0 else "+"
    # Format hours and minutes separately; a plain "%+04d" of the hours
    # gave "+001" and lost the minutes of zones such as +0530.
    hours, minutes = divmod(abs(offset) // 60, 60)
    return "%s%02d%02d" % (sign, hours, minutes)
def rfc822date(indate):
    """Format the given timestamp as an RFC 822 date in GMT.

    @param indate: seconds since the epoch
    @return: the formatted date
    @rtype: string
    """
    utc_fields = time.gmtime(indate)
    return time.strftime("%a, %d %b %Y %H:%M:%S GMT", utc_fields)
def unquote(text):
    """Replace all percent-encoded entities in text, repeating the decoding
    until the text does not change any more."""
    while '%' in text:
        previous, text = text, parse.unquote(text)
        if text == previous:
            # A remaining '%' is literal; nothing left to decode.
            break
    return text
def getFilename(name):
    """Derive a filename from the given name that is free of dangerous or
    incompatible characters."""
    # every character outside of the allowed set becomes an underscore
    name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
    # runs of dots and runs of underscores shrink to a single character
    name = re.sub(r"\.{2,}", ".", name)
    name = re.sub(r"_{2,}", "_", name)
    # one leading dot or minus is dropped
    if name[:1] in (".", "-"):
        name = name[1:]
    return name
def getExistingFile(name, max_suffix=1000):
    """Try the name itself, then the name with the suffixes "-1", "-2", ...
    inserted before the extension, until an existing file is found.
    @return: filename if file is found
    @raise: ValueError if maximum suffix number is reached while searching
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    next_num = 1
    while not os.path.exists(candidate):
        candidate = "%s-%d%s" % (stem, next_num, ext)
        next_num += 1
        if next_num >= max_suffix:
            raise ValueError("No file %r found" % name)
    return candidate
def getNonexistingFile(name):
    """Try the name itself, then the name with the suffixes "-1", "-2", ...
    inserted before the extension, until no such file exists.
    @return: filename
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    next_num = 1
    while os.path.exists(candidate):
        candidate = "%s-%d%s" % (stem, next_num, ext)
        next_num += 1
    return candidate
def strlimit(s, length=72):
    """Cut the string off at the given limit and append three dots if it is
    longer than the limit. With a limit of zero a too long string becomes
    the empty string.
    @param s: the string to limit
    @type s: string
    @param length: maximum length
    @type length: non-negative integer
    @return: limited string, at most length+3 characters long
    """
    assert length >= 0, "length limit must be a non-negative integer"
    if s and len(s) > length:
        if length == 0:
            return ""
        return "%s..." % s[:length]
    # empty values and strings within the limit are passed through
    return s
def uniq(input):
    """Return a new list with the items of input in their original order,
    each item kept only at its first occurrence."""
    kept = []
    for candidate in input:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
def urlpathsplit(url: str) -> list[str]:
    """Split the path of an URL into its components, without the empty
    segments at the start and the end. This makes URL handling more robust
    against added or removed slashes.
    """
    # Leading and trailing empty segments come from leading and trailing
    # slashes, so stripping those first gives the same components.
    trimmed = parse.urlsplit(url).path.strip('/')
    if not trimmed:
        return []
    return trimmed.split('/')
|