1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import unicode_literals, division, absolute_import, print_function
import sys
import codecs
# Interpreter / platform flags consulted by the helpers in this module.
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
iswindows = sys.platform.startswith('win')

# unquote lives in urllib.parse on Python 3 and in urllib on Python 2.
try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

# _h is bound to whichever object offers an unescape() callable on the
# running interpreter: an HTMLParser instance on older versions, the html
# module itself from Python 3.4 on.
if PY2:
    from HTMLParser import HTMLParser
    _h = HTMLParser()
elif sys.version_info < (3, 4):
    # Compare the whole version tuple; looking at the minor number alone
    # would misclassify any later major release whose minor is below 4.
    import html.parser
    _h = html.parser.HTMLParser()
else:
    import html as _h

if PY3:
    text_type = str
    binary_type = bytes
    # if will be printing arbitrary binary data to stdout on python 3
    # sys.stdin = sys.stdin.detach()
    # sys.stdout = sys.stdout.detach()
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
else:
    # Deliberately rebinds range to the lazy xrange for the rest of the module.
    range = xrange
    text_type = unicode
    binary_type = str
    # if will be printing unicode under python 2 need to protect
    # against sys.stdout.encoding being None stupidly forcing ascii encoding of unicode
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
# (and they amazingly claim by design and no bug!)
# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>
# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>
# This mind boggling behaviour also happens when indexing a bytestring and/or
# iterating over a bytestring. In other words it will return an int but not
# the byte itself!!!!!!!
# The only way to access a single byte as a byte in bytestring and get the byte in both
# Python 2 and Python 3 is to use a slice
# This problem is so common there are horrible hacks floating around the net to **try**
# to work around it, so that code that works on both Python 2 and Python 3 is possible.
# So in order to write code that works on both Python 2 and Python 3
# if you index or access a single byte and want its ord() then use the bord() function.
# If instead you want it as a single character byte use the bchar() function
# both of which are defined below.
if not PY3:
    # Python 2: a bytestring element is already a one-character str.
    def bchr(s):
        """Return the single-character bytestring for integer code *s*."""
        return chr(s)

    def bstr(s):
        """Return *s* converted with str()."""
        return str(s)

    def bord(s):
        """Return the integer value of the single byte *s*."""
        return ord(s)

    def bchar(s):
        """Return *s* unchanged; it is already a single-character bytestring."""
        return s
else:
    # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
    # in place of ascii you will get a byte value to half-word or integer value
    # one-to-one mapping (in the 0 - 255 range)
    def bchr(s):
        """Return a length-one bytes object holding the integer *s*."""
        return bytes((s,))

    def bstr(s):
        """Return *s* as bytes; text is encoded as latin-1, anything else goes through bytes()."""
        if not isinstance(s, str):
            return bytes(s)
        return bytes(s, 'latin-1')

    def bord(s):
        """Return *s* unchanged; indexing bytes on Python 3 already yields an int."""
        return s

    def bchar(s):
        """Return a length-one bytes object holding the integer *s*."""
        return bytes((s,))
if not PY3:
    import __builtin__
    # On Python 2 the builtins already return lists, so alias them directly
    # (taken from __builtin__ because this module rebinds range to xrange).
    lrange, lzip = __builtin__.range, __builtin__.zip
    lmap, lfilter = __builtin__.map, __builtin__.filter
else:
    # list-producing versions of the major Python iterating functions
    def lrange(*args, **kwargs):
        """Return range(...) materialised as a list."""
        lazy = range(*args, **kwargs)
        return list(lazy)

    def lzip(*args, **kwargs):
        """Return zip(...) materialised as a list."""
        lazy = zip(*args, **kwargs)
        return list(lazy)

    def lmap(*args, **kwargs):
        """Return map(...) materialised as a list."""
        lazy = map(*args, **kwargs)
        return list(lazy)

    def lfilter(*args, **kwargs):
        """Return filter(...) materialised as a list."""
        lazy = filter(*args, **kwargs)
        return list(lazy)
# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii
def hexlify(bdata):
    """Return the hexadecimal form of the bytestring *bdata* as text.

    binascii.hexlify() produces a bytestring; it is decoded as ASCII so the
    caller receives a text string.
    """
    hex_bytes = binascii.hexlify(bdata)
    return hex_bytes.decode('ascii')
# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5, Python 3 is okay with either
# If you: import re
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
# searched ... but u"" is not allowed for the pattern itself only b""
# Python 2.X allows the pattern to be any type and converts it to match the data
# and returns the same type as the data
# convert string to be utf-8 encoded
def utf8_str(p, enc='utf-8'):
    """Return *p* as a UTF-8 encoded bytestring.

    None is passed through as None. Text is encoded as UTF-8. A bytestring
    is returned untouched when *enc* is 'utf-8'; otherwise it is decoded
    with *enc* and re-encoded as UTF-8.
    """
    if p is None:
        return None
    if not isinstance(p, text_type):
        if enc == 'utf-8':
            # Already claimed to be UTF-8 bytes: nothing to convert.
            return p
        p = p.decode(enc)
    return p.encode('utf-8')
# convert string to be unicode encoded
def unicode_str(p, enc='utf-8'):
    """Return *p* as a text string.

    None and values that are already text are returned as-is; anything else
    is decoded using *enc*.
    """
    if p is None or isinstance(p, text_type):
        return p
    return p.decode(enc)
# Every character with a code point below 128.
ASCII_CHARS = set(map(chr, range(128)))
# ASCII characters that are left unquoted: letters, digits and '#_.-/~'.
URL_SAFE = set(''.join((
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    'abcdefghijklmnopqrstuvwxyz',
    '0123456789',
    '#',
    '_.-/~',
)))
# The remaining ASCII characters, i.e. the ones that get percent-quoted.
IRI_UNSAFE = ASCII_CHARS.difference(URL_SAFE)
# returns a quoted IRI (not a URI)
def quoteurl(href):
    """Percent-quote the unsafe ASCII characters of *href*.

    A bytestring argument is first decoded as UTF-8. Each character found in
    IRI_UNSAFE is replaced by '%' followed by its two-digit lowercase hex
    code; every other character (including all non-ASCII ones) is kept.
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return ''.join(
        "%%%02x" % ord(ch) if ch in IRI_UNSAFE else ch
        for ch in href
    )
# unquotes url/iri
def unquoteurl(href):
    """Return *href* with its percent-escapes expanded by unquote().

    A bytestring argument is decoded as UTF-8 before unquoting.
    """
    text = href.decode('utf-8') if isinstance(href, binary_type) else href
    return unquote(text)
# unescape html
def unescapeit(sval):
    """Return *sval* after passing it through the module-level _h.unescape()."""
    unescaped = _h.unescape(sval)
    return unescaped
# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# ie. To get sys.argv arguments and properly encode them as unicode
def unicode_argv():
    """Return the command-line arguments as unicode strings.

    On Python 3 sys.argv is returned unchanged. On Python 2 under Windows
    the arguments are re-read through GetCommandLineW / CommandLineToArgvW
    and trimmed to the same length as sys.argv. On any other Python 2
    platform each byte-string argument is decoded with the stdin encoding,
    falling back to the filesystem encoding and finally to 'utf-8'.

    Returns None only if the Windows API reports no arguments at all.
    """
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR
        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR
        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)
        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            start = argc.value - len(sys.argv)
            return [argv[i] for i in range(start, argc.value)]
        # this should never happen
        return None
    # sys.stdin may be None or may have been replaced by an object without
    # an 'encoding' attribute, so read it defensively instead of raising.
    argvencoding = getattr(sys.stdin, 'encoding', None)
    if argvencoding is None:
        argvencoding = sys.getfilesystemencoding()
    if argvencoding is None:
        argvencoding = 'utf-8'
    return [
        arg if isinstance(arg, text_type) else arg.decode(argvencoding)
        for arg in sys.argv
    ]
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    """Make 'cp65001' resolve to the UTF-8 codec on Python 2.

    Does nothing on other interpreters, or when a 'cp65001' codec can
    already be looked up. Otherwise registers a codec search function that
    answers only for the name 'cp65001'.
    """
    if not PY2:
        return
    try:
        codecs.lookup('cp65001')
    except LookupError:
        def _find_cp65001(name):
            # A search function returns codec info for names it handles
            # and None for everything else.
            return codecs.lookup('utf-8') if name == 'cp65001' else None

        codecs.register(_find_cp65001)
|