1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
|
"""This submodule contains file system utilities for Prance."""
__author__ = "Jens Finkhaeuser"
__copyright__ = "Copyright (c) 2016-2019 Jens Finkhaeuser"
__license__ = "MIT"
__all__ = ()
# Re-define an error for backwards compatibility
FileNotFoundError = FileNotFoundError # pragma: no cover
# The following constant and function are taken from
# https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
# Sadly, Python fails to provide the following magic number for us.
_ERROR_INVALID_NAME = 123
"""
Windows-specific error code indicating an invalid pathname.
See Also
----------
https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382%28v=vs.85%29.aspx
Official listing of all such codes.
"""
# Following Microsoft documentation, set the default read size for detecting
# a file encoding to a multiple of 4k that seems to work well on various OSes
# and volume sizes.
# https://support.microsoft.com/en-us/help/140365/default-cluster-size-for-ntfs-fat-and-exfat
_READ_CHUNK_SIZE = 64 * 1024
"""
Default read size for detecting file encoding.
"""
def is_pathname_valid(pathname):
"""
Test whether a path name is valid.
:return: True if the passed pathname is valid on the current OS, False
otherwise.
:rtype: bool
"""
import errno
import os
# If this pathname is either not a string or is but is empty, this pathname
# is invalid.
try:
if not isinstance(pathname, str) or not pathname:
return False
# Strip this pathname's Windows-specific drive specifier (e.g., `C:\`)
# if any. Since Windows prohibits path components from containing `:`
# characters, failing to strip this `:`-suffixed prefix would
# erroneously invalidate all valid absolute Windows pathnames.
_, pathname = os.path.splitdrive(pathname)
# Directory guaranteed to exist. If the current OS is Windows, this is
# the drive to which Windows was installed (e.g., the "%SYSTEMDRIVE%"
# environment variable); else, the typical root directory.
# The %systemdrive% (typically c:) is the partition with
# the %systemroot% (typically Windows) directory.
import sys
root_dirname = (
os.environ.get("SYSTEMDRIVE", "C:")
if sys.platform == "win32"
else os.path.sep
)
assert os.path.isdir(root_dirname) # ...Murphy and her ironclad Law
# Append a path separator to this directory if needed.
root_dirname = root_dirname.rstrip(os.path.sep) + os.path.sep
# Test whether each path component split from this pathname is valid or
# not, ignoring non-existent and non-readable path components.
for pathname_part in pathname.split(os.path.sep):
try:
os.lstat(root_dirname + pathname_part)
except OSError as exc:
# If an OS-specific exception is raised, its error code
# indicates whether this pathname is valid or not. Unless this
# is the case, this exception implies an ignorable kernel or
# filesystem complaint (e.g., path not found or inaccessible).
#
# Only the following exceptions indicate invalid pathnames:
#
# * Instances of the Windows-specific "WindowsError" class
# defining the "winerror" attribute whose value is
# "_ERROR_INVALID_NAME". Under Windows, "winerror" is more
# fine-grained and hence useful than the generic "errno"
# attribute. When a too-long pathname is passed, for example,
# "errno" is "ENOENT" (i.e., no such file or directory) rather
# than "ENAMETOOLONG" (i.e., file name too long).
# * Instances of the cross-platform "OSError" class defining the
# generic "errno" attribute whose value is either:
# * Under most POSIX-compatible OSes, "ENAMETOOLONG".
# * Under some edge-case OSes (e.g., SunOS, *BSD), "ERANGE".
if hasattr(exc, "winerror"): # pragma: nocover
if exc.winerror == _ERROR_INVALID_NAME:
return False
elif exc.errno in {errno.ENAMETOOLONG, errno.ERANGE}:
return False
# If a "TypeError" exception was raised, it almost certainly has the
# error message "embedded NUL character" indicating an invalid pathname.
except TypeError: # pragma: nocover
return False
# Null-bytes may also cause this, and they are invalid.
except ValueError:
return False
# If no exception was raised, all path components and hence this
# pathname itself are valid. (Praise be to the curmudgeonly python.)
else:
return True
# If any other exception was raised, this is an unrelated fatal issue
# (e.g., a bug). Permit this exception to unwind the call stack.
#
# Did we mention this should be shipped with Python already?
def from_posix(fname):
"""
Convert a path from posix-like, to the platform format.
:param str fname: The filename in posix-like format.
:return: The filename in the format of the platform.
:rtype: str
"""
import sys
if sys.platform == "win32": # pragma: nocover
if fname[0] == "/":
fname = fname[1:]
fname = fname.replace("/", "\\")
return fname
def to_posix(fname):
"""
Convert a path to posix-like format.
:param str fname: The filename to convert to posix format.
:return: The filename in posix-like format.
:rtype: str
"""
import sys
if sys.platform == "win32": # pragma: nocover
import os.path
if os.path.isabs(fname):
fname = "/" + fname
fname = fname.replace("\\", "/")
return fname
def abspath(filename, relative_to=None):
"""
Return the absolute path of a file relative to a reference file.
If no reference file is given, this function works identical to
`canonical_filename`.
:param str filename: The filename to make absolute.
:param str relative_to: [optional] the reference file name.
:return: The absolute path
:rtype: str
"""
# Create filename relative to the reference, if it exists.
import os.path
fname = from_posix(filename)
if relative_to and not os.path.isabs(fname):
relative_to = from_posix(relative_to)
if os.path.isdir(relative_to):
fname = os.path.join(relative_to, fname)
else:
fname = os.path.join(os.path.dirname(relative_to), fname)
# Make the result canonical
fname = canonical_filename(fname)
return to_posix(fname)
def canonical_filename(filename):
"""
Return the canonical version of a file name.
The canonical version is defined as the absolute path, and all file system
links dereferenced.
:param str filename: The filename to make canonical.
:return: The canonical filename.
:rtype: str
"""
import os.path
path = from_posix(filename)
while True:
path = os.path.abspath(path)
try:
p = os.path.dirname(path)
# os.readlink doesn't exist in windows python2.7
try:
deref_path = os.readlink(path)
except AttributeError: # pragma: no cover
return path
path = os.path.join(p, deref_path)
except OSError:
return path
def detect_encoding(filename, default_to_utf8=True, **kwargs):
"""
Detect the named file's character encoding.
If the first parts of the file appear to be ASCII, this function returns
'UTF-8', as that's a safe superset of ASCII. This can be switched off by
changing the `default_to_utf8` parameter.
:param str filename: The name of the file to detect the encoding of.
:param bool default_to_utf8: Defaults to True. Set to False to disable
treating ASCII files as UTF-8.
:param bool read_all: Keyword argument; if True, reads the entire file
for encoding detection.
:return: The file encoding.
:rtype: str
"""
# Read some of the file
import os.path
filename = from_posix(filename)
file_len = os.path.getsize(filename)
read_len = min(_READ_CHUNK_SIZE, file_len)
# ... unless we're supposed to!
if kwargs.get("read_all", False):
read_len = file_len
# Read the first read_len bytes raw, so we can detect the encoding
with open(filename, "rb") as raw_handle:
raw = raw_handle.read(read_len)
# Detect the encoding the file specifies, if any.
import codecs
if raw.startswith(codecs.BOM_UTF8):
encoding = "utf-8-sig"
else:
# Detect encoding using the best detector available
try:
# First try ICU. ICU will report ASCII in the first 32 Bytes as
# ISO-8859-1, which isn't exactly wrong, but maybe optimistic.
import icu
encoding = icu.CharsetDetector(raw).detect().getName().lower()
except ImportError: # pragma: nocover
# If that doesn't work, try chardet - it's not got native components,
# which is a bonus in some environments, but it's not as precise.
import chardet
encoding = chardet.detect(raw)["encoding"].lower()
# Chardet is more brutal in that it reports ASCII if none of the first
# Bytes contain high bits. To emulate ICU, we just bump up the detected
# encoding.
if encoding == "ascii":
encoding = "iso-8859-1"
# Both chardet and ICU may detect ISO-8859-x, which may not be possible
# to decode as UTF-8. So whatever they report, we'll try decoding as
# UTF-8 before reporting it.
if default_to_utf8 and encoding in ("ascii", "iso-8859-1", "windows-1252"):
# Try decoding as utf-8
try:
raw.decode("utf-8")
# If this worked... well there's no guarantee it's utf-8, to be
# honest.
encoding = "utf-8"
except UnicodeDecodeError:
# Decoding as utf-8 failed, so we can't default to it.
pass
return encoding
def read_file(filename, encoding=None):
"""
Read and decode a file, taking BOMs into account.
:param str filename: The name of the file to read.
:param str encoding: The encoding to use. If not given, detect_encoding is
used to determine the encoding.
:return: The file contents.
:rtype: unicode string
"""
filename = from_posix(filename)
if not encoding:
# Detect encoding
encoding = detect_encoding(filename)
# Finally, read the file in the detected encoding
with open(filename, encoding=encoding) as handle:
return handle.read()
def write_file(filename, contents, encoding=None):
"""
Write a file with the given encoding.
The default encoding is 'utf-8'. It's recommended not to change that for
JSON or YAML output.
:param str filename: The name of the file to read.
:param str contents: The file contents to write.
:param str encoding: The encoding to use. If not given, detect_encoding is
used to determine the encoding.
"""
if not encoding:
encoding = "utf-8"
fname = from_posix(filename)
with open(fname, mode="w", encoding=encoding) as handle:
handle.write(contents)
|