1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
|
# Copyright 2018 Adobe. All rights reserved.
"""
Helper script for diff'ing files.
Used as part of the integration tests.
"""
import argparse
import difflib
import filecmp
import logging
import os
import re
import sys
__version__ = '0.3.3'
logger = logging.getLogger('differ')
TXT_MODE = 'txt'
BIN_MODE = 'bin'
SPLIT_MARKER = '_+:+_'
DFLT_ENC = 'utf-8'
class Differ(object):
def __init__(self, opts):
self.mode = opts.mode
self.path1 = opts.path1
self.path2 = opts.path2
# tuple of strings containing the starts of lines to skip
self.skip_strings = opts.skip_strings
# tuple of integers for the line numbers to skip
self.skip_lines = opts.skip_lines
# regex pattern matching the beginning of line
self.skip_regex = opts.skip_regex
self.encoding = opts.encoding
def diff_paths(self):
"""
Diffs the contents of two paths using the parameters provided.
Returns True if the contents match, and False if they don't.
"""
if os.path.isdir(self.path1):
return self._diff_dirs()
else: # paths are files
if self.mode == TXT_MODE:
return self._diff_txt_files(self.path1, self.path2)
elif self.mode == BIN_MODE:
return filecmp.cmp(self.path1, self.path2)
def _diff_txt_files(self, path1, path2):
"""
Diffs two text-based files using difflib.
Returns True if the contents match, and False if they don't.
NOTE: This method CANNOT use self.path1 and self.path2 because it
gets called from _diff_dirs().
"""
expected = self._read_txt_file(path1)
actual = self._read_txt_file(path2)
if actual != expected:
for line in difflib.unified_diff(expected, actual,
fromfile=path1,
tofile=path2, n=1):
sys.stdout.write(line)
return False
return True
def _read_txt_file(self, path):
"""
Reads a text file and returns a list of its lines.
"""
# Hard code a first line; this way the difflib results start
# from 1 instead of zero, thus matching the file's line numbers
lines = ['']
try:
with open(path, "r", encoding=self.encoding) as f:
for i, line in enumerate(f.readlines(), 1):
# Skip lines that change, such as timestamps
if self._line_to_skip(line):
logger.debug(f"Matched begin of line. "
f"Skipped: {line.rstrip()}")
# Blank the line instead of actually skipping (via
# 'continue'); this way the difflib results show the
# correct line numbers
line = ''
# Skip specific lines, referenced by number
elif i in self.skip_lines:
logger.debug(f"Matched line #{i}. "
f"Skipped: {line.rstrip()}")
line = ''
# Skip lines that match regex
elif self.skip_regex and self.skip_regex.match(line):
logger.debug(f"Matched regex begin of line. "
f"Skipped: {line.rstrip()}")
line = ''
# Use os-native line separator to enable running difflib
lines.append(line.rstrip() + os.linesep)
except UnicodeDecodeError:
logger.error(f"Couldn't read text file using '{self.encoding}' "
f"encoding.\n File path: {path}")
sys.exit(1)
return lines
def _line_to_skip(self, line):
"""
Loops over the skip items.
Returns True if the beginning of the line matches a skip item.
"""
for item in self.skip_strings:
if line.startswith(item):
return True
return False
def _diff_dirs(self):
"""
Diffs two folders containing files.
Returns True if all files match. Returns False if the folders' contents
don't match, or as soon as one non-matching file is found.
"""
all_rel_file_paths = self._compare_dir_contents()
if all_rel_file_paths is None:
return False
for rel_file_path in all_rel_file_paths:
path1 = self.path1 + rel_file_path
assert os.path.exists(path1), f"Not a valid path1: {path1}"
path2 = self.path2 + rel_file_path
assert os.path.exists(path2), f"Not a valid path2: {path2}"
if self.mode == BIN_MODE:
diff_result = filecmp.cmp(path1, path2)
else:
diff_result = self._diff_txt_files(path1, path2)
if not diff_result:
logger.debug(f"Non-matching file: {rel_file_path}")
return False
return True
def _report_dir_diffs(self, all_paths1, all_paths2):
"""
Returns a string listing the paths that exist in folder 1 but not in 2,
and vice-versa.
"""
diffs_str = ''
set_1st = set(all_paths1)
set_2nd = set(all_paths2)
diff1 = sorted(set_1st - set_2nd)
diff2 = sorted(set_2nd - set_1st)
if diff1:
dir1 = os.path.basename(self.path1)
dj1 = '\n '.join(diff1)
diffs_str += (f"\n In 1st folder ({dir1}) but not in 2nd:"
f"\n {dj1}")
if diff2:
dir2 = os.path.basename(self.path2)
dj2 = '\n '.join(diff2)
diffs_str += (f"\n In 2nd folder ({dir2}) but not in 1st:"
f"\n {dj2}")
return diffs_str
def _compare_dir_contents(self):
"""
Checks if two directory trees have the same files and folders.
Returns a list of relative paths to all files if the dirs' contents
match, and None if they don't.
"""
all_paths1 = self._get_all_file_paths_in_dir_tree(self.path1)
all_paths2 = self._get_all_file_paths_in_dir_tree(self.path2)
if all_paths1 != all_paths2:
dd = self._report_dir_diffs(all_paths1, all_paths2)
logger.info(f"Folders' contents don't match.{dd}")
return None
return all_paths1
@staticmethod
def _get_all_file_paths_in_dir_tree(start_path):
"""
Returns a list of relative paths of all files in a directory tree.
The list's items are ordered top-down according to the tree.
"""
all_paths = []
for dir_name, _, file_names in os.walk(start_path):
all_paths.extend(
[os.path.join(dir_name, f_name) for f_name in file_names])
# Make the paths relative, and enforce order.
all_paths = sorted(
[path.replace(start_path, '') for path in all_paths])
logger.debug(f"All paths: {all_paths}")
return all_paths
def _get_path_kind(pth):
"""
Returns a string describing the kind of path.
Possible values are 'file', 'folder', and 'invalid'.
"""
try:
if os.path.isfile(pth):
return 'file'
elif os.path.isdir(pth):
return 'folder'
elif not os.path.exists(pth):
return 'invalid'
except TypeError:
return 'invalid'
def _paths_are_same_kind(path1, path2):
"""
Checks that both paths are either files or folders.
Returns boolean.
"""
if all([os.path.isfile(path) for path in (path1, path2)]):
return True
elif all([os.path.isdir(path) for path in (path1, path2)]):
return True
return False
def _validate_path(path_str):
valid_path = os.path.abspath(os.path.realpath(path_str))
if not os.path.exists(valid_path):
raise argparse.ArgumentTypeError(
f"{path_str} is not a valid path.")
return valid_path
def _split_string_sequence(str_seq):
return tuple(str_seq.split(SPLIT_MARKER))
def _split_num_range_or_delta(num_str):
num_range = num_str.split('-') + ['-']
num_delta = num_str.split('+') + ['+']
if len(num_range) == 3:
return num_range
elif len(num_delta) == 3:
return num_delta
elif num_str.isnumeric():
return num_str
else:
raise argparse.ArgumentTypeError(
f"Invalid number range or delta: {num_str}")
def _convert_to_int(num_str):
try:
return int(num_str)
except ValueError:
raise argparse.ArgumentTypeError(f"Not a number: {num_str}")
def _expand_num_range_or_delta(num_str_lst):
start_num = _convert_to_int(num_str_lst[0])
rng_dlt_num = _convert_to_int(num_str_lst[1])
sign = num_str_lst[2]
if sign == '+':
return list(range(start_num, start_num + rng_dlt_num + 1))
else: # sign == '-'
if not (rng_dlt_num >= start_num):
raise argparse.ArgumentTypeError(
f"The start of range value is larger than the end of range "
f"value: {start_num}-{rng_dlt_num}")
return list(range(start_num, rng_dlt_num + 1))
def _convert_seq_to_ints(num_seq):
seq = []
for item in num_seq:
if isinstance(item, list):
seq.extend(_expand_num_range_or_delta(item))
else:
seq.append(_convert_to_int(item))
return sorted(set(seq))
def _split_linenumber_sequence(str_seq):
num_seq = [_split_num_range_or_delta(item) for item in str_seq.split(',')]
return tuple(_convert_seq_to_ints(num_seq))
def _compile_regex(str_seq):
if not str_seq.startswith('^'):
raise argparse.ArgumentTypeError(
"The expression must start with the caret '^' character")
try:
return re.compile(str_seq)
except re.error as err:
raise argparse.ArgumentTypeError(
f'The expression is invalid: {err}')
def get_options(args):
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter,
description=__doc__
)
parser.add_argument(
'--version',
action='version',
version=__version__
)
parser.add_argument(
'-v',
'--verbose',
action='count',
default=0,
help='verbose mode\n'
'Use -vv for debug mode'
)
parser.add_argument(
'-m',
'--mode',
default=TXT_MODE,
choices=(TXT_MODE, BIN_MODE),
help='diff mode (default: %(default)s)'
)
parser.add_argument(
'-s',
'--string',
dest='skip_strings',
type=_split_string_sequence,
default=(),
help=f'string for matching the beginning of a line to skip\n'
f'For multiple strings, separate them with {SPLIT_MARKER}'
)
parser.add_argument(
'-l',
'--line',
dest='skip_lines',
type=_split_linenumber_sequence,
default=(),
help='number of a line to skip\n'
'For multiple line numbers, separate them with a comma (,).\n'
'For ranges of lines, use a minus (-) between two numbers.\n'
'For a line delta, use a plus (+) between two numbers.'
)
parser.add_argument(
'-r',
'--regex',
dest='skip_regex',
type=_compile_regex,
help='regular expression matching the beginning of a line to skip\n'
"The expression must start with the caret '^' character, "
'and characters such as backslash, semicolon, and space need '
'to be escaped.'
)
parser.add_argument(
'-e',
'--encoding',
default=DFLT_ENC,
choices=(DFLT_ENC, 'macroman'),
help='encoding to use when opening text files (default: %(default)s)'
)
parser.add_argument(
'path1',
metavar='PATH1',
type=_validate_path,
help='1st path for comparison'
)
parser.add_argument(
'path2',
metavar='PATH2',
type=_validate_path,
help='2nd path for comparison'
)
options = parser.parse_args(args)
if not options.verbose:
level = "WARNING"
elif options.verbose == 1:
level = "INFO"
else:
level = "DEBUG"
logging.basicConfig(level=level)
if not _paths_are_same_kind(options.path1, options.path2):
kp1 = _get_path_kind(options.path1)
kp2 = _get_path_kind(options.path2)
parser.error(f"The paths are not of the same kind. "
f"Path1's kind is {kp1}. "
f"Path2's kind is {kp2}.")
logger.debug(f"Line numbers: {options.skip_lines}")
regexpat = getattr(options.skip_regex, 'pattern', None)
logger.debug(f"Regular expression: {regexpat}")
return options
def main(args=None):
"""
Returns True if the inputs match, and False if they don't.
"""
opts = get_options(args)
differ = Differ(opts)
return differ.diff_paths()
if __name__ == "__main__":
sys.exit(0 if main() else 1)
|