File: differ.py

package info (click to toggle)
psautohint 2.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,148 kB
  • sloc: ansic: 11,179; python: 5,556; makefile: 6; sh: 1
file content (407 lines) | stat: -rwxr-xr-x 13,295 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# Copyright 2018 Adobe. All rights reserved.

"""
Helper script for diff'ing files.
Used as part of the integration tests.
"""

import argparse
import difflib
import filecmp
import logging
import os
import re
import sys

# Version of this script; reported by the '--version' command-line option
__version__ = '0.3.3'

# Logger shared by the whole module; logging is configured in get_options()
logger = logging.getLogger('differ')


# Diff modes accepted by the '--mode' command-line option
TXT_MODE = 'txt'
BIN_MODE = 'bin'
# Separator for passing several strings in a single '--string' option value
SPLIT_MARKER = '_+:+_'
# Default value of the '--encoding' option, used when opening text files
DFLT_ENC = 'utf-8'


class Differ(object):
    """
    Compares two files or two folder trees, in text or binary mode.

    In text mode, lines can be left out of the comparison by the string
    they start with, by their line number, or by a regular expression.
    """

    def __init__(self, opts):
        """
        Copies the comparison settings from 'opts'.

        'opts' must provide the attributes mode, path1, path2, skip_strings,
        skip_lines, skip_regex and encoding.
        """
        self.mode = opts.mode
        self.path1 = opts.path1
        self.path2 = opts.path2
        # tuple of strings containing the starts of lines to skip
        self.skip_strings = opts.skip_strings
        # tuple of integers for the line numbers to skip
        self.skip_lines = opts.skip_lines
        # regex pattern matching the beginning of line
        self.skip_regex = opts.skip_regex
        self.encoding = opts.encoding

    def diff_paths(self):
        """
        Diffs the contents of two paths using the parameters provided.
        Returns True if the contents match, and False if they don't.

        Whether folders or files get compared is decided by path1 alone.
        """
        if os.path.isdir(self.path1):
            return self._diff_dirs()
        return self._diff_files(self.path1, self.path2)

    def _diff_files(self, path1, path2):
        """
        Diffs two files according to the mode: byte comparison for the binary
        mode, and line-based comparison for any other mode.
        Returns True if the contents match, and False if they don't.
        """
        if self.mode == BIN_MODE:
            # shallow=False forces the file contents to be compared. With the
            # default (shallow) comparison, files that have the same size and
            # modification time are taken to be equal without being read
            return filecmp.cmp(path1, path2, shallow=False)
        return self._diff_txt_files(path1, path2)

    def _diff_txt_files(self, path1, path2):
        """
        Diffs two text-based files using difflib.
        Writes a unified diff to stdout when the contents differ.
        Returns True if the contents match, and False if they don't.

        NOTE: This method CANNOT use self.path1 and self.path2 because it
        gets called from _diff_dirs().
        """
        expected = self._read_txt_file(path1)
        actual = self._read_txt_file(path2)
        if actual == expected:
            return True
        sys.stdout.writelines(
            difflib.unified_diff(expected, actual,
                                 fromfile=path1, tofile=path2, n=1))
        return False

    def _read_txt_file(self, path):
        """
        Reads a text file and returns a list of its lines.

        Lines to skip are blanked rather than removed. Every returned line
        has its trailing whitespace replaced by the os-native line separator.
        Exits the program with status 1 if the file can't be decoded using
        the configured encoding.
        """
        # Hard code a first line; this way the difflib results start
        # from 1 instead of zero, thus matching the file's line numbers
        lines = ['']
        try:
            with open(path, "r", encoding=self.encoding) as f:
                for i, line in enumerate(f, 1):
                    # Skip lines that change, such as timestamps
                    if self._line_to_skip(line):
                        logger.debug(f"Matched begin of line. "
                                     f"Skipped: {line.rstrip()}")
                        # Blank the line instead of actually skipping (via
                        # 'continue'); this way the difflib results show the
                        # correct line numbers
                        line = ''
                    # Skip specific lines, referenced by number
                    elif i in self.skip_lines:
                        logger.debug(f"Matched line #{i}. "
                                     f"Skipped: {line.rstrip()}")
                        line = ''
                    # Skip lines that match regex
                    elif self.skip_regex and self.skip_regex.match(line):
                        logger.debug(f"Matched regex begin of line. "
                                     f"Skipped: {line.rstrip()}")
                        line = ''
                    # Use os-native line separator to enable running difflib
                    lines.append(line.rstrip() + os.linesep)
        except UnicodeDecodeError:
            logger.error(f"Couldn't read text file using '{self.encoding}' "
                         f"encoding.\n      File path: {path}")
            sys.exit(1)
        return lines

    def _line_to_skip(self, line):
        """
        Returns True if the beginning of the line matches any of the skip
        strings, and False otherwise.
        """
        # str.startswith() accepts a tuple of prefixes and tests them all
        return line.startswith(tuple(self.skip_strings))

    def _diff_dirs(self):
        """
        Diffs two folders containing files.
        Returns True if all files match. Returns False if the folders' contents
        don't match, or as soon as one non-matching file is found.
        """
        all_rel_file_paths = self._compare_dir_contents()
        if all_rel_file_paths is None:
            return False
        for rel_file_path in all_rel_file_paths:
            # A relative path is what remains after removing the folder's path
            # from the front of the file's path, so plain concatenation
            # rebuilds the file's path inside each folder
            path1 = self.path1 + rel_file_path
            assert os.path.exists(path1), f"Not a valid path1: {path1}"
            path2 = self.path2 + rel_file_path
            assert os.path.exists(path2), f"Not a valid path2: {path2}"
            if not self._diff_files(path1, path2):
                logger.debug(f"Non-matching file: {rel_file_path}")
                return False
        return True

    def _report_dir_diffs(self, all_paths1, all_paths2):
        """
        Returns a string listing the paths that exist in folder 1 but not in 2,
        and vice-versa. The string is empty if neither folder has extra paths.
        """
        diffs_str = ''
        set_1st = set(all_paths1)
        set_2nd = set(all_paths2)
        diff1 = sorted(set_1st - set_2nd)
        diff2 = sorted(set_2nd - set_1st)
        if diff1:
            dir1 = os.path.basename(self.path1)
            dj1 = '\n    '.join(diff1)
            diffs_str += (f"\n  In 1st folder ({dir1}) but not in 2nd:"
                          f"\n    {dj1}")
        if diff2:
            dir2 = os.path.basename(self.path2)
            dj2 = '\n    '.join(diff2)
            diffs_str += (f"\n  In 2nd folder ({dir2}) but not in 1st:"
                          f"\n    {dj2}")
        return diffs_str

    def _compare_dir_contents(self):
        """
        Checks if two directory trees have the same files and folders.
        Returns a list of relative paths to all files if the dirs' contents
        match, and None if they don't. The differences are logged at INFO
        level.
        """
        all_paths1 = self._get_all_file_paths_in_dir_tree(self.path1)
        all_paths2 = self._get_all_file_paths_in_dir_tree(self.path2)
        if all_paths1 != all_paths2:
            dd = self._report_dir_diffs(all_paths1, all_paths2)
            logger.info(f"Folders' contents don't match.{dd}")
            return None
        return all_paths1

    @staticmethod
    def _get_all_file_paths_in_dir_tree(start_path):
        """
        Returns a sorted list of relative paths of all files in a directory
        tree. Each relative path is the file's path with 'start_path' removed
        from its front.
        """
        # Every path produced by os.walk() begins with start_path, so slicing
        # removes exactly that prefix. (Using str.replace() here would also
        # delete any later occurrence of start_path inside the path.)
        prefix_len = len(start_path)
        all_paths = sorted(
            os.path.join(dir_name, f_name)[prefix_len:]
            for dir_name, _, file_names in os.walk(start_path)
            for f_name in file_names)

        logger.debug(f"All paths: {all_paths}")

        return all_paths


def _get_path_kind(pth):
    """
    Returns a string describing the kind of path.
    Possible values are 'file', 'folder', and 'invalid'.

    'invalid' is returned for paths that don't exist, for values that make
    the os.path checks raise TypeError, and for existing paths that are
    neither a file nor a folder.
    """
    try:
        if os.path.isfile(pth):
            return 'file'
        if os.path.isdir(pth):
            return 'folder'
    except TypeError:
        # Not something the os.path functions accept as a path
        pass
    # Anything that is not a file or a folder is reported as invalid, so the
    # function never falls through and returns None
    return 'invalid'


def _paths_are_same_kind(path1, path2):
    """
    Tells whether the two paths are of the same kind.
    Returns True when both are files or both are folders; False otherwise.
    """
    both = (path1, path2)
    # The checks are materialized as lists so that both paths always get
    # checked, even if the first one already fails the check
    if all(list(map(os.path.isfile, both))):
        return True
    return all(list(map(os.path.isdir, both)))


def _validate_path(path_str):
    """
    Argparse type callback for the path arguments.
    Returns the absolute path, after resolving symbolic links. Raises
    argparse.ArgumentTypeError if the resulting path doesn't exist.
    """
    resolved = os.path.realpath(path_str)
    valid_path = os.path.abspath(resolved)
    if os.path.exists(valid_path):
        return valid_path
    raise argparse.ArgumentTypeError(
        f"{path_str} is not a valid path.")


def _split_string_sequence(str_seq):
    """
    Argparse type callback for the skip strings.
    Splits the value at each SPLIT_MARKER and returns the pieces as a tuple.
    """
    pieces = str_seq.split(SPLIT_MARKER)
    return tuple(pieces)


def _split_num_range_or_delta(num_str):
    """
    Splits one item of a line-number sequence.

    Returns a three-item list [first, second, sign] if the item contains
    exactly one minus (range) or, failing that, exactly one plus (delta).
    Returns the item unchanged if it is numeric. Raises
    argparse.ArgumentTypeError for anything else.
    """
    # The range sign is tried before the delta sign
    for sign in ('-', '+'):
        parts = num_str.split(sign)
        if len(parts) == 2:
            return parts + [sign]
    if num_str.isnumeric():
        return num_str
    raise argparse.ArgumentTypeError(
        f"Invalid number range or delta: {num_str}")


def _convert_to_int(num_str):
    """
    Returns the integer value of a string.
    Raises argparse.ArgumentTypeError if int() rejects the string.
    """
    try:
        number = int(num_str)
    except ValueError:
        raise argparse.ArgumentTypeError(f"Not a number: {num_str}")
    return number


def _expand_num_range_or_delta(num_str_lst):
    """
    Expands a [first, second, sign] list into a list of consecutive integers.

    With the '+' sign, 'second' is a count added to 'first'. With any other
    sign, 'second' is the last number of the range; in that case
    argparse.ArgumentTypeError is raised if it is smaller than 'first'.
    Both ends are included in the result.
    """
    first = _convert_to_int(num_str_lst[0])
    second = _convert_to_int(num_str_lst[1])
    if num_str_lst[2] == '+':
        last = first + second
    else:  # range
        if second < first:
            raise argparse.ArgumentTypeError(
                f"The start of range value is larger than the end of range "
                f"value: {first}-{second}")
        last = second
    return list(range(first, last + 1))


def _convert_seq_to_ints(num_seq):
    """
    Converts a sequence of split items into a sorted list of unique integers.
    Items that are lists get expanded as ranges or deltas; all other items
    are converted to a single integer.
    """
    numbers = set()
    for entry in num_seq:
        if isinstance(entry, list):
            numbers.update(_expand_num_range_or_delta(entry))
        else:
            numbers.add(_convert_to_int(entry))
    return sorted(numbers)


def _split_linenumber_sequence(str_seq):
    """
    Argparse type callback for the line numbers to skip.
    Splits the value at the commas and returns a tuple with all the line
    numbers that the items stand for.
    """
    items = str_seq.split(',')
    split_items = list(map(_split_num_range_or_delta, items))
    return tuple(_convert_seq_to_ints(split_items))


def _compile_regex(str_seq):
    """
    Argparse type callback for the regular expression option.
    Returns the compiled pattern. Raises argparse.ArgumentTypeError if the
    expression doesn't start with a caret, or if it fails to compile.
    """
    if str_seq.startswith('^'):
        try:
            pattern = re.compile(str_seq)
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'The expression is invalid: {err}')
        return pattern
    raise argparse.ArgumentTypeError(
        "The expression must start with the caret '^' character")


def get_options(args):
    """
    Parses the command-line arguments and returns the resulting namespace.

    Also configures logging from the verbosity count, and ends the program
    with a parser error if the two paths are not both files or both folders.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='verbose mode\n'
             'Use -vv for debug mode')
    parser.add_argument(
        '-m', '--mode',
        choices=(TXT_MODE, BIN_MODE),
        default=TXT_MODE,
        help='diff mode (default: %(default)s)')
    parser.add_argument(
        '-s', '--string',
        dest='skip_strings',
        type=_split_string_sequence,
        default=(),
        help=f'string for matching the beginning of a line to skip\n'
             f'For multiple strings, separate them with {SPLIT_MARKER}')
    parser.add_argument(
        '-l', '--line',
        dest='skip_lines',
        type=_split_linenumber_sequence,
        default=(),
        help='number of a line to skip\n'
             'For multiple line numbers, separate them with a comma (,).\n'
             'For ranges of lines, use a minus (-) between two numbers.\n'
             'For a line delta, use a plus (+) between two numbers.')
    parser.add_argument(
        '-r', '--regex',
        dest='skip_regex',
        type=_compile_regex,
        help='regular expression matching the beginning of a line to skip\n'
             "The expression must start with the caret '^' character, "
             'and characters such as backslash, semicolon, and space need '
             'to be escaped.')
    parser.add_argument(
        '-e', '--encoding',
        choices=(DFLT_ENC, 'macroman'),
        default=DFLT_ENC,
        help='encoding to use when opening text files (default: %(default)s)')
    parser.add_argument(
        'path1', metavar='PATH1', type=_validate_path,
        help='1st path for comparison')
    parser.add_argument(
        'path2', metavar='PATH2', type=_validate_path,
        help='2nd path for comparison')
    options = parser.parse_args(args)

    # No -v gives warnings only, a single -v adds info, and more add debug
    level = "DEBUG"
    if not options.verbose:
        level = "WARNING"
    elif options.verbose == 1:
        level = "INFO"
    logging.basicConfig(level=level)

    same_kind = _paths_are_same_kind(options.path1, options.path2)
    if not same_kind:
        kp1 = _get_path_kind(options.path1)
        kp2 = _get_path_kind(options.path2)
        parser.error(f"The paths are not of the same kind. "
                     f"Path1's kind is {kp1}. "
                     f"Path2's kind is {kp2}.")

    logger.debug(f"Line numbers: {options.skip_lines}")
    # skip_regex is None when the option wasn't used
    regexpat = getattr(options.skip_regex, 'pattern', None)
    logger.debug(f"Regular expression: {regexpat}")

    return options


def main(args=None):
    """
    Parses the arguments and runs the comparison they describe.
    Returns True if the inputs match, and False if they don't.
    """
    options = get_options(args)
    return Differ(options).diff_paths()


if __name__ == "__main__":
    # Exit status 0 when the inputs match, and 1 when they don't
    exit_status = 0 if main() else 1
    sys.exit(exit_status)