File: pdf_chapter_diff.py

package info (click to toggle)
openxr-sdk-source 1.0.14~dfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,564 kB
  • sloc: python: 16,103; cpp: 12,052; ansic: 8,813; xml: 3,480; sh: 410; makefile: 338; ruby: 247
file content (482 lines) | stat: -rwxr-xr-x 16,646 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
#!/usr/bin/env python3
# Copyright (c) 2019 Collabora, Ltd.
#
# SPDX-License-Identifier: Apache-2.0
#
# Author(s):    Ryan Pavlik <ryan.pavlik@collabora.com>
#
# Purpose:      This script helps drive a per-section PDF diff.

import re
from itertools import chain, zip_longest
from pathlib import Path
from pprint import pprint

import attr
from PyPDF2 import PdfFileReader

from pdf_diff import command_line as pdf_diff

# Pattern for a numbered title such as "2.1. Some Title": one or more
# "<digits>." groups (captured as section_num), a single space, then the
# remainder of the title (captured as title_text).
NUMBERED_TITLE_RE = re.compile(
    r'(?P<section_num>([0-9]+[.])+) (?P<title_text>.*)')


@attr.s
class Bookmark:
    """One entry of a PDF outline, flattened into a plain record."""

    # Title text of the outline entry.
    title = attr.ib()
    # Nesting depth in the outline; outline_to_bookmarks starts at 1 for the
    # outermost entries and adds 1 per nested list.
    level = attr.ib()
    # 1-based page number of the bookmark's destination.
    page_number = attr.ib()
    # Vertical positions measured downward from the upper edge of the page's
    # bleed box (outline_to_bookmarks flips the PDF coordinate).
    top = attr.ib(default=None)
    bottom = attr.ib(default=None)
    # Horizontal positions, stored as floats without any flipping.
    left = attr.ib(default=None)
    right = attr.ib(default=None)
    # Title prefixed with its ancestors' titles; filled by add_nested_title.
    nested_title = attr.ib(default=None)

    def asdict(self):
        """Return all fields of this bookmark as a dict."""
        return attr.asdict(self)


@attr.s
class Section:
    """A run of pages in one PDF that is compared as a single unit.

    ``page_start`` and ``page_end`` are inclusive page numbers.
    ``page_start_top`` / ``page_end_bottom`` optionally narrow the first and
    last page to the part that belongs to this section.  ``pdf`` is the
    owning document object; its ``fn`` attribute is read when building the
    diff options.
    """

    title = attr.ib()
    level = attr.ib()
    page_start = attr.ib()
    page_end = attr.ib()
    pdf = attr.ib(default=None)
    page_start_top = attr.ib(default=None)
    page_end_bottom = attr.ib(default=None)

    @property
    def page_numbers(self):
        """Range covering every page of the section, ``page_end`` included."""
        first, last = self.page_start, self.page_end
        return range(first, last + 1)

    @property
    def section_number(self):
        """Leading section number of the title (e.g. "2.1."), or None."""
        numbered = NUMBERED_TITLE_RE.match(self.title)
        return numbered.group('section_num') if numbered else None

    @property
    def title_text(self):
        """Title without its leading section number.

        Titles that are not numbered are returned unchanged.
        """
        numbered = NUMBERED_TITLE_RE.match(self.title)
        return numbered.group('title_text') if numbered else self.title

    @property
    def pdf_diff_options(self):
        """Dict describing this section's range for the pdf_diff module.

        Always holds ``page_start``, ``page_end`` and ``fn``; the two
        partial-page limits are added only when they are set (truthy).
        """
        section_fields = attr.fields(Section)
        page_range_only = attr.filters.include(
            section_fields.page_start, section_fields.page_end)
        options = attr.asdict(self, filter=page_range_only)
        options['fn'] = str(self.pdf.fn)
        for limit_name in ('page_start_top', 'page_end_bottom'):
            limit = getattr(self, limit_name)
            if limit:
                options[limit_name] = limit
        return options

    def asdict(self):
        """Return every field except ``pdf`` as a dict."""
        all_but_pdf = attr.filters.exclude(attr.fields(Section).pdf)
        return attr.asdict(self, filter=all_but_pdf)


def add_nested_title(bookmarks):
    """Set ``nested_title`` on every bookmark to its full outline path.

    The path is the titles of the bookmark's ancestors followed by its own
    title, joined with " : ".  ``bookmarks`` must be in document order, with
    ``level`` 1 for the outermost entries, as produced by
    outline_to_bookmarks.  The bookmarks are modified in place; nothing is
    returned.
    """
    title_stack = []
    for bookmark in bookmarks:
        # Drop titles that are not ancestors of this bookmark: anything at
        # the same depth or deeper.  The emptiness check keeps an unexpected
        # level below 1 from popping an empty list.
        while title_stack and len(title_stack) >= bookmark.level:
            title_stack.pop()
        title_stack.append(bookmark.title)
        bookmark.nested_title = " : ".join(title_stack)


def outline_to_bookmarks(reader, outline=None, level=1, bookmarks=None):
    """Flatten a (possibly nested) PDF outline into a list of Bookmark.

    ``reader`` supplies the outline (when ``outline`` is None), the
    destination page index of each entry, and the page objects.  Nested
    lists inside ``outline`` are walked recursively with ``level`` increased
    by one.  Results are appended to ``bookmarks`` (a new list when None),
    which is also the return value.
    """
    if outline is None:
        outline = reader.getOutlines()
    if bookmarks is None:
        bookmarks = []

    for entry in outline:
        if isinstance(entry, list):
            # A nested list holds the children of the preceding entry.
            outline_to_bookmarks(reader, outline=entry, level=level + 1,
                                 bookmarks=bookmarks)
            continue

        page_index = reader.getDestinationPageNumber(entry)
        # Bookmarks carry page numbers one higher than the reader's index.
        mark = Bookmark(title=entry.title,
                        level=level,
                        page_number=page_index + 1)
        _, page_upper_y = reader.getPage(page_index).bleedBox.upperLeft

        # Coordinate system is flipped compared to pdf_diff, so vertical
        # positions are re-expressed relative to the upper edge.
        if entry.top:
            mark.top = float(page_upper_y) - float(entry.top)
        if entry.bottom:
            mark.bottom = float(page_upper_y) - float(entry.bottom)
        if entry.left:
            mark.left = float(entry.left)
        if entry.right:
            mark.right = float(entry.right)
        bookmarks.append(mark)
    return bookmarks


def add_section_to_page_map(section, page_map):
    """Register ``section`` under each of its page numbers in ``page_map``.

    ``page_map`` maps a page number to the list of sections touching that
    page; it is updated in place and missing entries are created.
    """
    for page_number in section.page_numbers:
        page_map.setdefault(page_number, []).append(section)


def compute_section_page_map(sections):
    """Return a dict mapping each page number to the sections covering it.

    Sections are listed per page in the order they appear in ``sections``.
    """
    page_map = {}
    for section in sections:
        for page_number in section.page_numbers:
            page_map.setdefault(page_number, []).append(section)
    return page_map


class PdfSpec:
    """A PDF document together with its outline and derived section tables.

    The constructor opens the file and flattens its outline into
    ``bookmark_data``.  ``compute_sections`` must be called before
    ``comparable_sections``, ``page_map`` or ``find_corresponding_section``
    are used, because it is what creates the section lookup attributes.
    """

    def __init__(self, fn):
        """Open the PDF at ``fn`` and read its outline.

        ``fn`` is stored unchanged and converted with ``str()`` where a
        string is needed.
        """
        self.fn = fn
        # The file object is handed to the reader and is not closed here.
        self.reader = PdfFileReader(open(str(fn), 'rb'))
        self.bookmark_data = outline_to_bookmarks(self.reader)

        # Caches filled on first access by the page_pdfs / dom properties.
        self._page_pdfs = None

        self._dom = None

    @property
    def dom(self):
        """Result of ``pdf_diff.pdf_to_dom`` for this file, computed once."""
        if self._dom is None:
            self._dom = pdf_diff.pdf_to_dom(str(self.fn))
        return self._dom

    @property
    def page_pdfs(self):
        """Result of ``pypdftk.split`` for this file, computed once."""
        import pypdftk
        if not self._page_pdfs:
            self._page_pdfs = pypdftk.split(self.fn)
        return self._page_pdfs

    def pdf_for_page(self, pagenum):
        """Return the ``page_pdfs`` entry at index ``pagenum``.

        Returns None when ``pagenum`` is None (a page with no counterpart).
        """
        if pagenum is None:
            return None
        return self.page_pdfs[pagenum]

    def compute_sections(self,
                         level=None,
                         bookmarks=None,
                         bookmark_predicate=None):
        """Split the document into sections and index them on this object.

        The section boundaries are, in order of precedence: the explicit
        ``bookmarks`` sequence, the entries of ``bookmark_data`` accepted by
        ``bookmark_predicate``, or the entries of ``bookmark_data`` whose
        level equals ``level`` (default 1).

        Each boundary bookmark starts a section that runs up to the next
        boundary; the last one runs to the final page of the document.  If
        the first boundary is not on page 1, a "0. Front Matter" section is
        put in front.  With no boundary bookmarks at all, no sections are
        produced.

        Sets ``comparable_sections``, ``section_by_title``,
        ``section_by_title_text``, ``section_by_number`` and ``page_map``,
        and returns the list of sections.
        """
        if level is None:
            level = 1
        if bookmarks is None:
            if bookmark_predicate is not None:
                bookmarks = [x for x in self.bookmark_data
                             if bookmark_predicate(x)]
            else:
                bookmarks = [x for x in self.bookmark_data
                             if x.level == level]
        # Materialize so the emptiness test, indexing and pairing below work
        # for any iterable.
        bookmarks = list(bookmarks)

        if not bookmarks:
            # Nothing selected (e.g. the predicate rejected every bookmark):
            # publish empty tables rather than failing on bookmarks[0].
            self._store_sections([], {})
            return []

        sections = []
        # Add a placeholder section taking up all front-matter pages.
        # NOTE(review): unlike the sections built below, this range always
        # includes the first bookmark's page in full; left unchanged.
        first_bookmark = bookmarks[0]
        if first_bookmark.page_number != 1:
            sections.append(Section(title="0. Front Matter",
                                    level=level,
                                    page_start=1,
                                    page_end=first_bookmark.page_number,
                                    pdf=self))

        # One section per consecutive pair of boundaries...
        for start, end in zip(bookmarks, bookmarks[1:]):
            sections.append(self._section_between(start, end))
        # ...plus the trailing section, which has no following boundary.
        sections.append(self._section_between(bookmarks[-1]))

        self._store_sections(sections, compute_section_page_map(sections))
        return sections

    def _section_between(self, start, end=None):
        """Build the Section starting at bookmark ``start``.

        It ends at bookmark ``end``, or on the document's last page when
        ``end`` is None.
        """
        if end is None:
            # Bookmark page numbers are 1-based, so the page count is also
            # the number of the last page.
            page_end = self.reader.getNumPages()
            end_top = None
        else:
            page_end = end.page_number
            end_top = end.top
            if not end_top:
                # If no "top", then assume the section ends on a page break,
                # so the following bookmark's page is not part of it.
                page_end -= 1

        section = Section(title=start.title,
                          level=start.level,
                          page_start=start.page_number,
                          page_end=page_end,
                          pdf=self)
        if start.top:
            section.page_start_top = start.top
        if end_top:
            section.page_end_bottom = end_top
        return section

    def _store_sections(self, sections, page_map):
        """Publish ``sections`` and the lookup tables derived from them."""
        self.comparable_sections = sections
        self.section_by_title = {sec.title: sec
                                 for sec in sections}
        self.section_by_title_text = {sec.title_text: sec
                                      for sec in sections
                                      if sec.title_text}
        self.section_by_number = {sec.section_number: sec
                                  for sec in sections
                                  if sec.section_number}
        self.page_map = page_map

    def find_corresponding_section(self, section):
        """Find our own section corresponding to the supplied section from another PDF.

        Tries, in order: identical full title, identical title text (the
        number differs), identical section number (the text differs).
        Returns None when none of these match.
        """
        own_section = self.section_by_title.get(section.title)
        if own_section:
            # Easy - full title matches
            return own_section

        own_section = self.section_by_title_text.get(section.title_text)
        if own_section:
            # Not as easy, we had a section renumber, possible issue here!
            return own_section

        own_section = self.section_by_number.get(section.section_number)
        if own_section:
            # Only the section number matched - WARNING! might be bad match!
            return own_section

        # Total failure
        return None


@attr.s
class MatchingSection:
    """A section of the original PDF paired with its match in the new PDF.

    ``orig_range`` and ``new_range`` are dicts that contain at least the
    keys ``fn``, ``page_start`` and ``page_end``.  ``changes`` stays None
    until a caller stores a diff result on it.
    """

    title = attr.ib()
    orig_range = attr.ib()
    new_range = attr.ib()
    changes = attr.ib(default=None)

    def __str__(self):
        """Return the title followed by both file/page ranges."""
        shown_keys = ('fn', 'page_start', 'page_end')
        orig_parts = [self.orig_range[key] for key in shown_keys]
        new_parts = [self.new_range[key] for key in shown_keys]
        return '{} ({}:{}-{}, {}:{}-{})'.format(
            self.title, *orig_parts, *new_parts)


def get_section_range_pairs(orig_section, new_pdf):
    """Pair ``orig_section`` with its counterpart in ``new_pdf``.

    Returns a MatchingSection holding both sections' diff options, or None
    (after printing a notice) when ``new_pdf`` has no corresponding section.
    """
    counterpart = new_pdf.find_corresponding_section(orig_section)
    if counterpart:
        return MatchingSection(
            title=orig_section.title,
            orig_range=orig_section.pdf_diff_options,
            new_range=counterpart.pdf_diff_options)

    print("Skipping section {} - no match in the other doc!".format(
        orig_section.title))
    return None


def get_section_page_pairs(orig_section, new_pdf):
    """Return (orig_page_num, new_page_num) pairs for each page in section.

    The shorter side is padded with None.  When ``new_pdf`` has no
    corresponding section, a notice is printed and an empty list is
    returned.
    """
    counterpart = new_pdf.find_corresponding_section(orig_section)
    if counterpart:
        return zip_longest(orig_section.page_numbers,
                           counterpart.page_numbers)

    print("Skipping section {} - no match in the other doc!".format(
        orig_section.title))
    return []


def get_page_pairs_by_section(orig_pdf, new_pdf):
    """Return an iterable of lists of (orig_page_num, new_page_num) pairs.

    One such list of pairs is produced, lazily, for each section in
    orig_pdf.comparable_sections.
    """
    def pairs_for(section):
        return list(get_section_page_pairs(section, new_pdf))

    return map(pairs_for, orig_pdf.comparable_sections)


def get_all_page_pairs(orig_pdf, new_pdf):
    """Get a single list of all page pairs.

    This accommodates inserted pages between sections.  The per-section
    pairs are concatenated, then duplicates and redundant "half pairs"
    (a page paired with None) are removed; first occurrences keep their
    order.
    """
    # Flatten the per-section lists into one list of pairs.
    raw_pairs = [pair
                 for section_pairs in get_page_pairs_by_section(orig_pdf,
                                                                new_pdf)
                 for pair in section_pairs]

    # A "full pair" has a page on both sides.  Once a page takes part in a
    # full pair there is no point in also reporting it as existing in only
    # one document.  Example: from (1, 1), (2, None), (2, 1), (3, 2) the
    # (2, None) entry is dropped, because original page 2 is already
    # compared against new page 1 by (2, 1).
    full_pairs = {(orig_page, new_page)
                  for orig_page, new_page in raw_pairs
                  if orig_page is not None and new_page is not None}
    covered_half_pairs = {(orig_page, None) for orig_page, _ in full_pairs}
    covered_half_pairs.update((None, new_page) for _, new_page in full_pairs)

    # Main filter pass: deduplicate and filter out excluded half-diffs.
    kept = []
    seen = set()
    for pair in raw_pairs:
        if pair in covered_half_pairs:
            # Don't unnecessarily include a "page only in document X" item.
            print("Dropping half-pair covered by other full pair", pair)
        elif pair in seen:
            # Don't let small sections result in big dupes.
            print("Dropping duplicated pair", pair)
        else:
            kept.append(pair)
            seen.add(pair)

    return kept


class SequenceGapFinder:
    """Track a stream of numbers and report the values skipped between them."""

    def __init__(self):
        # Last non-None number seen; None until the first one arrives.
        self.prev = None

    def process_and_get_gap(self, current):
        """Return the range of expected numbers skipped between the last call
        to this method and the current one.

        Return None if no numbers skipped.  A ``current`` of None is ignored
        entirely: it yields None and leaves the remembered number untouched.
        """
        if current is None:
            return None

        gap = None
        previous = self.prev
        if previous is not None and current > previous + 1:
            gap = range(previous + 1, current)
        self.prev = current
        return gap


def zip_longest_permitting_none(a, b):
    """Do zip_longest except treat None as a zero-element iterable.

    Elements of the iterable that is present are paired with None on the
    missing side; two None arguments give an empty tuple.
    """
    if a is not None and b is not None:
        return zip_longest(a, b)
    if a is not None:
        return ((elt, None) for elt in a)
    if b is not None:
        return ((None, elt) for elt in b)
    return ()


def fill_pair_gaps(pairs):
    """Add any missing pages to the list of pairs.

    ``pairs`` is a sequence of (orig_page, new_page) tuples, either side of
    which may be None; it is iterated more than once.  The non-None page
    numbers on each side must already be in ascending order (checked with
    assert).  Whenever a side jumps over page numbers, the skipped pages are
    inserted in front of the pair at which the jump was noticed.  Returns a
    new list.
    """
    orig_pages = [orig_page for orig_page, _ in pairs
                  if orig_page is not None]
    new_pages = [new_page for _, new_page in pairs
                 if new_page is not None]

    assert orig_pages == sorted(orig_pages)
    assert new_pages == sorted(new_pages)

    filled = []
    orig_tracker = SequenceGapFinder()
    new_tracker = SequenceGapFinder()
    for orig_page, new_page in pairs:
        missing_orig = orig_tracker.process_and_get_gap(orig_page)
        missing_new = new_tracker.process_and_get_gap(new_page)
        if not (missing_orig is None and missing_new is None):
            # Pair the skipped pages of both sides with each other.
            gap_pairs = list(
                zip_longest_permitting_none(missing_orig, missing_new))
            print("Found gap pairs", gap_pairs)
            filled += gap_pairs
        filled.append((orig_page, new_page))
    return filled


class GranularPdfDiff:
    """Drive a section-by-section comparison of two PDF files."""

    def __init__(self, orig_fn, new_fn):
        """Load the original and the new PDF."""
        self.orig_pdf = PdfSpec(orig_fn)
        self.new_pdf = PdfSpec(new_fn)

    def generate_matching_sections(self):
        """Return a generator of MatchingSection.

        At most one MatchingSection is returned for each section in orig_pdf.
        Sections without a counterpart in the new PDF are skipped, and so
        are sections for which no changes were computed (a message is
        printed for the latter).
        """
        for section in self.orig_pdf.comparable_sections:
            matching = get_section_range_pairs(section, self.new_pdf)
            if not matching:
                continue

            matching.orig_range["dom"] = self.orig_pdf.dom
            matching.new_range["dom"] = self.new_pdf.dom
            changes = pdf_diff.compute_changes(
                matching.orig_range, matching.new_range, bottom_margin=93)
            if not changes:
                print("No changes in", matching)
                continue

            matching.changes = changes
            yield matching

    def compute_sections(self, **kwargs):
        """Compute sections on both PDFs, forwarding the same arguments."""
        for pdf in (self.orig_pdf, self.new_pdf):
            pdf.compute_sections(**kwargs)

    def compute_page_pairs(self):
        """Return the page pairs of both PDFs with gaps filled in."""
        return fill_pair_gaps(get_all_page_pairs(self.orig_pdf, self.new_pdf))

    def generate_page_diff(self, orig_page_num, new_page_num):
        """Compute and pretty-print the changes between two single pages.

        Nothing is computed unless both pages resolve to a truthy value.
        Unfinished: no value is returned.
        """
        orig_page = self.orig_pdf.pdf_for_page(orig_page_num)
        new_page = self.new_pdf.pdf_for_page(new_page_num)
        if not (orig_page and new_page):
            return
        pprint(pdf_diff.compute_changes(orig_page, new_page))
        # TODO

    def generate_diff_from_pairs(self, pairs):
        """Run generate_page_diff over every pair in ``pairs``.

        Unfinished: the truthy results are collected but not returned.
        """
        page_diffs = (self.generate_page_diff(orig_page_num, new_page_num)
                      for orig_page_num, new_page_num in pairs)
        out_docs = [page_diff for page_diff in page_diffs if page_diff]
        # TODO


if __name__ == "__main__":
    # This script is expected to sit one directory below "specification".
    SPECDIR = Path(__file__).resolve().parent.parent
    assert SPECDIR.name == "specification"
    ORIG = SPECDIR / 'compare-base' / 'openxr.pdf'
    NEW = SPECDIR / 'out' / '1.0' / 'openxr.pdf'
    DIFFDIR = SPECDIR / 'diffs'
    DIFFDIR.mkdir(exist_ok=True)

    def is_separate_diff_section(bookmark):
        """Tell whether ``bookmark`` starts a separately diffed section."""
        if bookmark.level == 1:
            # All chapters, except the extension chapter
            return "List of Extensions" not in bookmark.title
        if bookmark.level == 2:
            # All the individual sub-sections in the extension chapter
            # (one for each extension)
            return "XR_KHR" in bookmark.title
        return False

    diff = GranularPdfDiff(ORIG, NEW)
    diff.compute_sections(bookmark_predicate=is_separate_diff_section)

    # Write one PNG per section that has changes, numbered from 1.
    for part, matching in enumerate(diff.generate_matching_sections(), 1):
        img = pdf_diff.render_changes(
            matching.changes, ('strike', 'underline'), 900)
        full_path = DIFFDIR / "Diff part {:02d} - {}.diff.png".format(
            part, matching.title)

        print('Writing', full_path.relative_to(SPECDIR))
        with open(str(full_path), 'wb') as fp:
            img.save(fp, 'PNG')