File: djvu-crop-text

package info (click to toggle)
python-djvulibre 0.9.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 648 kB
  • sloc: python: 2,437; makefile: 38; sh: 25
file content (114 lines) | stat: -rwxr-xr-x 3,790 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python

# Copyright © 2008-2018 Jakub Wilk <jwilk@jwilk.net>
# Copyright © 2022-2024 FriedrichFroebel
#
# This file is part of djvulibre-python.
#
# djvulibre-python is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 as published by
# the Free Software Foundation.
#
# djvulibre-python is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.

import argparse
import os
import sys

import djvu.const
import djvu.decode
import djvu.sexpr


# Fallback text layer: a page-level zone with an all-zero bounding box and an
# empty string. Context.process_page() substitutes it when cropping leaves no
# text for a page.
EMPTY_TEXT_SEXPR = djvu.sexpr.Expression([djvu.const.TEXT_ZONE_PAGE, 0, 0, 0, 0, ''])


class ArgumentParser(argparse.ArgumentParser):
    """Command-line parser for the text-cropping tool.

    Accepts one positional DjVu file path and an optional ``-p/--pages``
    specification: a comma-separated list of 1-based page numbers and
    inclusive ``FIRST-LAST`` ranges, e.g. ``1-3,5``.
    """

    def __init__(self):
        super().__init__()
        self.add_argument('-p', '--pages', dest='pages', action='store', help='pages to process')
        self.add_argument('path', metavar='DJVU-FILE', action='store', help='DjVu file to process')

    def parse_args(self, args=None, namespace=None):
        """Parse the command line and expand the page specification.

        ``args`` and ``namespace`` are forwarded to
        ``argparse.ArgumentParser.parse_args``; with the defaults the
        arguments are taken from ``sys.argv`` as before.

        Returns the parsed namespace. ``options.pages`` is ``None`` when
        ``-p`` was not given, otherwise a list of ints in the order written.
        A malformed specification is reported through ``self.error()``,
        which prints a usage message and exits.
        """
        options = super().parse_args(args, namespace)
        if options.pages is None:
            return options
        try:
            options.pages = self._expand_pages(options.pages)
        except (TypeError, ValueError):
            self.error('Unable to parse page numbers')
        return options

    @staticmethod
    def _expand_pages(spec):
        """Turn a specification such as ``'1-3,5'`` into ``[1, 2, 3, 5]``.

        Raises ValueError for non-numeric items and for page numbers below 1.
        """
        pages = []
        for rng in spec.split(','):
            if '-' in rng:
                # Split the current item, not the whole specification;
                # otherwise '1-3,5' would try to convert '3,5' to an int.
                first, last = map(int, rng.split('-', 1))
                if first < 1:
                    raise ValueError('page numbers start at 1')
                pages.extend(range(first, last + 1))
            else:
                number = int(rng)
                # Page 0 would later be looked up as index -1, i.e. the
                # last page, so reject it here.
                if number < 1:
                    raise ValueError('page numbers start at 1')
                pages.append(number)
        return pages


def crop_text(sexpr, width, height):
    """Clip a text-zone S-expression to a ``width`` x ``height`` area.

    A list expression with at least five items is treated as a zone: item 0
    is its type, items 1-4 are the coordinates ``x0, y0, x1, y1``, and the
    remaining items are children, which are cropped recursively. Any other
    expression is returned unchanged.

    Returns ``None`` when the zone fails the bounds test below or when none
    of its children survive cropping; otherwise a new expression with the
    coordinates clamped to ``[0, width]`` and ``[0, height]``.
    """
    is_zone = isinstance(sexpr, djvu.sexpr.ListExpression) and len(sexpr) >= 5
    if not is_zone:
        return sexpr

    zone_type = sexpr[0]
    xmin, ymin, xmax, ymax = [sexpr[idx].value for idx in range(1, 5)]
    if xmax < 0 or ymax < 0 or xmin >= width or ymin >= height:
        return None

    xmin = max(0, xmin)
    ymin = max(0, ymin)
    xmax = min(xmax, width)
    ymax = min(ymax, height)

    kept = []
    for child in sexpr[5:]:
        cropped = crop_text(child, width, height)
        if cropped is not None:
            kept.append(cropped)
    if not kept:
        # A zone without any remaining content is dropped entirely.
        return None
    return djvu.sexpr.Expression([zone_type, xmin, ymin, xmax, ymax] + kept)


class Context(djvu.decode.Context):
    """Decoding context that prints cropped text for a document's pages."""

    def handle_message(self, message):
        """Abort the process on the first decoder error message.

        Messages that are not ``djvu.decode.ErrorMessage`` are ignored.
        """
        if isinstance(message, djvu.decode.ErrorMessage):
            # os._exit() skips the normal interpreter shutdown, including
            # flushing of stdio buffers, so flush stderr explicitly to make
            # sure the error text is not lost when stderr is buffered.
            print(message, file=sys.stderr, flush=True)
            # Exceptions in handle_message() are ignored, so sys.exit()
            # wouldn't work here.
            os._exit(1)

    def process_page(self, page):
        """Return the cropped text expression for ``page``.

        Logs the 1-based page number to stderr, loads the page info, and
        crops the page text to the page's width and height. Falls back to
        ``EMPTY_TEXT_SEXPR`` when the cropped result is falsy.
        """
        print('- Page #{0}'.format(page.n + 1), file=sys.stderr)
        page.get_info()
        text = crop_text(page.text.sexpr, page.width, page.height)
        if not text:
            text = EMPTY_TEXT_SEXPR
        return text

    def process(self, path, pages=None):
        """Write text-replacement commands for ``path`` to standard output.

        ``pages`` is an iterable of 1-based page numbers, or ``None`` to
        process every page of the document. The output starts with
        ``remove-txt`` and then, for each page, contains ``select N``,
        ``set-txt``, the cropped text expression and a terminating ``.``
        line (presumably a script for djvused; confirm against its manual).
        Progress is reported on stderr.
        """
        print('Processing {path!r}:'.format(path=path), file=sys.stderr)
        document = self.new_document(djvu.decode.FileURI(path))
        document.decoding_job.wait()
        sed_file = sys.stdout
        if pages is None:
            pages = iter(document.pages)
        else:
            # Page numbers given by the user are 1-based.
            pages = (document.pages[i - 1] for i in pages)
        sed_file.write('remove-txt\n')
        for page in pages:
            sed_file.write('select {0}\n'.format(page.n + 1))
            sed_file.write('set-txt\n')
            self.process_page(page).print_into(sed_file)
            sed_file.write('\n.\n\n')


def main():
    """Parse the command line and process the requested DjVu file."""
    options = ArgumentParser().parse_args()
    Context().process(options.path, options.pages)


# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()