File: utils.py

package info (click to toggle)
python-ebooklib 0.20-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 708 kB
  • sloc: python: 2,541; makefile: 132; sh: 53
file content (133 lines) | stat: -rw-r--r-- 3,586 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.

import io
import mimetypes
import os

from lxml import etree

# Set to True by guess_type() once it has run mimetypes.init() and registered
# the ".xhtml" type, so that this setup is only performed on the first call.
mimetype_initialised = False


def debug(obj):
    """Pretty-print *obj* with a 4-space indent (development helper).

    The output goes to the default stream of ``pprint.PrettyPrinter``.
    """
    from pprint import PrettyPrinter

    PrettyPrinter(indent=4).pprint(obj)


def parse_string(s):
    """Parse XML given as text or bytes and return the tree from ``etree.parse``.

    The parser is created with ``recover=True`` and ``resolve_entities=False``.

    :param s: The XML document. Text is encoded to UTF-8 before parsing;
        anything that cannot be encoded that way (for example ``bytes``,
        which has no ``encode`` method) is handed to the parser unchanged.
    :returns: The tree object produced by ``etree.parse``.
    """
    parser = etree.XMLParser(recover=True, resolve_entities=False)

    # Only the text-to-bytes conversion is guarded. Parsing happens exactly
    # once, so a genuine parser error propagates as-is instead of being
    # replaced by an unrelated failure from a second parse attempt.
    try:
        data = s.encode("utf-8")
    except (AttributeError, UnicodeError):
        data = s

    return etree.parse(io.BytesIO(data), parser=parser)


def parse_html_string(s):
    """Parse *s* as an HTML document using a UTF-8 ``lxml.html`` parser.

    :param s: The HTML source passed to ``lxml.html.document_fromstring``.
    :returns: The value returned by ``document_fromstring``.
    """
    from lxml import html as lxml_html

    return lxml_html.document_fromstring(
        s,
        parser=lxml_html.HTMLParser(encoding="utf-8"),
    )


def guess_type(extenstion):
    """Return ``mimetypes.guess_type(extenstion)`` after one-time setup.

    On the first call the ``mimetypes`` module is initialised and
    ``.xhtml`` is registered as ``application/xhtml+xml``; the module-level
    ``mimetype_initialised`` flag records that this has been done.

    :param extenstion: The value forwarded to ``mimetypes.guess_type``.
    :returns: Whatever ``mimetypes.guess_type`` returns for that value.
    """
    global mimetype_initialised

    setup_pending = not mimetype_initialised
    if setup_pending:
        # Initialise first, then register the extra type on top of it.
        mimetypes.init()
        mimetypes.add_type("application/xhtml+xml", ".xhtml")
        mimetype_initialised = True

    guessed = mimetypes.guess_type(extenstion)
    return guessed


def create_pagebreak(pageref, label=None, html=True):
    """Build a ``<span>`` page-break marker.

    The span carries an ``epub:type="pagebreak"`` attribute (in the EPUB
    namespace) and uses *pageref*, formatted as a string, for both its
    ``title`` and its ``id``.

    :param pageref: Page reference used for the ``title`` and ``id`` values.
    :param label: Optional text content of the span; ignored when falsy.
    :param html: When true, return the serialised markup as a unicode
        string; otherwise return the lxml element itself.
    :returns: A string or an lxml element, depending on *html*.
    """
    from ebooklib.epub import NAMESPACES

    epub_ns = NAMESPACES["EPUB"]
    ref_text = "{pageref}".format(pageref=pageref)  # noqa: UP032

    # Keys are inserted in the order: epub:type, title, id.
    attributes = {}
    attributes["{%s}type" % epub_ns] = "pagebreak"  # noqa
    attributes["title"] = ref_text
    attributes["id"] = ref_text

    span = etree.Element("span", attributes, nsmap={"epub": epub_ns})
    if label:
        span.text = label

    if not html:
        return span
    return etree.tostring(span, encoding="unicode")


def get_headers(elem):
    """Return the text of the first non-empty heading directly under *elem*.

    Heading levels ``h1`` through ``h6`` are tried in that order. For each
    level only the first match of ``./hN`` is inspected; if its stripped
    text is empty, the search moves on to the next level.

    :param elem: An element offering ``xpath()``; the matched nodes must
        offer ``text_content()``.
    :returns: The stripped heading text, or ``None`` when nothing qualifies.
    """
    for level in range(1, 7):
        matches = elem.xpath("./h{n}".format(n=level))  # noqa: UP032
        if not matches:
            continue

        heading = matches[0].text_content().strip()
        if heading:
            return heading

    return None


def get_pages(item):
    """Collect page markers from the body content of *item*.

    The body is parsed with ``parse_html_string`` and every element that
    has both an ``epub:type`` attribute and an ``id`` yields one entry.
    The label of an entry is, in order of preference: the element's own
    stripped text, its ``aria-label`` attribute, the result of
    ``get_headers`` for the element, and finally the ``id`` itself.

    :param item: Object providing ``get_body_content()`` and ``get_name()``.
    :returns: A list of ``(item name, element id, label)`` tuples.
    """
    body = parse_html_string(item.get_body_content())
    pages = []

    for node in body.iter():
        if "epub:type" not in node.attrib:
            continue

        node_id = node.get("id")
        if node_id is None:
            continue

        # Prefer the element's own text when it is not blank.
        label = None
        if node.text is not None:
            stripped = node.text.strip()
            if stripped != "":
                label = stripped

        if label is None:
            label = node.get("aria-label")

        if label is None:
            label = get_headers(node)

        # An empty or missing label falls back to the id.
        pages.append((item.get_name(), node_id, label or node_id))

    return pages


def get_pages_for_items(items):
    """Return the page entries of all *items* as one flat list.

    ``get_pages`` is called once per item, in iteration order, and the
    resulting lists are concatenated.

    :param items: Iterable of objects accepted by ``get_pages``.
    :returns: A single list with every entry from every item.
    """
    collected = []
    for doc in items:
        collected.extend(get_pages(doc))
    return collected


class Directory(object):  # noqa: UP004
    """Read files by name relative to a directory on disk.

    Provides ``read()`` and a no-op ``close()``.
    """

    def __init__(self, directory_path):
        # Base path that every name given to read() is joined onto.
        self.directory_path = directory_path

    def read(self, subname):
        """Return the raw bytes of *subname* inside the directory."""
        full_path = os.path.join(self.directory_path, subname)
        with open(full_path, "rb") as handle:
            payload = handle.read()
        return payload

    def close(self):
        """Do nothing; no resources are held between calls."""
        return None