File: xlsx.py

package info (click to toggle)
python-petl 1.7.17-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,224 kB
  • sloc: python: 22,617; makefile: 109; xml: 9
file content (205 lines) | stat: -rw-r--r-- 6,881 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

import itertools

from petl.compat import PY3, text_type
from petl.util.base import Table, data
from petl.io.sources import read_source_from_arg, write_source_from_arg


def fromxlsx(filename, sheet=None, range_string=None, min_row=None,
             min_col=None, max_row=None, max_col=None, read_only=False,
             **kwargs):
    """
    Extract a table from a sheet in an Excel .xlsx file.

    N.B., the sheet name is case sensitive.

    The `sheet` argument can be omitted, in which case the first sheet in
    the workbook is used by default.

    The `range_string` argument can be used to provide a range string
    specifying a range of cells to extract.

    The `min_row`, `min_col`, `max_row` and `max_col` arguments can be
    used to limit the range of cells to extract. They will be ignored
    if `range_string` is provided.

    The `read_only` argument determines how openpyxl returns the loaded 
    workbook. Default is `False` as it prevents some LibreOffice files
    from getting truncated at 65536 rows. `True` should be faster if the
    file use is read-only and the files are made with Microsoft Excel.

    Any other keyword arguments are passed through to
    :func:`openpyxl.load_workbook()`.

    """

    return XLSXView(filename, sheet=sheet, range_string=range_string,
                    min_row=min_row, min_col=min_col, max_row=max_row,
                    max_col=max_col, read_only=read_only, **kwargs)


class XLSXView(Table):
    
    def __init__(self, filename, sheet=None, range_string=None,
                 min_row=None, min_col=None, max_row=None, max_col=None, 
                 read_only=False, **kwargs):
        self.filename = filename
        self.sheet = sheet
        self.range_string = range_string
        self.min_row = min_row
        self.min_col = min_col
        self.max_row = max_row
        self.max_col = max_col
        self.read_only = read_only
        self.kwargs = kwargs

    def __iter__(self):
        import openpyxl
        source = read_source_from_arg(self.filename)
        with source.open('rb') as source2:
            wb = openpyxl.load_workbook(filename=source2,
                                        read_only=self.read_only,
                                        **self.kwargs)
            if self.sheet is None:
                ws = wb[wb.sheetnames[0]]
            elif isinstance(self.sheet, int):
                ws = wb[wb.sheetnames[self.sheet]]
            else:
                ws = wb[str(self.sheet)]
            if self.range_string is not None:
                rows = ws[self.range_string]
            else:
                rows = ws.iter_rows(min_row=self.min_row,
                                    min_col=self.min_col,
                                    max_row=self.max_row,
                                    max_col=self.max_col)
            for row in rows:
                yield tuple(cell.value for cell in row)
            try:
                wb._archive.close()
            except AttributeError:
                # just here in case openpyxl stops exposing an _archive property.
                pass


def toxlsx(tbl, filename, sheet=None, write_header=True, mode="replace"):
    """
    Write a table to a new Excel .xlsx file.

    N.B., the sheet name is case sensitive.

    The `mode` argument controls how the file and sheet are treated:

      - `replace`: This is the default. It either replaces or adds a
        named sheet, or if no sheet name is provided, all sheets
        (overwrites the entire file).

      - `overwrite`: Always overwrites the file. This produces a file
        with a single sheet.

      - `add`: Adds a new sheet. Raises `ValueError` if a named sheet
        already exists.

    The `sheet` argument can be omitted in all cases. The new sheet
    will then get a default name.
    If the file does not exist, it will be created, unless `replace`
    mode is used with a named sheet. In the latter case, the file
    must exist and be a valid .xlsx file.
    """
    wb = _load_or_create_workbook(filename, mode, sheet)
    ws = _insert_sheet_on_workbook(mode, sheet, wb)
    if write_header:
        it = iter(tbl)
        try:
            hdr = next(it)
            flds = list(map(text_type, hdr))
            rows = itertools.chain([flds], it)
        except StopIteration:
            rows = it
    else:
        rows = data(tbl)
    for row in rows:
        ws.append(row)
    target = write_source_from_arg(filename)
    with target.open('wb') as target2:
        wb.save(target2)


def _load_or_create_workbook(filename, mode, sheet):
    if PY3:
        FileNotFound = FileNotFoundError
    else:
        FileNotFound = IOError

    import openpyxl
    wb = None
    if not (mode == "overwrite" or (mode == "replace" and sheet is None)):
        try:
            source = read_source_from_arg(filename)
            with source.open('rb') as source2:
                wb = openpyxl.load_workbook(filename=source2, read_only=False)
        except FileNotFound:
            wb = None
    if wb is None:
        wb = openpyxl.Workbook(write_only=True)
    return wb


def _insert_sheet_on_workbook(mode, sheet, wb):
    if mode == "replace":
        try:
            ws = wb[str(sheet)]
            ws.delete_rows(1, ws.max_row)
        except KeyError:
            ws = wb.create_sheet(title=sheet)
    elif mode == "add":
        ws = wb.create_sheet(title=sheet)
        # it creates a sheet named "foo1" if "foo" exists.
        if sheet is not None and ws.title != sheet:
            raise ValueError("Sheet %s already exists in file" % sheet)
    elif mode == "overwrite":
        ws = wb.create_sheet(title=sheet)
    else:
        raise ValueError("Unknown mode '%s'" % mode)
    return ws


Table.toxlsx = toxlsx


def appendxlsx(tbl, filename, sheet=None, write_header=False):
    """
    Appends rows to an existing Excel .xlsx file.
    """

    import openpyxl
    source = read_source_from_arg(filename)
    with source.open('rb') as source2:
        wb = openpyxl.load_workbook(filename=source2, read_only=False)
        if sheet is None:
            ws = wb[wb.sheetnames[0]]
        elif isinstance(sheet, int):
            ws = wb[wb.sheetnames[sheet]]
        else:
            ws = wb[str(sheet)]
        if write_header:
            it = iter(tbl)
            try:
                hdr = next(it)
                flds = list(map(text_type, hdr))
                rows = itertools.chain([flds], it)
            except StopIteration:
                rows = it
        else:
            rows = data(tbl)
        for row in rows:
            ws.append(row)
        target = write_source_from_arg(filename)
        with target.open('wb') as target2:
            wb.save(target2)


Table.appendxlsx = appendxlsx