1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
|
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import itertools
from petl.compat import PY3, text_type
from petl.util.base import Table, data
from petl.io.sources import read_source_from_arg, write_source_from_arg
def fromxlsx(filename, sheet=None, range_string=None, min_row=None,
min_col=None, max_row=None, max_col=None, read_only=False,
**kwargs):
"""
Extract a table from a sheet in an Excel .xlsx file.
N.B., the sheet name is case sensitive.
The `sheet` argument can be omitted, in which case the first sheet in
the workbook is used by default.
The `range_string` argument can be used to provide a range string
specifying a range of cells to extract.
The `min_row`, `min_col`, `max_row` and `max_col` arguments can be
used to limit the range of cells to extract. They will be ignored
if `range_string` is provided.
The `read_only` argument determines how openpyxl returns the loaded
workbook. Default is `False` as it prevents some LibreOffice files
from getting truncated at 65536 rows. `True` should be faster if the
file use is read-only and the files are made with Microsoft Excel.
Any other keyword arguments are passed through to
:func:`openpyxl.load_workbook()`.
"""
return XLSXView(filename, sheet=sheet, range_string=range_string,
min_row=min_row, min_col=min_col, max_row=max_row,
max_col=max_col, read_only=read_only, **kwargs)
class XLSXView(Table):
def __init__(self, filename, sheet=None, range_string=None,
min_row=None, min_col=None, max_row=None, max_col=None,
read_only=False, **kwargs):
self.filename = filename
self.sheet = sheet
self.range_string = range_string
self.min_row = min_row
self.min_col = min_col
self.max_row = max_row
self.max_col = max_col
self.read_only = read_only
self.kwargs = kwargs
def __iter__(self):
import openpyxl
source = read_source_from_arg(self.filename)
with source.open('rb') as source2:
wb = openpyxl.load_workbook(filename=source2,
read_only=self.read_only,
**self.kwargs)
if self.sheet is None:
ws = wb[wb.sheetnames[0]]
elif isinstance(self.sheet, int):
ws = wb[wb.sheetnames[self.sheet]]
else:
ws = wb[str(self.sheet)]
if self.range_string is not None:
rows = ws[self.range_string]
else:
rows = ws.iter_rows(min_row=self.min_row,
min_col=self.min_col,
max_row=self.max_row,
max_col=self.max_col)
for row in rows:
yield tuple(cell.value for cell in row)
try:
wb._archive.close()
except AttributeError:
# just here in case openpyxl stops exposing an _archive property.
pass
def toxlsx(tbl, filename, sheet=None, write_header=True, mode="replace"):
"""
Write a table to a new Excel .xlsx file.
N.B., the sheet name is case sensitive.
The `mode` argument controls how the file and sheet are treated:
- `replace`: This is the default. It either replaces or adds a
named sheet, or if no sheet name is provided, all sheets
(overwrites the entire file).
- `overwrite`: Always overwrites the file. This produces a file
with a single sheet.
- `add`: Adds a new sheet. Raises `ValueError` if a named sheet
already exists.
The `sheet` argument can be omitted in all cases. The new sheet
will then get a default name.
If the file does not exist, it will be created, unless `replace`
mode is used with a named sheet. In the latter case, the file
must exist and be a valid .xlsx file.
"""
wb = _load_or_create_workbook(filename, mode, sheet)
ws = _insert_sheet_on_workbook(mode, sheet, wb)
if write_header:
it = iter(tbl)
try:
hdr = next(it)
flds = list(map(text_type, hdr))
rows = itertools.chain([flds], it)
except StopIteration:
rows = it
else:
rows = data(tbl)
for row in rows:
ws.append(row)
target = write_source_from_arg(filename)
with target.open('wb') as target2:
wb.save(target2)
def _load_or_create_workbook(filename, mode, sheet):
if PY3:
FileNotFound = FileNotFoundError
else:
FileNotFound = IOError
import openpyxl
wb = None
if not (mode == "overwrite" or (mode == "replace" and sheet is None)):
try:
source = read_source_from_arg(filename)
with source.open('rb') as source2:
wb = openpyxl.load_workbook(filename=source2, read_only=False)
except FileNotFound:
wb = None
if wb is None:
wb = openpyxl.Workbook(write_only=True)
return wb
def _insert_sheet_on_workbook(mode, sheet, wb):
if mode == "replace":
try:
ws = wb[str(sheet)]
ws.delete_rows(1, ws.max_row)
except KeyError:
ws = wb.create_sheet(title=sheet)
elif mode == "add":
ws = wb.create_sheet(title=sheet)
# it creates a sheet named "foo1" if "foo" exists.
if sheet is not None and ws.title != sheet:
raise ValueError("Sheet %s already exists in file" % sheet)
elif mode == "overwrite":
ws = wb.create_sheet(title=sheet)
else:
raise ValueError("Unknown mode '%s'" % mode)
return ws
Table.toxlsx = toxlsx
def appendxlsx(tbl, filename, sheet=None, write_header=False):
"""
Appends rows to an existing Excel .xlsx file.
"""
import openpyxl
source = read_source_from_arg(filename)
with source.open('rb') as source2:
wb = openpyxl.load_workbook(filename=source2, read_only=False)
if sheet is None:
ws = wb[wb.sheetnames[0]]
elif isinstance(sheet, int):
ws = wb[wb.sheetnames[sheet]]
else:
ws = wb[str(sheet)]
if write_header:
it = iter(tbl)
try:
hdr = next(it)
flds = list(map(text_type, hdr))
rows = itertools.chain([flds], it)
except StopIteration:
rows = it
else:
rows = data(tbl)
for row in rows:
ws.append(row)
target = write_source_from_arg(filename)
with target.open('wb') as target2:
wb.save(target2)
Table.appendxlsx = appendxlsx
|