File: __init__.py | Debian Sources

File: __init__.py

package info (click to toggle)
python-xlrd 2.0.1-2
links: PTS, VCS
area: main
in suites: sid, trixie
size: 1,832 kB
sloc: python: 7,531; makefile: 118; sh: 7
file content (213 lines) | stat: -rw-r--r-- 7,320 bytes
# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
import os
import pprint
import sys
import zipfile

from . import timemachine
from .biffh import (
    XL_CELL_BLANK, XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_EMPTY, XL_CELL_ERROR,
    XL_CELL_NUMBER, XL_CELL_TEXT, XLRDError, biff_text_from_num,
    error_text_from_code,
)
from .book import Book, colname, open_workbook_xls
from .compdoc import SIGNATURE as XLS_SIGNATURE
from .formula import *  # is constrained by __all__
from .info import __VERSION__, __version__
from .sheet import empty_cell
from .xldate import XLDateError, xldate_as_datetime, xldate_as_tuple


#: descriptions of the file types :mod:`xlrd` can :func:`inspect <inspect_format>`.
# The keys are exactly the values inspect_format() can return (including
# ``None`` for "could not be determined"); the values are human-readable
# labels, used for example in the error raised by open_workbook().
FILE_FORMAT_DESCRIPTIONS = {
    'xls': 'Excel xls',
    'xlsb': 'Excel 2007 xlsb file',
    'xlsx': 'Excel xlsx file',
    'ods': 'Openoffice.org ODS file',
    'zip': 'Unknown ZIP file',
    None: 'Unknown file type',
}

# Leading bytes that inspect_format() looks for to recognise a ZIP container.
ZIP_SIGNATURE = b"PK\x03\x04"

# Number of leading bytes inspect_format() examines: long enough to hold the
# longer of the two signatures it compares against.
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))


def inspect_format(path=None, content=None):
    """
    Inspect the content at the supplied path or the :class:`bytes` content provided
    and return the file's type as a :class:`str`, or ``None`` if it cannot
    be determined.

    Only the first ``PEEK_SIZE`` bytes are examined to tell an xls file from
    a ZIP container. For a ZIP container the member names are then listed to
    distinguish xlsx, xlsb and ods from an arbitrary ZIP file.

    :param path:
      A :class:`string <str>` path containing the content to inspect.
      ``~`` will be expanded. Only used when ``content`` is not supplied.

    :param content:
      The :class:`bytes` content to inspect. Empty content is treated the
      same as no content, in which case ``path`` is read instead.

    :returns:
       A :class:`str`, or ``None`` if the format cannot be determined.
       The return value can always be looked up in :data:`FILE_FORMAT_DESCRIPTIONS`
       to return a human-readable description of the format found.

    :raises TypeError:
       If neither ``content`` nor ``path`` is supplied.

    Errors raised while opening ``path`` or while reading the ZIP container
    are not caught here and propagate to the caller.
    """
    if content:
        peek = content[:PEEK_SIZE]
    else:
        if path is None:
            # Fail with a clear message instead of the opaque error that
            # os.path.expanduser(None) would produce.
            raise TypeError('either path or content must be supplied')
        path = os.path.expanduser(path)
        with open(path, "rb") as f:
            peek = f.read(PEEK_SIZE)

    if peek.startswith(XLS_SIGNATURE):
        return 'xls'

    if not peek.startswith(ZIP_SIGNATURE):
        return None

    zip_source = timemachine.BYTES_IO(content) if content else path
    # Use the archive as a context manager so the underlying file handle is
    # released as soon as the member names have been collected.
    with zipfile.ZipFile(zip_source) as zf:
        # Workaround for some third party files that use backslashes as
        # separators or unexpected letter case in member names: compare
        # against names normalised to forward slashes and lower case.
        component_names = {name.replace('\\', '/').lower()
                           for name in zf.namelist()}

    if 'xl/workbook.xml' in component_names:
        return 'xlsx'
    if 'xl/workbook.bin' in component_names:
        return 'xlsb'
    if 'content.xml' in component_names:
        return 'ods'
    return 'zip'


def open_workbook(
    filename=None,
    logfile=sys.stdout,
    verbosity=0,
    use_mmap=True,
    file_contents=None,
    encoding_override=None,
    formatting_info=False,
    on_demand=False,
    ragged_rows=False,
    ignore_workbook_corruption=False
):
    """
    Open a spreadsheet file so that its data can be extracted.

    The format of the input is first checked with :func:`inspect_format`.
    Anything recognised as something other than xls is rejected; xls files
    and files of unrecognised type are handed on to the xls loader.

    :param filename: Path of the spreadsheet file to open.

    :param logfile: An open file that receives messages and diagnostics.

    :param verbosity: Higher values increase the amount of trace material
                      written to the logfile.

    :param use_mmap:

      Whether the mmap module is used is decided heuristically; pass this
      argument to override that decision.

      Current heuristic: mmap is used if it exists.

    :param file_contents:

      A string, an :class:`mmap.mmap` object, or another object that behaves
      like one. When ``file_contents`` is given, ``filename`` is not used,
      except (possibly) in messages.

    :param encoding_override:

      Compensates for missing or wrong codepage information in
      older-version files. See :doc:`unicode`.

    :param formatting_info:

      Defaults to ``False``, which saves memory.
      With this setting "Blank" cells -- cells that carry their own
      formatting information but no data -- are treated as empty, because
      the file's ``BLANK`` and ``MULBLANK`` records are ignored.
      Any bottom or right "margin" of rows of empty or blank cells is
      thereby cut off.
      Only :meth:`~xlrd.sheet.Sheet.cell_value` and
      :meth:`~xlrd.sheet.Sheet.cell_type` are available.

      With ``True``, formatting information is read from the spreadsheet
      file. All cells are then provided, including empty and blank ones,
      and formatting information is available for every cell.

    :param on_demand:

      Controls whether all sheets are loaded up front or only when the
      caller asks for them. See :doc:`on_demand`.

    :param ragged_rows:

      With the default of ``False`` every row is padded with empty cells so
      that all rows have the size given by
      :attr:`~xlrd.sheet.Sheet.ncols`.

      With ``True`` rows carry no empty cells at their ends, which can save
      a substantial amount of memory when row sizes vary widely. See also
      the :meth:`~xlrd.sheet.Sheet.row_len` method.

    :param ignore_workbook_corruption:

      Allows corrupted workbooks to be read.
      With ``False`` you may face CompDocError: Workbook corruption.
      With ``True`` that exception will be ignored.

    :returns: An instance of the :class:`~xlrd.book.Book` class.

    :raises XLRDError:
      If the input is recognised as a format other than xls (for example
      xlsx, xlsb, ods or some other ZIP file).
    """
    detected = inspect_format(filename, file_contents)

    # An undetermined format must be let through: some ancient files that
    # xlrd can parse do not begin with the expected signature.
    acceptable = not detected or detected == 'xls'
    if not acceptable:
        raise XLRDError(FILE_FORMAT_DESCRIPTIONS[detected]+'; not supported')

    return open_workbook_xls(
        filename=filename,
        file_contents=file_contents,
        logfile=logfile,
        verbosity=verbosity,
        use_mmap=use_mmap,
        encoding_override=encoding_override,
        formatting_info=formatting_info,
        on_demand=on_demand,
        ragged_rows=ragged_rows,
        ignore_workbook_corruption=ignore_workbook_corruption,
    )


def dump(filename, outfile=sys.stdout, unnumbered=False):
    """
    Debugging aid: dump the BIFF records of an XLS file in char & hex.

    :param filename: Path of the file whose records are dumped.
    :param outfile: An open file that receives the dump; it is also passed
                    to the loader as its logfile.
    :param unnumbered: When true, offsets are left out (for meaningful diffs).
    """
    from .biffh import biff_dump

    book = Book()
    book.biff2_8_load(filename=filename, logfile=outfile)
    # Dump the stream the loader located: its memory, base and length.
    mem, base, stream_len = book.mem, book.base, book.stream_len
    biff_dump(mem, base, stream_len, 0, outfile, unnumbered)


def count_records(filename, outfile=sys.stdout):
    """
    Debugging and analysis aid: summarise the BIFF records of a file,
    ie: produce a sorted file of ``(record_name, count)``.

    :param filename: Path of the file to summarise.
    :param outfile: An open file that receives the summary; it is also
                    passed to the loader as its logfile.
    """
    from .biffh import biff_count_records

    book = Book()
    book.biff2_8_load(filename=filename, logfile=outfile)
    # Count over the stream the loader located: its memory, base and length.
    mem, base, stream_len = book.mem, book.base, book.stream_len
    biff_count_records(mem, base, stream_len, outfile)