File: format.py

package info (click to toggle)
python-numpy 1%3A1.1.0-3%2Blenny1
links: PTS, VCS
area: main
in suites: lenny
size: 9,548 kB
ctags: 14,321
sloc: ansic: 73,119; python: 59,655; cpp: 822; makefile: 179; fortran: 121; f90: 42
file content (412 lines) | stat: -rw-r--r-- 14,556 bytes
""" Define a simple format for saving numpy arrays to disk with the full
information about them.

WARNING: Due to limitations in the interpretation of structured dtypes, dtypes
with fields with empty names will have the names replaced by 'f0', 'f1', etc.
Such arrays will not round-trip through the format entirely accurately. The data
is intact; only the field names will differ. We are working on a fix for this.
This fix will not require a change in the file format. The arrays with such
structures can still be saved and restored, and the correct dtype may be
restored by using the `loadedarray.view(correct_dtype)` method.

Format Version 1.0
------------------

The first 6 bytes are a magic string: exactly "\\x93NUMPY".

The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. \\x01.

The next 1 byte is an unsigned byte: the minor version number of the file
format, e.g. \\x00. Note: the version of the file format is not tied to the
version of the numpy package.

The next 2 bytes form a little-endian unsigned short int: the length of the
header data HEADER_LEN.

The next HEADER_LEN bytes form the header data describing the array's format. It
is an ASCII string which contains a Python literal expression of a dictionary.
It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make
the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16
for alignment purposes.

The dictionary contains three keys:

    "descr" : dtype.descr
        An object that can be passed as an argument to the numpy.dtype()
        constructor to create the array's dtype.
    "fortran_order" : bool
        Whether the array data is Fortran-contiguous or not. Since
        Fortran-contiguous arrays are a common form of non-C-contiguity, we
        allow them to be written directly to disk for efficiency.
    "shape" : tuple of int
        The shape of the array.

For repeatability and readability, this dictionary is formatted using
pprint.pformat() so the keys are in alphabetic order. This is for convenience
only. A writer SHOULD implement this if possible. A reader MUST NOT depend on
this.

Following the header comes the array data. If the dtype contains Python objects
(i.e. dtype.hasobject is True), then the data is a Python pickle of the array.
Otherwise the data is the contiguous (either C- or Fortran-, depending on
fortran_order) bytes of the array. Consumers can figure out the number of bytes
by multiplying the number of elements given by the shape (noting that shape=()
means there is 1 element) by dtype.itemsize.
"""

import cPickle
import pprint
import struct

import numpy
from numpy.lib.utils import safe_eval


MAGIC_PREFIX = '\x93NUMPY'
MAGIC_LEN = len(MAGIC_PREFIX) + 2

def magic(major, minor):
    """ Return the magic string for the given file format version.

    Parameters
    ----------
    major : int in [0, 255]
    minor : int in [0, 255]

    Returns
    -------
    magic : str

    Raises
    ------
    ValueError if the version cannot be formatted.
    """
    if major < 0 or major > 255:
        raise ValueError("major version must be 0 <= major < 256")
    if minor < 0 or minor > 255:
        raise ValueError("minor version must be 0 <= minor < 256")
    return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor))

def read_magic(fp):
    """ Read the magic string to get the version of the file format.

    Parameters
    ----------
    fp : filelike object

    Returns
    -------
    major : int
    minor : int
    """
    magic_str = fp.read(MAGIC_LEN)
    if len(magic_str) != MAGIC_LEN:
        raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str))
    if magic_str[:-2] != MAGIC_PREFIX:
        raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2]))
    major, minor = map(ord, magic_str[-2:])
    return major, minor

def dtype_to_descr(dtype):
    """ Get a serializable descriptor from the dtype.

    The .descr attribute of a dtype object cannot be round-tripped through the
    dtype() constructor. Simple types, like dtype('float32'), have a descr which
    looks like a record array with one field with '' as a name. The dtype()
    constructor interprets this as a request to give a default name. Instead, we
    construct descriptor that can be passed to dtype().
    """
    if dtype.names is not None:
        # This is a record array. The .descr is fine.
        # XXX: parts of the record array with an empty name, like padding bytes,
        # still get fiddled with. This needs to be fixed in the C implementation
        # of dtype().
        return dtype.descr
    else:
        return dtype.str

def header_data_from_array_1_0(array):
    """ Get the dictionary of header metadata from a numpy.ndarray.

    Parameters
    ----------
    array : numpy.ndarray

    Returns
    -------
    d : dict
        This has the appropriate entries for writing its string representation
        to the header of the file.
    """
    d = {}
    d['shape'] = array.shape
    if array.flags.c_contiguous:
        d['fortran_order'] = False
    elif array.flags.f_contiguous:
        d['fortran_order'] = True
    else:
        # Totally non-contiguous data. We will have to make it C-contiguous
        # before writing. Note that we need to test for C_CONTIGUOUS first
        # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS.
        d['fortran_order'] = False

    d['descr'] = dtype_to_descr(array.dtype)
    return d

def write_array_header_1_0(fp, d):
    """ Write the header for an array using the 1.0 format.

    Parameters
    ----------
    fp : filelike object
    d : dict
        This has the appropriate entries for writing its string representation
        to the header of the file.
    """
    header = pprint.pformat(d)
    # Pad the header with spaces and a final newline such that the magic string,
    # the header-length short and the header are aligned on a 16-byte boundary.
    # Hopefully, some system, possibly memory-mapping, can take advantage of
    # our premature optimization.
    current_header_len = MAGIC_LEN + 2 + len(header) + 1  # 1 for the newline
    topad = 16 - (current_header_len % 16)
    header = '%s%s\n' % (header, ' '*topad)
    if len(header) >= (256*256):
        raise ValueError("header does not fit inside %s bytes" % (256*256))
    header_len_str = struct.pack('<H', len(header))
    fp.write(header_len_str)
    fp.write(header)

def read_array_header_1_0(fp):
    """ Read an array header from a filelike object using the 1.0 file format
    version.

    This will leave the file object located just after the header.

    Parameters
    ----------
    fp : filelike object

    Returns
    -------
    shape : tuple of int
        The shape of the array.
    fortran_order : bool
        The array data will be written out directly if it is either C-contiguous
        or Fortran-contiguous. Otherwise, it will be made contiguous before
        writing it out.
    dtype : dtype

    Raises
    ------
    ValueError if the data is invalid.
    """
    # Read an unsigned, little-endian short int which has the length of the
    # header.
    hlength_str = fp.read(2)
    if len(hlength_str) != 2:
        raise ValueError("EOF at %s before reading array header length" % fp.tell())
    header_length = struct.unpack('<H', hlength_str)[0]
    header = fp.read(header_length)
    if len(header) != header_length:
        raise ValueError("EOF at %s before reading array header" % fp.tell())

    # The header is a pretty-printed string representation of a literal Python
    # dictionary with trailing newlines padded to a 16-byte boundary. The keys
    # are strings.
    #   "shape" : tuple of int
    #   "fortran_order" : bool
    #   "descr" : dtype.descr
    try:
        d = safe_eval(header)
    except SyntaxError, e:
        raise ValueError("Cannot parse header: %r\nException: %r" % (header, e))
    if not isinstance(d, dict):
        raise ValueError("Header is not a dictionary: %r" % d)
    keys = d.keys()
    keys.sort()
    if keys != ['descr', 'fortran_order', 'shape']:
        raise ValueError("Header does not contain the correct keys: %r" % (keys,))

    # Sanity-check the values.
    if (not isinstance(d['shape'], tuple) or
        not numpy.all([isinstance(x, int) for x in d['shape']])):
        raise ValueError("shape is not valid: %r" % (d['shape'],))
    if not isinstance(d['fortran_order'], bool):
        raise ValueError("fortran_order is not a valid bool: %r" % (d['fortran_order'],))
    try:
        dtype = numpy.dtype(d['descr'])
    except TypeError, e:
        raise ValueError("descr is not a valid dtype descriptor: %r" % (d['descr'],))

    return d['shape'], d['fortran_order'], dtype

def write_array(fp, array, version=(1,0)):
    """ Write an array to a file, including a header.

    If the array is neither C-contiguous or Fortran-contiguous AND if the
    filelike object is not a real file object, then this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : filelike object
    array : numpy.ndarray
    version : (int, int), optional
        The version number of the format.

    Raises
    ------
    ValueError if the array cannot be persisted.
    Various other errors from pickling if the array contains Python objects as
    part of its dtype.
    """
    if version != (1, 0):
        raise ValueError("we only support format version (1,0), not %s" % (version,))
    fp.write(magic(*version))
    write_array_header_1_0(fp, header_data_from_array_1_0(array))
    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data directly.
        # Instead, we will pickle it out with version 2 of the pickle protocol.
        cPickle.dump(array, fp, protocol=2)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        # Use a suboptimal, possibly memory-intensive, but correct way to handle
        # Fortran-contiguous arrays.
        fp.write(array.data)
    else:
        if isinstance(fp, file):
            array.tofile(fp)
        else:
            # XXX: We could probably chunk this using something like
            # arrayterator.
            fp.write(array.tostring('C'))

def read_array(fp):
    """ Read an array from a file.

    Parameters
    ----------
    fp : filelike object
        If this is not a real file object, then this may take extra memory and
        time.

    Returns
    -------
    array : numpy.ndarray

    Raises
    ------
    ValueError if the data is invalid.
    """
    version = read_magic(fp)
    if version != (1, 0):
        raise ValueError("only support version (1,0) of file format, not %r" % (version,))
    shape, fortran_order, dtype = read_array_header_1_0(fp)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        array = cPickle.load(fp)
    else:
        if isinstance(fp, file):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the memory-intensive way.
            # XXX: we can probably chunk this to avoid the memory hit.
            data = fp.read(count * dtype.itemsize)
            array = numpy.fromstring(data, dtype=dtype, count=count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array


def open_memmap(filename, mode='r+', dtype=None, shape=None,
    fortran_order=False, version=(1,0)):
    """ Open a .npy file as a memory-mapped array.

    Parameters
    ----------
    filename : str
    mode : str, optional
        The mode to open the file with. In addition to the standard file modes,
        'c' is also accepted to mean "copy on write".
    dtype : dtype, optional
    shape : tuple of int, optional
    fortran_order : bool, optional
        If the mode is a "write" mode, then the file will be created using this
        dtype, shape, and contiguity.
    version : tuple of int (major, minor)
        If the mode is a "write" mode, then this is the version of the file
        format used to create the file.

    Returns
    -------
    marray : numpy.memmap

    Raises
    ------
    ValueError if the data or the mode is invalid.
    IOError if the file is not found or cannot be opened correctly.
    """
    if 'w' in mode:
        # We are creating the file, not reading it.
        # Check if we ought to create the file.
        if version != (1, 0):
            raise ValueError("only support version (1,0) of file format, not %r" % (version,))
        # Ensure that the given dtype is an authentic dtype object rather than
        # just something that can be interpreted as a dtype object.
        dtype = numpy.dtype(dtype)
        if dtype.hasobject:
            raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
        d = dict(
            descr=dtype_to_descr(dtype),
            fortran_order=fortran_order,
            shape=shape,
        )
        # If we got here, then it should be safe to create the file.
        fp = open(filename, mode+'b')
        try:
            fp.write(magic(*version))
            write_array_header_1_0(fp, d)
            offset = fp.tell()
        finally:
            fp.close()
    else:
        # Read the header of the file first.
        fp = open(filename, 'rb')
        try:
            version = read_magic(fp)
            if version != (1, 0):
                raise ValueError("only support version (1,0) of file format, not %r" % (version,))
            shape, fortran_order, dtype = read_array_header_1_0(fp)
            if dtype.hasobject:
                raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
            offset = fp.tell()
        finally:
            fp.close()

    if fortran_order:
        order = 'F'
    else:
        order = 'C'

    # We need to change a write-only mode to a read-write mode since we've
    # already written data to the file.
    if mode == 'w+':
        mode = 'r+'

    marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order,
        mode=mode, offset=offset)

    return marray