File: array_import.py

package info (click to toggle)
python-scipy 0.6.0-12
links: PTS, VCS
area: main
in suites: lenny
size: 32,016 kB
ctags: 46,675
sloc: cpp: 124,854; ansic: 110,614; python: 108,664; fortran: 76,260; objc: 424; makefile: 384; sh: 10
file content (487 lines) | stat: -rw-r--r-- 17,194 bytes
# Authors:  Travis Oliphant, Trent Oliphant
# with support from Lee Barford's group at Agilent, Inc.
#

"""This module allows for the loading of an array from an ASCII
Text File

"""

__all__ = ['read_array', 'write_array']

# Standard library imports.
import os
import re
import sys
import types

# Numpy imports.
import numpy
from numpy import array, take, concatenate, asarray, real, imag
# Sadly, this module is still written with typecodes in mind.
from numpy.oldnumeric import Float

# Local imports.
import numpyio

default = None
_READ_BUFFER_SIZE = 1024*1024
#_READ_BUFFER_SIZE = 1000
#_READ_BUFFER_SIZE = 160

# ASCII Text object stream with automatic (un)compression and URL access.
#
# Adapted from
#    TextFile class Written by: Konrad Hinsen <hinsen@cnrs-orleans.fr>
#
# Written by Travis Oliphant and Trent Oliphant
#   with support from Agilent, Inc.
#


def convert_separator(sep):
    newsep = ''
    for k in sep:
        if k in '.^$*+?{[\\|()':
            newsep = newsep + '\\' + k
        else:
            newsep = newsep + k
    return newsep

def build_numberlist(lines):
    if lines is default:
        linelist = [-1]
    else:
        linelist = []
        errstr = "Argument lines must be a sequence of integers and/or range tuples."
        try:
            for num in lines[:-1]:  # handle all but last element
                if type(num) not in [types.IntType, types.TupleType]:
                    raise ValueError, errstr
                if isinstance(num, types.IntType):
                    linelist.append(num)
                else:
                    if not 1 < len(num) < 4:
                        raise ValueError, "Tuples must be valid range tuples."
                    linelist.extend(range(*num))
        except TypeError:
            raise ValueError, errstr
        num = lines[-1]
        if type(num) is types.IntType:
            linelist.append(num)
        elif type(num) is types.TupleType:
            if [types.IntType]*len(num) != map(type, num):
                if len(num) > 1 and num[1] is not None:
                    raise ValueError, errstr
            if len(num) == 1:
                linelist.extend([num[0],-1])
            elif len(num) == 2:
                if num[1] is None:
                    linelist.extend([num[0], -1])
                else:
                    linelist.extend(range(*num))
            elif len(num) == 3:
                if num[1] is None:
                    linelist.extend([num[0], -num[2]])
                else:
                    linelist.extend(range(*num))
            else:
                raise ValueError, errstr
    return linelist

def get_open_file(fileobject, mode='rb'):
    try:
        # this is the duck typing check: if fileobject
        # can be used is os.path.expanduser, it is a string
        # otherwise it is a fileobject
        fileobject = os.path.expanduser(fileobject)

        if mode[0]=='r' and not os.path.exists(fileobject):
            raise IOError, (2, 'No such file or directory: '
                            + fileobject)
        else:
            try:
                file = open(fileobject, mode)
            except IOError, details:
                file = None
                if type(details) == type(()):
                    details = details + (fileobject,)
                raise IOError, details
    except AttributeError:
        # it is assumed that the fileobject is a python
        # file object if it can not be used in os.path.expanduser
        file = fileobject

    return file


class ascii_stream(object):
    """Text files with line iteration

    Ascii_stream instances can be used like normal read-only file objects
    (i.e. by calling readline() and readlines()), but can
    also be used as sequences of lines in for-loops.

    Finally, ascii_stream objects accept file names that start with '~' or
    '~user' to indicate a home directory(for reading only).

    Constructor: ascii_stream(|fileobject|, |lines|,|comment|),
    where |fileobject| is either an open python file object or
    the name of the file, |lines| is a sequence of integers
    or tuples(indicating ranges) of lines to be read, |comment| is the
    comment line identifier """

    def __init__(self, fileobject, lines=default, comment="#",
                 linesep='\n'):
        if not isinstance(comment, types.StringType):
            raise ValueError, "Comment must be a string."
        self.linelist = build_numberlist(lines)
        self.comment = comment
        self.lencomment = len(comment)
        self.file = get_open_file(fileobject, mode='r')
        self.should_close_file = not (self.file is fileobject)
        self._pos = self.file.tell()
        self._lineindex = 0
        if self.linelist[-1] < 0:
            self._linetoget = self.linelist[-1]
        else:
            self._linetoget = 0
        self._oldbuflines = 0
        self._linesplitter = linesep
        self._buffer = self.readlines(_READ_BUFFER_SIZE)
        self._totbuflines = len(self._buffer)

    def readlines(self, sizehint):
        buffer = self.file.read(sizehint)
        lines = buffer.split(self._linesplitter)
        if len(buffer) < sizehint:  # EOF
            if buffer == '':
                return []
            else:
                return lines
        else:
            if len(lines) < 2:
                raise ValueError, "Buffer size too small."
            backup = len(lines[-1])
            self.file.seek(-backup, 1)
            return lines[:-1]

    def __del__(self):
        if hasattr(self.file,'close') and self.should_close_file:
            self.file.close()

    def __getitem__(self, item):
        while 1:
            line = self.readnextline()
            if line is None:
                raise IndexError
            if len(line) < self.lencomment or line[:self.lencomment] != self.comment:
                break
        return line

    def readnextline(self):
        if self.linelist[self._lineindex] >= 0:
            self._linetoget = self.linelist[self._lineindex]
            self._lineindex += 1
        else:
            self._linetoget = self._linetoget - self.linelist[self._lineindex]
        while self._linetoget >= self._totbuflines:
            self._buffer = self.readlines(_READ_BUFFER_SIZE)
            self._oldbuflines = self._totbuflines
            self._totbuflines += len(self._buffer)
            if (self._totbuflines == self._oldbuflines):
                return None
        line = self._buffer[self._linetoget - self._oldbuflines]
        return line

    def close(self):
        self.file.close()

    def flush(self):
        self.file.flush()


def move_past_spaces(firstline):
    ind = 0
    firstline = firstline.lstrip()
    while firstline[ind] not in [' ','\n','\t','\v','\f','\r']:
        ind += 1
    return firstline[ind:], ind


def extract_columns(arlist, collist, atype, missing):
    if collist[-1] < 0:
        if len(collist) == 1:
            toconvlist = arlist[::-collist[-1]]
        else:
            toconvlist = take(arlist,collist[:-1],0)
            toconvlist = concatenate((toconvlist,
                                      arlist[(collist[-2]-collist[-1])::(-collist[-1])]))
    else:
        toconvlist = take(arlist, collist,0)

    return numpyio.convert_objectarray(toconvlist, atype, missing)


# Given a string representing one line, a separator tuple, a list of
#  columns to read for each element of the atype list and a missing
#  value to insert when conversion fails.

# Regular expressions for detecting complex numbers and for dealing
#  with spaces between the real and imaginary parts

_obj = re.compile(r"""
      ([0-9.eE]+)            # Real part
      ([\t ]*)               # Space between real and imaginary part
      ([+-])                 # +/- sign
      ([\t ]*)               # 0 or more spaces
      (([0-9.eE]+[iIjJ])
      |([iIjJ][0-9.eE]+))    # Imaginary part
      """, re.VERBOSE)

_not_warned = 1
def process_line(line, separator, collist, atype, missing):
    global _not_warned
    strlist = []
    line = _obj.sub(r"\1\3\5",line)  # remove spaces between real
                                     # and imaginary parts of complex numbers

    if _not_warned:
        warn = 0
        if (_obj.search(line) is not None):
            warn = 1
            for k in range(len(atype)):
                if atype[k] in numpy.typecodes['Complex']:
                    warn = 0
        if warn:
            numpy.disp("Warning: Complex data detected, but no requested typecode was complex.")
            _not_warned = 0
    for mysep in separator[:-1]:
        if mysep is None:
            newline, ind = move_past_spaces(line)
            strlist.append(line[:ind])
            line = newline
        else:
            ind = line.find(mysep)
            strlist.append(line[:ind])
            line = line[ind+len(mysep):]
    strlist.extend(line.split(separator[-1]))
    arlist = array(strlist,'O')
    N = len(atype)
    vals = [None]*N
    for k in range(len(atype)):
        vals[k] = extract_columns(arlist, collist[k], atype[k], missing)
    return vals

def getcolumns(stream, columns, separator):
    global _not_warned
    comment = stream.comment
    lenc = stream.lencomment
    k, K = stream.linelist[0], len(stream._buffer)
    while k < K:
        firstline = stream._buffer[k]
        if firstline != '' and firstline[:lenc] != comment:
            break
        k = k + 1
    if k == K:
        raise ValueError, "First line to read not within %d lines of top." % K
    firstline = stream._buffer[k]
    N = len(columns)
    collist = [None]*N
    colsize = [None]*N
    for k in range(N):
        collist[k] = build_numberlist(columns[k])
    _not_warned = 0
    val = process_line(firstline, separator, collist, [Float]*N, 0)
    for k in range(N):
        colsize[k] = len(val[k])
    return colsize, collist

def convert_to_equal_lists(cols, atype):
    if not isinstance(cols, types.ListType):
        cols = [cols]
    if not isinstance(atype, types.ListType):
        atype = [atype]
    N = len(cols) - len(atype)
    if N > 0:
        atype.extend([atype[-1]]*N)
    elif N < 0:
        cols.extend([cols[-1]]*(-N))
    return cols, atype


def read_array(fileobject, separator=default, columns=default, comment="#",
               lines=default, atype=Float, linesep='\n',
               rowsize=10000, missing=0):
    """Return an array or arrays from ascii_formatted data in |fileobject|.

    Inputs:

      fileobject -- An open file object or a string for a valid filename.
                    The string can be prepended by "~/" or "~<name>/" to
                    read a file from the home directory.
      separator -- a string or a tuple of strings to indicate the column
                   separators.  If the length of the string tuple is less
                   than the total number of columns, then the last separator
                   is assumed to be the separator for the rest of the columns.
      columns -- a tuple of integers and range-tuples which describe the
                 columns to read from the file.  A negative entry in the
                 last column specifies the negative skip value to the end.
                 Example:  columns=(1, 4, (5, 9), (11, 15, 3), 17, -2)
                         will read [1,4,5,6,7,8,11,14,17,19,21,23,...]
                 If multiple arrays are to be returned, then this argument
                 should be an ordered list of such tuples.  There should be
                 one entry in the list for each arraytype in the atype list.
      lines   -- a tuple with the same structure as columns which indicates
                 the lines to read.
      comment -- the comment character (line will be ignored even if it is
                 specified by the lines tuple)
      linesep -- separator between rows.
      missing -- value to insert in array when conversion to number fails.
      atype -- the typecode of the output array.  If multiple outputs are
               desired, then this should be a list of typecodes.  The columns
               to fill the array represented by the given typecode is
               determined from the columns argument.  If the length of atype
               does not match the length of the columns list, then, the
               smallest one is expanded to match the largest by repeatedly
               copying the last entry.
      rowsize -- the allocation row size (array grows by this amount as
               data is read in).

    Output -- the 1 or 2d array, or a tuple of output arrays of different
              types, sorted in order of the first column to be placed
              in the output array.

    """

    global _not_warned
    # Make separator into a tuple of separators.
    if type(separator) in [types.StringType, type(default)]:
        sep = (separator,)
    else:
        sep = tuple(separator)
    # Create ascii_object from |fileobject| argument.
    ascii_object = ascii_stream(fileobject, lines=lines, comment=comment, linesep=linesep)
    columns, atype = convert_to_equal_lists(columns, atype)
    numout = len(atype)
    # Get the number of columns to read and expand the columns argument
    colsize, collist = getcolumns(ascii_object, columns, sep)
    # Intialize the output arrays
    outrange = range(numout)
    outarr = []
    typecodes = "".join(numpy.typecodes.values())
    for k in outrange:
        if not atype[k] in typecodes:
            raise ValueError, "One of the array types is invalid, k=%d" % k
        outarr.append(numpy.zeros((rowsize, colsize[k]),atype[k]))
    row = 0
    block_row = 0
    _not_warned = 1
    for line in ascii_object:
        if line.strip() == '':
            continue
        vals = process_line(line, sep, collist, atype, missing)
        for k in outrange:
            outarr[k][row] = vals[k]
        row += 1
        block_row += 1
        if block_row >= rowsize:
            for k in outrange:
                outarr[k].resize((outarr[k].shape[0] + rowsize,colsize[k]))
            block_row = 0
    for k in outrange:
        if outarr[k].shape[0] != row:
            outarr[k].resize((row,colsize[k]))
        a = outarr[k]
        if a.shape[0] == 1 or a.shape[1] == 1:
            outarr[k] = numpy.ravel(a)
    if len(outarr) == 1:
        return outarr[0]
    else:
        return tuple(outarr)


# takes 1-d array and returns a string
def str_array(arr, precision=5,col_sep=' ',row_sep="\n",ss=0):
    thestr = []
    arr = asarray(arr)
    N,M = arr.shape
    thistype = arr.dtype.char
    nofloat = (thistype in '1silbwu') or (thistype in 'Oc')
    cmplx = thistype in 'FD'
    fmtstr = "%%.%de" % precision
    cmpnum = pow(10.0,-precision)
    for n in xrange(N):
        theline = []
        for m in xrange(M):
            val = arr[n,m]
            if ss and abs(val) < cmpnum:
                val = 0*val
            if nofloat or val==0:
                thisval = str(val)
            elif cmplx:
                rval = real(val)
                ival = imag(val)
                thisval = eval('fmtstr % rval')
                if (ival >= 0):
                    istr = eval('fmtstr % ival')
                    thisval = '%s+j%s' % (thisval, istr)
                else:
                    istr = eval('fmtstr % abs(ival)')
                    thisval = '%s-j%s' % (thisval, istr)
            else:
                thisval = eval('fmtstr % val')
            theline.append(thisval)
        strline = col_sep.join(theline)
        thestr.append(strline)
    return row_sep.join(thestr)


def write_array(fileobject, arr, separator=" ", linesep='\n',
                precision=5, suppress_small=0, keep_open=0):
    """Write a rank-2 or less array to file represented by fileobject.

    Inputs:

      fileobject -- An open file object or a string to a valid filename.
      arr -- The array to write.
      separator -- separator to write between elements of the array.
      linesep -- separator to write between rows of array
      precision -- number of digits after the decimal place to write.
      suppress_small -- non-zero to round small numbers down to 0.0
      keep_open = non-zero to return the open file, otherwise, the file is closed.
    Outputs:

      file -- The open file (if keep_open is non-zero)
    """
    file = get_open_file(fileobject, mode='wa')
    rank = numpy.rank(arr)
    if rank > 2:
        raise ValueError, "Can-only write up to 2-D arrays."

    if rank == 0:
        h = 1
        arr = numpy.reshape(arr, (1,1))
    elif rank == 1:
        h = numpy.shape(arr)[0]
        arr = numpy.reshape(arr, (h,1))
    else:
        h = numpy.shape(arr)[0]
        arr = asarray(arr)

    for ch in separator:
        if ch in '0123456789-+FfeEgGjJIi.':
            raise ValueError, "Bad string for separator"

    astr = str_array(arr, precision=precision,
                     col_sep=separator, row_sep=linesep,
                     ss = suppress_small)
    file.write(astr)
    file.write('\n')
    if keep_open:
        return file
    else:
        if file is sys.stdout or file is sys.stderr:
            return
        file.close()
    return