File: logfileparser.py

# -*- coding: utf-8 -*-
#
# Copyright (c) 2019, the cclib development team
#
# This file is part of cclib (http://cclib.github.io) and is distributed under
# the terms of the BSD 3-Clause License.
"""Generic output file parser and related tools"""


import bz2
import fileinput
import gzip
import inspect
import io
import logging
import os
import random
import sys
import zipfile

if sys.version_info.major == 2:
    getargspec = inspect.getargspec
else:
    getargspec = inspect.getfullargspec

import numpy

from cclib.parser import utils
from cclib.parser.data import ccData
from cclib.parser.data import ccData_optdone_bool


# This seems to avoid a problem with Avogadro.
logging.logMultiprocessing = 0


class myBZ2File(bz2.BZ2File):
    """Return string instead of bytes"""
    def __next__(self):
        line = super(bz2.BZ2File, self).__next__()
        return line.decode("ascii", "replace")

    def next(self):
        line = self.__next__()
        return line


class myGzipFile(gzip.GzipFile):
    """Return string instead of bytes"""
    def __next__(self):
        super_ob = super(gzip.GzipFile, self)
        # seemingly different versions of gzip can have either next or __next__
        if hasattr(super_ob, 'next'):
            line = super_ob.next()
        else:
            line = super_ob.__next__()
        return line.decode("ascii", "replace")

    def next(self):
        line = self.__next__()
        return line


class myFileinputFile(fileinput.FileInput):
    """Implement next() method"""
    def next(self):
        # Delegate to the iterator protocol of the parent class; calling
        # next(self) here would recurse back into this method on Python 2.
        parent = super(myFileinputFile, self)
        return parent.next() if hasattr(parent, "next") else parent.__next__()


class FileWrapper(object):
    """Wrap a file-like object or stream with some custom tweaks"""

    def __init__(self, source, pos=0):

        self.src = source

        # Most file-like objects have seek and tell methods, but streams returned
        # by urllib.urlopen in Python2 do not, which will raise an AttributeError
        # in this code. On the other hand, in Python3 these methods do exist since
        # urllib uses the stream class in the io library, but they raise a different
        # error, namely io.UnsupportedOperation. That is why it is hard to be more
        # specific with the except block here.
        try:
            self.src.seek(0, 2)
            self.size = self.src.tell()
            self.src.seek(pos, 0)

        except (AttributeError, IOError, io.UnsupportedOperation):
            # Stream returned by urllib should have size information.
            if hasattr(self.src, 'headers') and 'content-length' in self.src.headers:
                self.size = int(self.src.headers['content-length'])
            else:
                self.size = pos

        # Assume the position is what was passed to the constructor.
        self.pos = pos

    def next(self):
        line = next(self.src)
        self.pos += len(line)
        return line

    def __next__(self):
        return self.next()

    def __iter__(self):
        return self

    def close(self):
        self.src.close()

    def seek(self, pos, ref):

        # If we are seeking to the end, we can usually emulate it. As explained above,
        # we cannot be too specific with the except clause due to differences
        # between Python2 and 3. Yet another reason to drop Python 2 soon!
        try:
            self.src.seek(pos, ref)
        except:
            if ref == 2:
                self.src.read()
            else:
                raise

        if ref == 0:
            self.pos = pos
        if ref == 1:
            self.pos += pos
        if ref == 2 and hasattr(self, 'size'):
            self.pos = self.size


def openlogfile(filename, object=None):
    """Return a file object given a filename or if object specified decompresses it
    if needed and wrap it up.

    Given the filename or file object of a log file or a gzipped, zipped, or bzipped
    log file, this function returns a file-like object.

    Given a list of filenames, this function returns a FileInput object,
    which can be used for seamless iteration without concatenation.
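
    A minimal usage sketch (the filename here is hypothetical; compressed files
    are detected by their extension and handled transparently):

        inputfile = openlogfile("calculation.out.gz")
        first_line = next(inputfile)
        inputfile.close()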
    """

    # If there is a single string argument given.
    if isinstance(filename, str):

        extension = os.path.splitext(filename)[1]

        if extension == ".gz":
            fileobject = myGzipFile(filename, "r", fileobj=object)

        elif extension == ".zip":
            zip = zipfile.ZipFile(object, "r") if object else zipfile.ZipFile(filename, "r")
            assert len(zip.namelist()) == 1, "ERROR: Zip file contains more than 1 file"
            fileobject = io.StringIO(zip.read(zip.namelist()[0]).decode("ascii", "ignore"))

        elif extension in ['.bz', '.bz2']:
            # Module 'bz2' is not always importable.
            assert bz2 is not None, "ERROR: module bz2 cannot be imported"
            fileobject = myBZ2File(object, "r") if object else myBZ2File(filename, "r")

        else:
            # Assuming that object is a text file encoded in utf-8.
            fileobject = io.StringIO(object.decode('utf-8')) if object \
                    else FileWrapper(io.open(filename, "r", errors='ignore'))

        return fileobject

    elif hasattr(filename, "__iter__"):

        # This is needed, because fileinput will assume stdin when filename is empty.
        if len(filename) == 0:
            return None

        # Compression (gzip and bzip) is supported by fileinput as of Python 2.5.
        if sys.version_info >= (2, 5):
            fileobject = fileinput.input(filename, openhook=fileinput.hook_compressed)
        else:
            fileobject = myFileinputFile(filename)

        return fileobject


class Logfile(object):
    """Abstract class for logfile objects.

    Subclasses defined by cclib:
        ADF, DALTON, GAMESS, GAMESSUK, Gaussian, Jaguar, Molpro, MOPAC,
        NWChem, ORCA, Psi, Q-Chem
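
    A minimal (hypothetical) usage sketch with one of the subclasses; in normal
    use, parsing is driven through cclib.io (for example ccread):

        >>> from cclib.parser import Gaussian            # doctest: +SKIP
        >>> data = Gaussian("calculation.log").parse()   # doctest: +SKIP
        >>> data.natom                                   # doctest: +SKIP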
    """

    def __init__(self, source, loglevel=logging.ERROR, logname="Log",
                 logstream=sys.stderr, datatype=ccData_optdone_bool, **kwds):
        """Initialise the Logfile object.

        This should be called by a subclass in its own __init__ method.

        Inputs:
            source - a logfile, list of logfiles, or stream with at least a read method
            loglevel - integer corresponding to a log level from the logging module
            logname - name of the source logfile passed to this constructor
            logstream - where to output the logging information
            datatype - class to use for gathering data attributes
        """

        # Set the filename to source if it is a string or a list of strings, which are
        # assumed to be filenames. Otherwise, assume the source is a file-like object
        # if it has a read method, and we will try to use it like a stream.
        self.isfileinput = False
        if isinstance(source, str):
            self.filename = source
            self.isstream = False
        elif isinstance(source, list) and all([isinstance(s, str) for s in source]):
            self.filename = source
            self.isstream = False
        elif isinstance(source, fileinput.FileInput):
            self.filename = source
            self.isstream = False
            self.isfileinput = True
        elif hasattr(source, "read"):
            self.filename = "stream %s" % str(type(source))
            self.isstream = True
            self.stream = source
        else:
            raise ValueError("Unexpected source type.")

        # Set up the logger.
        # Note that calling logging.getLogger() with one name always returns the same instance.
        # Presently in cclib, all parser instances of the same class use the same logger,
        #   which means that care needs to be taken not to duplicate handlers.
        self.loglevel = loglevel
        self.logname = logname
        self.logger = logging.getLogger('%s %s' % (self.logname, self.filename))
        self.logger.setLevel(self.loglevel)
        if len(self.logger.handlers) == 0:
            handler = logging.StreamHandler(logstream)
            handler.setFormatter(logging.Formatter("[%(name)s %(levelname)s] %(message)s"))
            self.logger.addHandler(handler)

        # Set up the metadata.
        if not hasattr(self, "metadata"):
            self.metadata = {}
            self.metadata["package"] = self.logname
            self.metadata["methods"] = []
            # Indicate if the computation has completed successfully
            self.metadata['success'] = False


        # Periodic table of elements.
        self.table = utils.PeriodicTable()

        # This is the class that will be used in the data object returned by parse(), and should
        # normally be ccData or a subclass of it.
        self.datatype = datatype

        # Change the class used if we want optdone to be a list or if the 'future' option
        # is used, which might have more consequences in the future.
        optdone_as_list = kwds.get("optdone_as_list", False) or kwds.get("future", False)
        optdone_as_list = optdone_as_list if isinstance(optdone_as_list, bool) else False
        if optdone_as_list:
            self.datatype = ccData
        # Parsing of Natural Orbitals and Natural Spin Orbitals into one attribute.
        self.unified_no_nso = kwds.get("future", False)

    def __setattr__(self, name, value):

        # Send info to logger if the attribute is in the list of attributes.
        if name in ccData._attrlist and hasattr(self, "logger"):

            # Call logger.info() only if the attribute is new.
            if not hasattr(self, name):
                if type(value) in [numpy.ndarray, list]:
                    self.logger.info("Creating attribute %s[]" % name)
                else:
                    self.logger.info("Creating attribute %s: %s" % (name, str(value)))

        # Set the attribute.
        object.__setattr__(self, name, value)

    def parse(self, progress=None, fupdate=0.05, cupdate=0.002):
        """Parse the logfile, using the assumed extract method of the child."""

        # Check that the sub-class has an extract attribute,
        #  that is callable with the proper number of arguments.
        if not hasattr(self, "extract"):
            raise AttributeError("Class %s has no extract() method." % self.__class__.__name__)
        if not callable(self.extract):
            raise AttributeError("Method %s.extract not callable." % self.__class__.__name__)
        if len(getargspec(self.extract)[0]) != 3:
            raise AttributeError("Method %s.extract takes wrong number of arguments." % self.__class__.__name__)

        # Save the current list of attributes to keep after parsing.
        # The dict of self should be the same after parsing.
        _nodelete = list(set(self.__dict__.keys()))

        # Initiate the FileInput object for the input files.
        # Remember that self.filename can be a list of files.
        if not self.isstream:
            if not self.isfileinput:
                inputfile = openlogfile(self.filename)
            else:
                inputfile = self.filename
        else:
            inputfile = FileWrapper(self.stream)

        # Initialize self.progress.
        is_compressed = isinstance(inputfile, (myGzipFile, myBZ2File))
        if progress and not is_compressed:
            self.progress = progress
            self.progress.initialize(inputfile.size)
            self.progress.step = 0
        self.fupdate = fupdate
        self.cupdate = cupdate

        # Maybe the sub-class has something to do before parsing.
        self.before_parsing()

        # Loop over lines in the file object and call extract().
        # This is where the actual parsing is done.
        for line in inputfile:
            self.updateprogress(inputfile, "Unsupported information", cupdate)

            # This call should check if the line begins a section of extracted data.
            # If it does, it parses some lines and sets the relevant attributes (to self).
            # Any attributes can be freely set and used across calls; however, only those
            #   in data._attrlist will be moved to the final data object that is returned.
            try:
                self.extract(inputfile, line)
            except StopIteration:
                self.logger.error("Unexpectedly encountered end of logfile.")
                break

        # Close input file object.
        if not self.isstream:
            inputfile.close()

        # Maybe the sub-class has something to do after parsing.
        self.after_parsing()

        # If atomcoords were not parsed, but some input coordinates were ("inputcoords").
        # This is originally from the Gaussian parser, a regression fix.
        if not hasattr(self, "atomcoords") and hasattr(self, "inputcoords"):
            self.atomcoords = numpy.array(self.inputcoords, 'd')

        # Set nmo if not set already - to nbasis.
        if not hasattr(self, "nmo") and hasattr(self, "nbasis"):
            self.nmo = self.nbasis

        # Create a default coreelectrons array, unless it's impossible
        # to determine.
        if not hasattr(self, "coreelectrons") and hasattr(self, "natom"):
            self.coreelectrons = numpy.zeros(self.natom, "i")
        if hasattr(self, "incorrect_coreelectrons"):
            self.__delattr__("coreelectrons")

        # Create the data object we want to return. This is normally ccData, but can be changed
        # by passing the datatype argument to the constructor. All supported cclib attributes
        # are copied to this object, but beware that in order to be moved an attribute must be
        # included in the _attrlist of ccData (or whichever datatype is used).
        # There is the possibility of passing additional arguments via self.data_args, but
        # we use this sparingly in cases where we want to limit the API with options, etc.
        data = self.datatype(attributes=self.__dict__)

        # Now make sure that the cclib attributes in the data object are all the correct type,
        # including arrays and lists of arrays.
        data.arrayify()

        # Delete all temporary attributes (including cclib attributes).
        # All attributes should have been moved to a data object, which will be returned.
        for attr in list(self.__dict__.keys()):
            if attr not in _nodelete:
                self.__delattr__(attr)

        # Perform final checks on values of attributes.
        data.check_values(logger=self.logger)

        # Update self.progress as done.
        if hasattr(self, "progress"):
            self.progress.update(inputfile.size, "Done")

        return data

    def before_parsing(self):
        """Set parser-specific variables and do other initial things here."""
        pass

    def after_parsing(self):
        """Correct data or do parser-specific validation after parsing is finished."""
        pass

    def updateprogress(self, inputfile, msg, xupdate=0.05):
        """Update progress."""

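        # Throttle reporting: only a random fraction (roughly xupdate) of calls
        # actually updates the progress object, and only when the position has changed.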
        if hasattr(self, "progress") and random.random() < xupdate:
            newstep = inputfile.pos
            if newstep != self.progress.step:
                self.progress.update(newstep, msg)
                self.progress.step = newstep

    def normalisesym(self, symlabel):
        """Standardise the symmetry labels between parsers.

        This method should be overwritten by individual parsers, and should
        contain appropriate doctests. If it is not overwritten, this is detected
        as an error by unit tests.
        """
        raise NotImplementedError("normalisesym(self, symlabel) must be overridden by the parser.")

    def float(self, number):
        """Convert a string to a float.

        This method should perform certain checks that are specific to cclib,
        including avoiding the problem with Ds instead of Es in scientific notation.
        Another point is converting strings signifying numerical problems (*****)
        to something we can manage (NumPy's NaN).
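
        For example (illustrative values, assuming ``parser`` is an instance
        of a Logfile subclass):

            parser.float("1.285D-02")   # -> 0.01285
            parser.float("*****")       # -> nan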
        """

        if list(set(number)) == ['*']:
            return numpy.nan

        return float(number.replace("D", "E"))

    def new_internal_job(self):
        """Delete attributes that can be problematic in multistep jobs.

        TODO: instead of this hack, parse each job in a multistep computation
        as a different ccData object (this is for 2.x).

        Some computations are actually sequences of several jobs, and some
        attributes won't work well if parsed across jobs. These include:
            mpenergies: if different jobs go to different orders then
                        these won't be consistent and can't be converted
                        to an array easily
        """
        for name in ("mpenergies",):
            if hasattr(self, name):
                delattr(self, name)

    def set_attribute(self, name, value, check_change=True):
        """Set an attribute and perform an optional check when it already exists.

        Note that this can be used for scalars and lists alike, whenever we want
        to set a value for an attribute.
        
        Parameters
        ----------
        name: str
            The name of the attribute.
        value
            The value for the attribute; any type is allowed.
        check_change: bool
            By default we want to check that the value does not change
            if the attribute already exists.
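
        As a hypothetical example, setting the atom count twice from a parser
        subclass would log a warning on the second call if the value changed:

            self.set_attribute("natom", 3)
            self.set_attribute("natom", 5)  # warns, then overwrites the value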
        """
        if check_change and hasattr(self, name):
            try:
                numpy.testing.assert_equal(getattr(self, name), value)
            except AssertionError:
                self.logger.warning("Attribute %s changed value (%s -> %s)" % (name, getattr(self, name), value))

        setattr(self, name, value)

    def append_attribute(self, name, value):
        """Appends a value to an attribute."""

        if not hasattr(self, name):
            self.set_attribute(name, [])
        getattr(self, name).append(value)

    def extend_attribute(self, name, values):
        """Appends an iterable of values to an attribute."""
        
        if not hasattr(self, name):
            self.set_attribute(name, [])
        getattr(self, name).extend(values)

    def _assign_coreelectrons_to_element(self, element, ncore,
                                         ncore_is_total_count=False):
        """Assign core electrons to all instances of the element.

        It's usually reasonable to do this for all atoms of a given element,
        because mixed usage isn't normally allowed within elements.

        Parameters
        ----------
        element: str
          the chemical element to set coreelectrons for
        ncore: int
          the number of core electrons
        ncore_is_total_count: bool
          whether the ncore argument is the total count, in which case it is
          divided by the number of atoms of this element
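
        As a hypothetical example, a pseudopotential replacing 10 core electrons
        on every iron atom could be recorded from a parser subclass as:

            self._assign_coreelectrons_to_element('Fe', 10)

        or, if a total of 20 core electrons were reported for two iron atoms:

            self._assign_coreelectrons_to_element('Fe', 20, ncore_is_total_count=True)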
        """
        atomsymbols = [self.table.element[atomno] for atomno in self.atomnos]
        indices = [i for i, el in enumerate(atomsymbols) if el == element]
        if ncore_is_total_count:
            ncore = ncore // len(indices)

        if not hasattr(self, 'coreelectrons'):
            self.coreelectrons = numpy.zeros(self.natom, 'i')
        self.coreelectrons[indices] = ncore

    def skip_lines(self, inputfile, sequence):
        """Read trivial line types and check they are what they are supposed to be.

        This function will read len(sequence) lines and do certain checks on them,
        when the elements of sequence have the appropriate values. Currently the
        following elements trigger checks:
            'blank' or 'b'      - the line should be blank
            'dashes' or 'd'     - the line should contain only dashes (or spaces)
            'equals' or 'e'     - the line should contain only equal signs (or spaces)
            'stars' or 's'      - the line should contain only stars (or spaces)
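
        A typical (hypothetical) call from a parser, skipping a blank line
        followed by a line of dashes:

            blank, dashes = self.skip_lines(inputfile, ['b', 'd'])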
        """

        expected_characters = {
            '-': ['dashes', 'd'],
            '=': ['equals', 'e'],
            '*': ['stars', 's'],
        }

        lines = []
        for expected in sequence:

            # Read the line we want to skip.
            line = next(inputfile)

            # Blank lines are perhaps the most common thing we want to check for.
            if expected in ["blank", "b"]:
                try:
                    assert line.strip() == ""
                except AssertionError:
                    frame, fname, lno, funcname, funcline, index = inspect.getouterframes(inspect.currentframe())[1]
                    parser = fname.split('/')[-1]
                    msg = "In %s, line %i, line not blank as expected: %s" % (parser, lno, line.strip())
                    self.logger.warning(msg)

            # All cases of homogeneous lines (dashes, equals, stars) can be dealt with by the same code.
            for character, keys in expected_characters.items():
                if expected in keys:
                    try:
                        assert all([c == character for c in line.strip() if c != ' '])
                    except AssertionError:
                        frame, fname, lno, funcname, funcline, index = inspect.getouterframes(inspect.currentframe())[1]
                        parser = fname.split('/')[-1]
                        msg = "In %s, line %i, line not all %s as expected: %s" % (parser, lno, keys[0], line.strip())
                        self.logger.warning(msg)
                        continue

            # Save the skipped line, and we will return the whole list.
            lines.append(line)

        return lines

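    # Convenience wrapper around skip_lines() for skipping and checking a single line.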
    skip_line = lambda self, inputfile, expected: self.skip_lines(inputfile, [expected])