"""
A grammar that just splits the source into input lines and then
lets you name character ranges.
"""
#c Copyright 2008-2020, the GAVO project
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
from gavo import base
from gavo import utils
from gavo.grammars.common import Grammar, FileRowIterator, FileRowAttributes
from gavo.utils import parsetricks
class SplitLineIterator(FileRowIterator):
    """A row iterator cutting named character ranges out of input lines.

    Every line of the input file that is not skipped as a header line or
    as a comment is turned into one rowdict.  The keys and the slices
    applied to the line come from the grammar's colRanges.
    """

    def __init__(self, grammar, sourceToken, **kwargs):
        """Set up the iterator and skip the grammar's topIgnoredLines.

        All arguments are handed through to FileRowIterator unchanged.
        """
        FileRowIterator.__init__(self, grammar, sourceToken, **kwargs)
        for _ in range(self.grammar.topIgnoredLines):
            self.inputFile.readline()
        # lineNo counts physical lines, so the skipped ones are included.
        self.lineNo = self.grammar.topIgnoredLines

    def _iterRows(self):
        """Yield one rowdict per non-comment input line.

        The input file is closed as soon as this generator terminates,
        no matter whether the input is exhausted, an exception escapes,
        or the consumer closes the generator early.  The reference to the
        grammar is only dropped after the input has been read completely.
        """
        try:
            while True:
                self.lineNo += 1
                inputLine = self.inputFile.readline()
                if not inputLine:
                    # readline returns an empty string at end of file
                    break

                if (self.grammar.commentIntroducer is not base.NotGiven
                        and inputLine.startswith(
                            self.grammar.commentIntroducer)):
                    continue

                yield self._parse(inputLine)
                # only lines that actually produced a row count as records
                self.recNo += 1
        finally:
            # Without this, the file handle would stay open whenever
            # iteration ends other than by reaching end of file.
            self.inputFile.close()
        self.grammar = None

    def _parse(self, inputLine):
        """Return a rowdict mapping each colRanges key to its cut-out value.

        Values are stripped of surrounding whitespace if the grammar's
        strip attribute is true.

        Raises a SourceParseError if indexing the line raises an
        IndexError.
        """
        strip = self.grammar.strip
        res = {}
        try:
            for key, colSlice in self.grammar.colRanges.items():
                field = inputLine[colSlice]
                res[key] = field.strip() if strip else field
        except IndexError:
            # NOTE(review): indexing with slice objects never raises
            # IndexError; lines shorter than a range just give truncated
            # or empty values.  This handler can only trigger if
            # colRanges holds something other than slices.
            raise base.ui.logOldExc(base.SourceParseError("Short line", inputLine,
                self.getLocator(), self.sourceToken))
        return res

    def getLocator(self):
        """Return a human-readable position ("line <n>") for messages."""
        return "line %d"%self.lineNo
class ColRangeAttribute(base.UnicodeAttribute):
    """A range of indices.

    Ranges can be specified as either <int1>-<int2>, just <int>
    (which is equivalent to <int>-<int>), or as half-open ranges
    (<int>- or -<int>).  Ranges are, contrary to python slices,
    inclusive on both sides, and start counting from one.
    """

    def parse(self, value):
        """Return the python slice corresponding to the literal value.

        Values that already are slices are returned unchanged.  Anything
        that cannot be interpreted as a column range raises a
        LiteralParseError.
        """
        if isinstance(value, slice):
            # nothing to do, this has been parsed before
            return value

        try:
            if "-" not in value:
                # a single column n is the one-character slice [n-1:n]
                column = int(value)
                return slice(column-1, column)

            # more than one dash makes the unpacking fail with a
            # ValueError, which is reported like any other bad literal
            lowerLit, upperLit = value.split("-")
            # one-based inclusive -> zero-based exclusive: only the lower
            # bound needs shifting; an empty bound leaves that side open
            lower = int(lowerLit)-1 if lowerLit.strip() else None
            upper = int(upperLit) if upperLit.strip() else None
            return slice(lower, upper)
        except ValueError:
            raise base.ui.logOldExc(
                base.LiteralParseError("colRanges", value, hint="A column range,"
                " (either int1-int2 or just an int) is expected here."))
class ColumnGrammar(Grammar, FileRowAttributes):
    """A grammar that builds rowdicts out of character index ranges.

    This works by using the colRanges attribute like
    <col key="mag">12-16</col>, which will take the characters 12 through
    16 inclusive from each input line to build the input column mag.

    As a shortcut, you can also use the colDefs attribute; it contains
    a string of the form {<key>:<range>}, i.e., a whitespace-separated
    list of colon-separated items of key and range as accepted by cols,
    e.g.::

        <colDefs>
            a: 3-4
            _u: 7
        </colDefs>
    """
    name_ = "columnGrammar"

    _til = base.IntAttribute(
        "topIgnoredLines",
        default=0,
        description="Skip this many lines at the top of each source file.",
        copyable=True)

    _cols = base.DictAttribute(
        "colRanges",
        description="Mapping of source keys to column ranges.",
        itemAttD=ColRangeAttribute("col"),
        copyable=True)

    _colDefs = base.ActionAttribute(
        "colDefs",
        description="Shortcut way of defining cols",
        methodName="_parseColDefs")

    _commentIntroducer = base.UnicodeAttribute(
        "commentIntroducer",
        default=base.NotGiven,
        description="A character sequence"
            " that, when found at the beginning of a line makes this line"
            " ignored",
        copyable=True)

    _strip = base.BooleanAttribute(
        "strip",
        default=True,
        description="Strip all parsed strings?",
        copyable=True)

    def _getColDefGrammar(self):
        """Return a pyparsing grammar for the content of colDefs.

        Each key:range item is reduced to a (key, range literal) pair
        by a parse action; the range literal is what ColRangeAttribute
        parses.
        """
        with parsetricks.pyparsingWhitechars("\n\t\r "):
            number = parsetricks.Word(parsetricks.nums)
            # whitespace following a literal has to be swallowed by hand
            optWhite = parsetricks.Suppress(
                parsetricks.Optional(parsetricks.White()))
            hyphen = optWhite + parsetricks.Literal("-") + optWhite

            # "-<int>": open towards the start of the line
            openStart = hyphen + optWhite + number
            # "<int>", "<int>-", or "<int>-<int>"
            withStart = number + parsetricks.Optional(
                hyphen + parsetricks.Optional(number))
            colRange = parsetricks.Combine(openStart | withStart)
            colRange.setName("Column range")

            # presumably [:-1] removes a trailing anchor from the
            # pattern -- confirm against utils.identifierPattern
            colKey = parsetricks.Regex(utils.identifierPattern.pattern[:-1])
            colKey.setName("Column key")

            # tokens are [key, ":", range]; only key and range are kept
            item = (colKey + parsetricks.Literal(":") + optWhite + colRange
                ).addParseAction(lambda s, p, t: (t[0], t[2]))

            return parsetricks.ZeroOrMore(item) + parsetricks.StringEnd()

    def _parseColDefs(self, ctx):
        """Fill colRanges from the colDefs shortcut literal.

        This is the action behind the colDefs attribute.  Literals the
        colDefs grammar rejects raise a LiteralParseError.
        """
        try:
            pairs = utils.pyparseString(self._getColDefGrammar(),
                self.colDefs)
            for key, rangeLiteral in pairs:
                self.colRanges[key] = self._cols.itemAttD.parse(rangeLiteral)
        except parsetricks.ParseException as ex:
            raise base.LiteralParseError("colDefs", self.colDefs,
                hint="colDefs is a whitespace-separated list of key:range pairs."
                " Your literal doesn't look like this, and here's what the"
                " parser had to complain: %s"%ex)

    rowIterator = SplitLineIterator