File: tokenize.py

# This file is part of the Frescobaldi project, http://www.frescobaldi.org/
#
# Copyright (c) 2008, 2009, 2010 by Wilbert Berendsen
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
# See http://www.gnu.org/licenses/ for more information.

"""
This module defines a Tokenizer class to parse and tokenize LilyPond text.

Usage:

>>> from ly.tokenize import Tokenizer
>>> tokenizer = Tokenizer()
>>> lilypond = r"\relative c' { c d-\markup { Hi There } }"
>>> for token in tokenizer.tokens(lilypond):
...  print token.__class__.__name__, repr(token)
...
Command u'\\relative'
Space u' '
PitchWord u'c'
Unparsed u"'"
Space u' '
OpenDelimiter u'{'
Space u' '
PitchWord u'c'
Space u' '
PitchWord u'd'
Unparsed u'-'
Markup u'\\markup'
Space u' '
OpenBracket u'{'
Space u' '
MarkupWord u'Hi'
Space u' '
MarkupWord u'There'
Space u' '
CloseBracket u'}'
Space u' '
CloseDelimiter u'}'
>>>

Some LilyPond constructs enter a different parsing mode; you can get the current
Tokenizer.Parser instance with parser().

The tokens yielded by the iterator returned by tokens() are all instances of
subclasses of unicode. They are either instances of a subclass of
Tokenizer.Token (if the text was parsed) or of Tokenizer.Unparsed (if the piece
of text was not understood).

The Unparsed class and all Token subclasses are attributes of the Tokenizer
class (so they are nested classes). You can subclass Tokenizer to add your own
token classes. Each token class defines the regular expression pattern it
matches in its rx class attribute.

There are also Parser subclasses, defined as Tokenizer class attributes.
Those are instantiated to look for specific tokens in LilyPond input text.
The items() static method of the Parser subclasses should return a tuple of
token classes (found as attributes of the Tokenizer (sub)class).

When a Tokenizer (sub)class is constructed, a regular expression is
automatically created for each Parser subclass to parse a piece of LilyPond
input text for the list of tokens returned by its items() method. You can also
easily subclass the Parser classes.
"""

import re
import ly.rx
import ly.pitch
import ly.words


def _make_re(classes):
    """
    Expects a list of classes representing LilyPond input atoms. Returns
    compiled regular expression with named groups, to match input of the listed
    types. Reads the rx class attribute of the given classes.
    """
    return re.compile(
        "|".join("(?P<{0}>{1})".format(cls.__name__, cls.rx) for cls in classes),
        re.DOTALL)
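

# As an illustration (not verbatim): for the Space and Command token classes
# defined on the Tokenizer class below, _make_re would build a pattern roughly
# like
#
#   (?P<Space>\s+)|(?P<Command>\\[A-Za-z]+(-[A-Za-z]+)*)
#
# which a Parser searches with pattern.search(); match.lastgroup then names
# the token class that matched.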


class _tokenizer_meta(type):
    """
    This metaclass makes sure that the regex patterns of Parser subclasses
    inside a subclassed Tokenizer are always correct.
    
    It checks the items() method of all Parser subclasses and creates a
    pattern attribute. If that's different, a new copy (subclass) of the Parser
    subclass is created with the correct pattern.
    """
    def __init__(cls, className, bases, attrd):
        for name in dir(cls):
            attr = getattr(cls, name)
            if (isinstance(attr, type) and issubclass(attr, cls.Parser)
                    and attr is not cls.Parser):
                # We have a Parser subclass. If it has already a pattern
                # that's different from the one created from the items()
                # method output, copy the class. (The pattern is a compiled
                # regex pattern.)
                pattern = _make_re(attr.items(cls))
                if 'pattern' not in attr.__dict__:
                    attr.pattern = pattern
                elif attr.pattern.pattern != pattern.pattern:
                    setattr(cls, name, type(name, (attr,), {'pattern': pattern}))


class Tokenizer(object):
    """
    This class defines an environment to parse LilyPond text input.
    
    There are two types of nested classes (accessible as class attributes, but
    also via a Tokenizer instance):
    
    - Subclasses of Token (or Unparsed): tokens of LilyPond input.
    - Subclasses of Parser: container with regex to parse LilyPond input.
    """
    __metaclass__ = _tokenizer_meta
    
    def __init__(self, parserClass = None):
        self.reset(parserClass)
        
    def reset(self, parserClass = None):
        """
        Reset the tokenizer instance (forget state), so that it can be used
        again.
        """
        if parserClass is None:
            parserClass = self.ToplevelParser
        self.state = [parserClass()]
        self.incomplete = None
        self.language = "nederlands"

    def parser(self, depth = -1):
        """ Return the current (or given) parser instance. """
        return self.state[depth]
        
    def enter(self, parserClass, token = None, argcount = None):
        """ (Internal) Enter a new parser. """
        self.state.append(parserClass(token, argcount))

    def leave(self):
        """ (Internal) Leave the current parser and pop back to the previous. """
        if len(self.state) > 1:
            self.state.pop()
        
    def endArgument(self):
        """
        (Internal) End an argument. Decrease argcount and leave the parser
        if it would reach 0.
        """
        while len(self.state) > 1 and self.state[-1].level == 0:
            if self.state[-1].argcount > 1:
                self.state[-1].argcount -= 1
                return
            elif self.state[-1].argcount == 0:
                return
            self.state.pop()
            
    def inc(self):
        """
        (Internal) Up the level of the current parser. Indicates nesting
        while staying in the same parser.
        """
        self.state[-1].level += 1
        
    def dec(self):
        """
        (Internal) Down the level of the current parser. If it has reached zero,
        leave the current parser. Otherwise decrease argcount and leave if that
        would reach zero.
        """
        while self.state[-1].level == 0 and len(self.state) > 1:
            self.state.pop()
        if self.state[-1].level > 0:
            self.state[-1].level -= 1
            self.endArgument()
            
    def depth(self):
        """
        Return a two-tuple representing the depth of the current state.
        This is useful to quickly check when a part of LilyPond input ends.
        """
        return len(self.state), self.state[-1].level
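
    # A minimal usage sketch (illustrative; `handle` and `text` are
    # placeholders):
    #
    #   d = tokenizer.depth()
    #   for token in tokenizer.tokens(text):
    #       handle(token)
    #       if tokenizer.depth() < d:
    #           break   # the construct we started in has been closed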

    def tokens(self, text, pos = 0):
        """
        Iterate over the LilyPond tokens in the string.
        All returned tokens are a subclass of unicode.
        When they are reassembled, the original string is restored (i.e. no
        data is lost).
        The tokenizer does its best to parse LilyPond input and return
        meaningful strings. It recognizes being in a Scheme context, and also
        "LilyPond in Scheme" (the #{ and #} constructs).
        """
        if self.incomplete:
            # last token parsed was incomplete, continue now
            m = self.incomplete.end_rx.match(text, pos)
            if m:
                # we found the end
                yield self.incomplete(m, self)
                pos = m.end()
                self.incomplete = None
            else:
                # the incomplete token is still continuing...
                if pos < len(text):
                    token = unicode.__new__(self.incomplete, text[pos:])
                    token.pos, token.end = pos, len(text)
                    yield token
                return
                    
        tokenClass = None
        m = self.parser().parse(text, pos)
        while m:
            if pos < m.start():
                yield self.Unparsed(text[pos:m.start()], pos)
            tokenClass = getattr(self, m.lastgroup)
            yield tokenClass(m, self)
            pos = m.end()
            m = self.parser().parse(text, pos)
        if pos < len(text):
            yield self.Unparsed(text[pos:], pos)
        if tokenClass and issubclass(tokenClass, self.Incomplete):
            self.incomplete = tokenClass
    
    def freeze(self):
        """
        Returns the frozen state of this tokenizer as an immutable tuple.
        """
        state = tuple((
                    parser.__class__,
                    parser.token,
                    parser.level,
                    parser.argcount,
                ) for parser in self.state)
        return state, self.incomplete, self.language
            
    def thaw(self, frozenState):
        """
        Accepts a tuple such as returned by freeze(), and restores
        the state of this tokenizer from it.
        """
        state, self.incomplete, self.language = frozenState
        self.state = []
        for cls, token, level, argcount in state:
            parser = cls(token, argcount)
            parser.level = level
            self.state.append(parser)
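
    # A minimal sketch of freeze()/thaw() (illustrative; `text` and `n` are
    # placeholders), e.g. for tokenizing a document line by line while
    # remembering the state each line starts in:
    #
    #   frozen = {}
    #   for num, line in enumerate(text.splitlines(True)):
    #       frozen[num] = tokenizer.freeze()
    #       for token in tokenizer.tokens(line):
    #           pass
    #   # later, re-tokenize from line n using the state it started in:
    #   tokenizer.thaw(frozen[n])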
    
    
    # Classes that represent pieces of lilypond text:
    # base classes:
    class Token(unicode):
        """
        Represents a parsed piece of LilyPond text, the subclass determines
        the type.
        The matchObj delivers the string and the position.
        The state can be manipulated on instantiation.
        """
        def __new__(cls, matchObj, tokenizer):
            obj = unicode.__new__(cls, matchObj.group())
            obj.pos, obj.end = matchObj.span()
            return obj

    class Item(Token):
        """
        Represents a token that decreases the argument count of its calling
        command.
        """
        def __init__(self, matchObj, tokenizer):
            tokenizer.endArgument()

    class Incomplete(Token):
        """
        Represents an unfinished item, e.g. a string or block comment.
        The end_rx attribute is a regex that can be used to find its end in
        a following piece of text.
        """
        end_rx = None

    class Increaser(Token):
        def __init__(self, matchObj, tokenizer):
            tokenizer.inc()
            
    class Decreaser(Token):
        def __init__(self, matchObj, tokenizer):
            tokenizer.dec()

    class Leaver(Token):
        def __init__(self, matchObj, tokenizer):
            tokenizer.leave()


    # real types of lilypond input
    class Unparsed(Token):
        """
        Represents an unparsed piece of LilyPond text.
        Needs to be given a value and a position (where the string was found).
        """
        def __new__(cls, value, pos):
            obj = unicode.__new__(cls, value)
            obj.pos = pos
            obj.end = pos + len(obj)
            return obj

    class Command(Item):
        rx = r"\\[A-Za-z]+(-[A-Za-z]+)*"

    class String(Item):
        rx = r'"(\\[\\"]|[^"])*"'

    class IncompleteString(Incomplete, String):
        rx = r'".*$'
        end_rx = re.compile(r'(\\[\\"]|[^"])*"', re.DOTALL)
        
    class PitchWord(Item):
        """ A word with just alphanumeric letters """
        rx = r'[A-Za-z]+'
        
    class SchemeToken(Token):
        """ Base class for Scheme tokens. """
        pass
    
    class Scheme(SchemeToken):
        rx = "#"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.SchemeParser, self)

    class Comment(Token):
        """ Base class for LineComment and BlockComment (also Scheme) """
        pass
    
    class LineComment(Comment):
        rx = r'%[^\n]*'
        
    class BlockComment(Comment):
        rx = r'%\{.*?%\}'
        
    class IncompleteBlockComment(Incomplete, Comment):
        rx = r'%\{.*$'
        end_rx = re.compile(r'.*?%\}', re.DOTALL)
        
    class Space(Token):
        rx = r"\s+"

    class Markup(Command):
        rx = r"\\markup\b"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.MarkupParser, self)

    class MarkupLines(Command):
        rx = r"\\markuplines\b"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.MarkupParser, self)
    
    class Include(Command):
        rx = r"\\include\b"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.IncludeParser, self)
    
    class IncludeFile(String):
        pass
    
    class IncludeLanguageFile(IncludeFile):
        rx = r'"({0})\.ly"'.format('|'.join(ly.pitch.pitchInfo.keys()))
        def __init__(self, matchObj, tokenizer):
            tokenizer.language = self[1:-4]
            tokenizer.endArgument()
    
    class OpenDelimiter(Increaser):
        rx = r"<<|\{"
        
    class CloseDelimiter(Decreaser):
        rx = r">>|\}"

    class Dynamic(Token):
        rx = r"\\[<>!]"

    class VoiceSeparator(Token):
        rx = r"\\\\"

    class Articulation(Token):
        rx = "[-_^][_.>|+^-]"
        
    class EndSchemeLily(Leaver):
        rx = r"#\}"

    class SchemeOpenParenthesis(Increaser, SchemeToken):
        rx = r"\("

    class SchemeCloseParenthesis(Decreaser, SchemeToken):
        rx = r"\)"

    class SchemeQuote(SchemeToken):
        rx = r"[',`]"
    
    class SchemeChar(Item, SchemeToken):
        rx = r'#\\([a-z]+|.)'

    class SchemeWord(Item, SchemeToken):
        rx = r'[^()"{}\s]+'

    class SchemeLineComment(Comment, SchemeToken):
        rx = r";[^\n]*"
    
    class SchemeBlockComment(Comment, SchemeToken):
        rx = r"#!.*?!#"
        
    class IncompleteSchemeBlockComment(Incomplete, Comment, SchemeToken):
        rx = r"#!.*$"
        end_rx = re.compile(r'.*?!#', re.DOTALL)
        
    class SchemeLily(Token):
        rx = "#\{"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.ToplevelParser, self)

    class OpenBracket(Increaser):
        rx = r"\{"

    class CloseBracket(Decreaser):
        rx = r"\}"

    class MarkupScore(Command):
        rx = r"\\score\b"
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.ToplevelParser, self, 1)
            
    class MarkupCommand(Command):
        def __init__(self, matchObj, tokenizer):
            command = self[1:]
            if command in ly.words.markupcommands_nargs[0]:
                tokenizer.endArgument()
            else:
                for argcount in 2, 3, 4:
                    if command in ly.words.markupcommands_nargs[argcount]:
                        break
                else:
                    argcount = 1
                tokenizer.enter(tokenizer.MarkupParser, self, argcount)

    class MarkupWord(Item):
        rx = r'[^{}"\\\s#]+'

    class LyricMode(Command):
        rx = r'\\(lyricmode|((old)?add)?lyrics|lyricsto)\b'
        def __init__(self, matchObj, tokenizer):
            if self == "\\lyricsto":
                argcount = 2
            else:
                argcount = 1
            tokenizer.enter(tokenizer.LyricModeParser, self, argcount)
            
    class ChordMode(Command):
        rx = r'\\(chords|chordmode)\b'
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.ChordModeParser, self)

    class FigureMode(Command):
        rx = r'\\(figures|figuremode)\b'
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.FigureModeParser, self)

    class NoteMode(Command):
        rx = r'\\(notes|notemode)\b'
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.NoteModeParser, self)

    class LyricWord(Item):
        rx = r'[^\W\d]+'
        
    class Section(Command):
        """Introduce a section with no music, like \\layout, etc."""
        rx = r'\\(with|layout|midi|paper|header)\b'
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.SectionParser, self)

    class Context(Command):
        """ Introduce a \context section within layout, midi. """
        rx = r'\\context\b'
        def __init__(self, matchObj, tokenizer):
            tokenizer.enter(tokenizer.ContextParser, self)
            

    ### Parsers
    class Parser(object):
        """
        This is the base class for parsers.  The Tokenizer's metaclass
        looks for descendants of this class and creates parsing patterns.
        """
        pattern = None  # This is filled in by the Tokenizer's meta class.
        items = staticmethod(lambda cls: ())
        argcount = 0
        
        def __init__(self, token = None, argcount = None):
            self.level = 0
            self.token = token
            if argcount is not None:
                self.argcount = argcount

        def parse(self, text, pos):
            return self.pattern.search(text, pos)


    # base stuff to parse in LilyPond input
    lilybaseItems = classmethod(lambda cls: (
        cls.BlockComment,
        cls.IncompleteBlockComment,
        cls.LineComment,
        cls.String,
        cls.IncompleteString,
        cls.EndSchemeLily,
        cls.Scheme,
        cls.Section,
        cls.LyricMode,
        cls.ChordMode,
        cls.FigureMode,
        cls.NoteMode,
        cls.Markup,
        cls.MarkupLines,
        cls.Include,
        cls.Command,
        cls.Space,
    ))
    
    class ToplevelParser(Parser):
        items = staticmethod(lambda cls: (
            cls.OpenDelimiter,
            cls.CloseDelimiter,
            cls.PitchWord,
            cls.Dynamic,
            cls.VoiceSeparator,
            cls.Articulation,
        ) + cls.lilybaseItems())
    
    class SchemeParser(Parser):
        argcount = 1
        items = staticmethod(lambda cls: (
            cls.String,
            cls.IncompleteString,
            cls.SchemeChar,
            cls.SchemeQuote,
            cls.SchemeLineComment,
            cls.SchemeBlockComment,
            cls.IncompleteSchemeBlockComment,
            cls.SchemeOpenParenthesis,
            cls.SchemeCloseParenthesis,
            cls.SchemeLily,
            cls.SchemeWord,
            cls.Space,
        ))
    
    class MarkupParser(Parser):
        argcount = 1
        items = staticmethod(lambda cls: (
            cls.MarkupScore,
            cls.MarkupCommand,
            cls.OpenBracket,
            cls.CloseBracket,
            cls.MarkupWord,
        ) + cls.lilybaseItems())
        
    class InputModeParser(Parser):
        """
        Abstract base class for input modes such as \lyricmode, \figuremode,
        \chordmode etc.
        """
        argcount = 1

    class LyricModeParser(InputModeParser):
        items = staticmethod(lambda cls: (
            cls.OpenBracket,
            cls.CloseBracket,
            cls.LyricWord,
        ) + cls.lilybaseItems())

    class ChordModeParser(ToplevelParser, InputModeParser):
        argcount = 1

    class FigureModeParser(ToplevelParser, InputModeParser):
        argcount = 1

    class NoteModeParser(ToplevelParser, InputModeParser):
        argcount = 1

    class SectionParser(Parser):
        argcount = 1
        items = staticmethod(lambda cls: (
            cls.OpenBracket,
            cls.CloseBracket,
            cls.Context,
        ) + cls.lilybaseItems())
    
    class ContextParser(Parser):
        argcount = 1
        items = staticmethod(lambda cls: (
            cls.OpenBracket,
            cls.CloseBracket,
        ) + cls.lilybaseItems())

    class IncludeParser(Parser):
        argcount = 1
        items = staticmethod(lambda cls: (
            cls.IncludeLanguageFile,
            cls.IncludeFile,
        ) + cls.lilybaseItems())


class MusicTokenizer(Tokenizer):
    """
    A Tokenizer geared more towards parsing music.
    It detects full pitches, chords, etc.
    """
    class OpenChord(Tokenizer.Token):
        rx = "<"
        
    class CloseChord(Tokenizer.Token):
        rx = ">"
        
    class Pitch(Tokenizer.Item):
        rx = ly.rx.named_pitch
        def __init__(self, matchObj, tokenizer):
            self.step = matchObj.group('step')
            self.octave = matchObj.group('octave') or ''
            self.cautionary = matchObj.group('cautionary') or ''
            self.octcheck = matchObj.group('octcheck') or ''
        
    class ToplevelParser(Tokenizer.ToplevelParser):
        items = staticmethod(lambda cls: (
            cls.OpenDelimiter,
            cls.CloseDelimiter,
            cls.OpenChord,
            cls.CloseChord,
            cls.Pitch,
            cls.Dynamic,
            cls.VoiceSeparator,
            cls.Articulation,
        ) + cls.lilybaseItems())
    
    class ChordModeParser(ToplevelParser, Tokenizer.ChordModeParser): pass
    class NoteModeParser(ToplevelParser, Tokenizer.NoteModeParser): pass

    def readStep(self, pitchToken):
        return ly.pitch.pitchReader[self.language](pitchToken.step)
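
    # A minimal usage sketch (illustrative; `text` is a placeholder): iterate
    # with a MusicTokenizer and let readStep() interpret the pitch names
    # according to the pitch language that is current at that point:
    #
    #   tokenizer = MusicTokenizer()
    #   for token in tokenizer.tokens(text):
    #       if isinstance(token, MusicTokenizer.Pitch):
    #           step = tokenizer.readStep(token)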


class LineColumnMixin(object):
    """
    Mixin to iterate over tokens, adding line and column attributes
    to every token.
    """
    def tokens(self, text, pos = 0):
        cursor = Cursor()
        if pos:
            cursor.walk(text[:pos])
        for token in super(LineColumnMixin, self).tokens(text, pos):
            token.line = cursor.line
            token.column = cursor.column
            yield token
            cursor.walk(token)


class LineColumnTokenizer(LineColumnMixin, Tokenizer):
    """
    Basic Tokenizer which records line and column, adding those
    as attributes to every token.
    """
    pass
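
# A minimal usage sketch (illustrative; `text` is a placeholder): tokens from
# a LineColumnTokenizer carry .line and .column (both 0-based) in addition to
# .pos and .end:
#
#   tokenizer = LineColumnTokenizer()
#   for token in tokenizer.tokens(text):
#       print token.line, token.column, repr(token)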


class Cursor(object):
    """
    A Cursor instance can walk() over any piece of plain text,
    maintaining line and column positions by looking at newlines in the text.

    Subclass this to let a ChangeList perform changes on the instance.
    The actions are called in sorted order, but the cursor positions
    reflect the updated state of the document.
    """
    def __init__(self, other = None, column = 0):
        if isinstance(other, Cursor):
            self.line = other.line
            self.column = other.column
            self.anchorLine = other.anchorLine
            self.anchorColumn = other.anchorColumn
        else:
            self.line = other or 0
            self.column = column
            self.anchorLine = 0
            self.anchorColumn = 0
    
    def walk(self, text):
        lines = text.count('\n')
        if lines:
            self.line += lines
            self.column = len(text) - text.rfind('\n') - 1
        else:
            self.column += len(text)
    
    def anchor(self, text):
        """
        Sets the anchor to the end of text.
        """
        lines = text.count('\n')
        if lines:
            self.anchorLine = self.line + lines
            self.anchorColumn = len(text) - text.rfind('\n') - 1
        else:
            self.anchorLine = self.line
            self.anchorColumn = self.column + len(text)

    def __enter__(self):
        """ Called before edits are made. """
        pass
    
    def __exit__(self, *args):
        """ Called after edits have been done. """
        pass
    
    def insertText(self, text):
        """ Insert text at current cursor position. """
        pass
    
    def removeText(self):
        """ Delete text from current position to anchor. """
        pass
    
    def replaceText(self, text):
        """ Replace text from current position to anchor with text. """
        pass
        

class ChangeList(object):
    """
    Manages a list of changes to a string.
    Each entry is a tuple (pos, end, text).
    
    pos and end define the slice of the original string to remove, text
    is the text to insert at that position.
    """
    
    # whether our items must be sorted.
    # If False, the user must add the changes in the correct order!
    sortItems = True
    
    def __init__(self, text):
        self._changes = []
        self._text = text
        
    def replace(self, pos, end, text):
        if text != self._text[pos:end]:
            self._changes.append((pos, end, text))
        
    def replaceToken(self, token, text):
        if token != text:
            self._changes.append((token.pos, token.end, text))
            
    def remove(self, pos, end):
        self._changes.append((pos, end, None))
    
    def removeToken(self, token):
        self._changes.append((token.pos, token.end, None))
        
    def insert(self, pos, text):
        self._changes.append((pos, pos, text))
        
    def changes(self):
        """
        Return an iterator over the changes.
        """
        if self.sortItems:
            return sorted(self._changes)
        else:
            return self._changes
        
    def apply(self):
        """
        Return a new string constructed from the original string
        with all the changes applied.
        """
        def parts():
            index = 0
            for pos, end, text in self.changes():
                if pos > index:
                    yield self._text[index:pos]
                if text:
                    yield text
                index = end
            if index < len(self._text):
                yield self._text[index:]
        
        return ''.join(parts())

    def applyToCursor(self, cursor):
        index = 0
        with cursor:
            for pos, end, text in self.changes():
                if pos > index:
                    cursor.walk(self._text[index:pos])
                if end > pos:
                    cursor.anchor(self._text[pos:end])
                    if text:
                        cursor.replaceText(text)
                        cursor.walk(text)
                    else:
                        cursor.removeText()
                else:
                    cursor.insertText(text)
                    cursor.walk(text)
                index = end
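

if __name__ == "__main__":
    # Small illustrative demonstration of ChangeList: collect a few changes
    # against a string and apply them all at once.
    changes = ChangeList("c d e f")
    changes.replace(2, 3, "dis")    # replace the slice [2:3] (the "d")
    changes.insert(7, " g")         # insert at position 7 (the end)
    print changes.apply()           # prints: c dis e f g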