File: tokenize-backtrack.lexc

package info (click to toggle)
hfst 3.16.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 14,532 kB
  • sloc: cpp: 101,875; sh: 6,717; python: 5,225; yacc: 4,985; lex: 2,900; makefile: 2,017; xml: 6
file content (65 lines) | stat: -rw-r--r-- 1,704 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
! Declares the symbols that lexc should treat as single units instead of
! splitting them into individual characters. "% " is an escaped space, so the
! tag symbols below all begin with a literal space (" N", " Prop", ...).
! The two @PMATCH_...@ symbols are the markers used by the regex entries in
! the lexicons below.
Multichar_Symbols

% N
% Prop
% V
% Abbr
% PUNCT
@PMATCH_INPUT_MARK@
@PMATCH_BACKTRACK@
% ErrSpace
%#
% Adv
% Num
% Prn
% Cmp

! Entry lexicon. Plain entries have the form  upper:lower CONTINUATION ;
! (text left of the colon is the upper side, text right of it the lower side),
! and "#" as continuation ends the word. Entries in angle brackets are regular
! expressions: {abc} is the string a b c, "X" is one symbol, and X:0 puts X on
! the upper side with nothing on the lower side.
LEXICON Root

skuvla% N:skuvla #;
busse% N:busse #;
busset% V:busse #;
! Lower side "skuvla busse": the single space is paired with the input mark,
! a backtrack marker follows "skuvla" on the upper side, and the tags " N" and
! " ErrSpace" appear on the upper side only.
< {skuvla} "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":" " {busse} " N":0 " ErrSpace":0 > # ;

logi% Num:logi #;
guokte% Num:guokte #;
njeallje% Num:njeallje #;
! Two routes from "lo" to the GI lexicon: directly, and through UNSPACEGI,
! which adds an input mark on the upper side before continuing to GI.
lo% Cmp#:lo GI;
lo% Num:lo UNSPACEGI;
! Silly example, but just to test that spaces on both sides work, as well as multiple backtracking points:
! (The lower side has three spaces between "njeallje" and "logi" -- one paired
! with the input mark plus two from 0:" " -- and one space before "guokte".)
< {njeallje} "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":" " 0:" " 0:" " {logi} "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":" " {guokte} " Num":0 " ErrSpace":0 > # ;

Jan% N% Prop:Jan #;
.% PUNCT:. #;
! "Jan" on both sides, then continue into DOTABBR for the abbreviation readings.
Jan DOTABBR;

su% Prn:su #;
! "su" tagged Adv Abbr, then continue into DOTNOTAG for the following dot.
su% Adv% Abbr:su DOTNOTAG;


! Continuation lexicon reached from the "Jan DOTABBR;" entry in Root.
! Both entries below put exactly one "." on the lower side and the tags
! " N" and " Abbr" on the upper side only.
LEXICON DOTABBR
! This one should create a new backtrack-point before the dot, but *no* sub-reading:
< "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":0 {.}   " N":0 " Abbr":0 > # ;
! This one should create a new backtrack-point before the dot, *as well as* a sub-reading:
! (The first "." is upper-side only; after a second input mark the "." appears
! on both sides and is tagged " PUNCT".)
< "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":0 {.}:0 " N":0 " Abbr":0 "@PMATCH_INPUT_MARK@":0 {.} " PUNCT":0 > # ;

! Want:
! (presumably the expected tokenizer output for the input "Jan." -- confirm
! against the test that uses this lexicon)
!"<Jan.>"
!	"." PUNCT "<.>"
!		"Jan." N Abbr "<Jan>"
!	"Jan." N Abbr
!	"." PUNCT "<.>"
!		"Jan" N Prop "<Jan>"

! Continuation lexicon reached from the "su% Adv% Abbr:su DOTNOTAG;" entry in
! Root. The active entry puts "." on both sides, " PUNCT" on the upper side
! only, and the three markers on the upper side only.
LEXICON DOTNOTAG
! As the sub-reading-line of DOTABBR, but the two input-marks are right next to each other:
! NOTE(review): in the active entry the two input marks are separated by
! @PMATCH_BACKTRACK@ (no string or tag material between them); only the
! commented-out variant further down has them directly adjacent.
< "@PMATCH_INPUT_MARK@":0 "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":0 {.} " PUNCT":0 > # ;
! TODO: This doesn't give the sub-reading, however; should it?
!< "@PMATCH_BACKTRACK@":0 "@PMATCH_INPUT_MARK@":0 "@PMATCH_INPUT_MARK@":0 {.} " PUNCT":0 > # ;


! Reached from "lo% Num:lo UNSPACEGI;" in Root. Adds @PMATCH_INPUT_MARK@ on
! the upper side (nothing on the lower side) and then continues into GI.
LEXICON UNSPACEGI
@PMATCH_INPUT_MARK@:0 GI;

! Final part "gi": upper side "gi" plus the " N" tag, lower side "gi", then
! end of word. Reached directly from Root ("lo% Cmp#:lo GI;") and via UNSPACEGI.
LEXICON GI
gi% N:gi #;