File: tokenize-backtrack.pmscript

package info (click to toggle)
hfst 3.16.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 14,532 kB
  • sloc: cpp: 101,875; sh: 6,717; python: 5,225; yacc: 4,985; lex: 2,900; makefile: 2,017; xml: 6
file content (16 lines) | stat: -rw-r--r-- 683 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
! Tokenizer grammar: a token is either a word recognised by the morphology
! or an alphabetic word the morphology does not know, in both cases
! delimited by blanks or by the input boundary.

! Precompiled morphological analyser, read from a binary transducer file.
Define morphology @bin"tokenize-backtrack.hfst" ;

! Delimiter class: a whitespace or punctuation character.
Define blank Whitespace | Punct ;

! Disabled alternative, kept for reference:
! Define incondword       morphology & [ Punct 0:?* ] ;

! A morphology match, accepted only when both its left and right context
! are a blank or the input boundary (#).
Define morphoword morphology LC([blank | #]) RC([blank | #]) ;

! Symbols that may make up an unknown word.
! NOTE(review): double-quoted strings are normally single symbols in pmatch,
! so "a-z" / "A-Z" may denote the literal symbols a-z / A-Z rather than
! character ranges -- confirm whether LowercaseAlpha / UppercaseAlpha (or
! Alpha) was intended before changing anything here.
Define alphabet
    "a-z" | {á} | {š} | {ž} | {č} | {đ} | {ŋ} | {æ} | {ø} | {å}
  | "A-Z" | {Á} | {Š} | {Ž} | {Č} | {Đ} | {Ŋ} | {Æ} | {Ø} | {Å} ;

! One or more alphabet symbols in a row.
Define alphaword alphabet+ ;

! Alphabetic strings minus everything on the upper side of the morphology,
! i.e. words the analyser has no entry for.
Define unknownform [ [alphaword].u - [morphology].u ] ;

! An unknown word paired with the empty string (unknownform:0), under the
! same boundary conditions as morphoword.
Define unknownwordEmpty unknownform:0 LC([blank | #]) RC([blank | #]) ;

! Disabled variant that also included incondword:
! Define token [ morphoword | unknownwordEmpty | incondword ] EndTag(token);

! Either kind of word, with the match tagged as "token".
Define token [ morphoword | unknownwordEmpty ] EndTag(token) ;

! Root expression of the matcher.
Define TOP token ;