File: manual.html

package info (click to toggle)
jflex 1.7.0-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 13,944 kB
  • sloc: java: 421,255; xml: 1,130; makefile: 123; lisp: 90; yacc: 65; sh: 13
file content (1338 lines) | stat: -rw-r--r-- 128,812 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta http-equiv="Content-Style-Type" content="text/css" />
  <meta name="generator" content="pandoc" />
  <title>JFlex User’s Manual</title>
  <style type="text/css">code{white-space: pre;}</style>
  <link rel="stylesheet" href="manual.css" type="text/css" />
</head>
<body>
<div id="header">
<h1 class="title">JFlex User’s Manual</h1>
</div>
<center>
<div class="figure">
<img src="fig/jflex-black.png" alt="JFlex logo" />

</div>
<p>The Fast Lexical Analyser Generator</p>
<p>Copyright © 1998–2018 by <a href="http://www.doclsf.de">Gerwin Klein</a>, Steve Rowe, and <a href="http://regis.decamps.info/">Régis Décamps</a>.</p>
Version 1.7.0, 21 September 2018
</center>
<h1 id="Intro">Introduction</h1>
<p>JFlex is a lexical analyser generator for Java<a href="#fn1" class="footnoteRef" id="fnref1"><sup>1</sup></a> written in Java. It is also a rewrite of the tool JLex <span class="citation">(Berk 1996)</span> which was developed by Elliot Berk at Princeton University. As Vern Paxson states for his C/C++ tool flex <span class="citation">(Paxson 1995)</span>: they do not share any code though.</p>
<p>A lexical analyser generator takes as input a specification with a set of regular expressions and corresponding actions. It generates a program (a <em>lexer</em>) that reads input, matches the input against the regular expressions in the spec file, and runs the corresponding action if a regular expression matched. Lexers usually are the first front-end step in compilers, matching keywords, comments, operators, etc, and generating an input token stream for parsers. They can also be used for many other purposes.</p>
<h2 id="design-goals">Design goals</h2>
<p>The main design goals of JFlex are:</p>
<ul>
<li><strong>Unicode support</strong></li>
<li><strong>Fast generated scanners</strong></li>
<li><strong>Fast scanner generation</strong></li>
<li><strong>Convenient specification syntax</strong></li>
<li><strong>Platform independence</strong></li>
<li><strong>JLex compatibility</strong></li>
</ul>
<h2 id="about-this-manual">About this manual</h2>
<p>This manual gives a brief but complete description of the tool JFlex. It assumes that you are familiar with the topic of lexical analysis in parsing. The references <span class="citation">Aho, Sethi, and Ullman (1986)</span> and <span class="citation">Appel (1998)</span> provide a good introduction.</p>
<p>The next section of this manual describes <a href="#Installing">installation procedures</a> for JFlex. <a href="#Example">Working with JFlex - an example</a> runs through an example specification and explains how it works. The section on <a href="#Specifications">Lexical specifications</a> presents all JFlex options and the complete specification syntax; <a href="#sec:encodings">Encodings, Platforms, and Unicode</a> provides information about Unicode and scanning text vs. binary files. <a href="#performance">A few words on performance</a> gives tips on how to write fast scanners. The section on <a href="#Porting">porting scanners</a> shows how to port scanners from JLex, and from the <code>lex</code> and <code>flex</code> tools for C. Finally, <a href="#WorkingTog">working together</a> discusses interfacing JFlex scanners with the LALR parser generators CUP, CUP2, BYacc/J, and Jay.</p>
<h1 id="Installing">Installing and Running JFlex</h1>
<h2 id="installing-jflex">Installing JFlex</h2>
<h3 id="windows">Windows</h3>
<p>To install JFlex on Windows, follow these three steps:</p>
<ol style="list-style-type: decimal">
<li>Unzip the file you downloaded into the directory you want JFlex in. If you unzipped it to say <code>C:\</code>, the following directory structure should be generated:</li>
</ol>
<pre><code>    C:\jflex-1.7.0\ 
        +--bin\                        (start scripts) 
        +--doc\                        (FAQ and manual) 
        +--examples\ 
            +--byaccj\                 (calculator example for BYacc/J) 
            +--cup-maven\              (calculator example for cup and maven) 
            +--interpreter\            (interpreter example for cup) 
            +--java\                   (Java lexer specification) 
            +--simple-maven\           (example scanner built with maven) 
            +--standalone-maven\       (a simple standalone scanner, 
                                        built with maven) 
            +--zero-reader\            (Readers that return 0 characters) 
        +--lib\                        (precompiled classes) 
        +--src\ 
            +--main\ 
                +--config\             (PMD source analyzer configuration) 
                +--cup\                (JFlex parser spec) 
                +--java\ 
                    +--jflex\          (source code of JFlex) 
                        +--anttask\    (source code of JFlex Ant Task) 
                        +--gui\        (source code of JFlex UI classes) 
                        +--unicode\    (source code for Unicode properties) 
                +--jflex\              (JFlex scanner spec) 
                +--resources\          (messages and default skeleton file) 
            +--test\                   (unit tests)</code></pre>
<ol start="2" style="list-style-type: decimal">
<li><p>Edit the file <strong><code>bin\jflex.bat</code></strong> (in the example it’s <code>C:\jflex-1.7.0\bin\jflex.bat</code>) such that</p>
<ul>
<li><p><strong><code>JAVA_HOME</code></strong> contains the directory where your Java JDK is installed (for instance <code>C:\java</code>) and</p></li>
<li><p><strong><code>JFLEX_HOME</code></strong> the directory that contains JFlex (in the example: <code>C:\jflex-1.7.0</code>)</p></li>
</ul></li>
<li><p>Include the <code>bin\</code> directory of JFlex in your path (the one that contains the start script, in the example: <code>C:\jflex-1.7.0\bin</code>).</p></li>
</ol>
<h3 id="macunix-with-tar">Mac/Unix with tar</h3>
<p>To install JFlex on a Mac or Unix system, follow these two steps:</p>
<ul>
<li><p>Decompress the archive into a directory of your choice with GNU tar, for instance to <code>/usr/share</code>:</p>
<p><code>tar -C /usr/share -xvzf jflex-1.7.0.tar.gz</code></p>
<p>(The example is for site wide installation. You need to be root for that. User installation works exactly the same way — just choose a directory where you have write permission)</p></li>
<li><p>Make a symbolic link from somewhere in your binary path to <code>bin/jflex</code>, for instance:</p>
<p><code>ln -s /usr/share/jflex-1.7.0/bin/jflex /usr/bin/jflex</code></p>
<p>If the Java interpreter is not in your binary path, you need to supply its location in the script <code>bin/jflex</code>.</p></li>
</ul>
<p>You can verify the integrity of the downloaded file with the SHA1 checksum available on the <a href="http://jflex.de/download.html">JFlex download page</a>. If you put the checksum file in the same directory as the archive, and run:</p>
<p><code>shasum --check jflex-1.7.0.tar.gz.sha1</code></p>
<p>it should tell you</p>
<p><code>jflex-1.7.0.tar.gz: OK</code></p>
<h2 id="running-jflex">Running JFlex</h2>
<p>You run JFlex with:</p>
<p><code>jflex &lt;options&gt; &lt;inputfiles&gt;</code></p>
<p>It is also possible to skip the start script in <code>bin/</code> and include the file <code>lib/jflex-1.7.0.jar</code> in your <code>CLASSPATH</code> environment variable instead.</p>
<p>Then you run JFlex with:</p>
<p><code>java jflex.Main &lt;options&gt; &lt;inputfiles&gt;</code></p>
<p>or with:</p>
<p><code>java -jar jflex-1.7.0.jar &lt;options&gt; &lt;inputfiles&gt;</code></p>
<p>The input files and options are in both cases optional. If you don’t provide a file name on the command line, JFlex will pop up a window to ask you for one.</p>
<p>JFlex knows about the following options:</p>
<p><code>-d &lt;directory&gt;</code><br />writes the generated file to the directory <code>&lt;directory&gt;</code></p>
<p><code>--encoding &lt;name&gt;</code><br />uses the character encoding <code>&lt;name&gt;</code> (e.g. <code>utf-8</code>) to read lexer specifications and write java files.</p>
<p><code>--skel &lt;file&gt;</code><br />uses external skeleton <code>&lt;file&gt;</code>. This is mainly for JFlex maintenance and special low level customisations. Use only when you know what you are doing! JFlex comes with a skeleton file in the <code>src</code> directory that reflects exactly the internal, pre-compiled skeleton and can be used with the <code>--skel</code> option.</p>
<p><code>--nomin</code><br />skip the DFA minimisation step during scanner generation.</p>
<p><code>--jlex</code><br />tries even harder to comply with the JLex interpretation of specs.</p>
<p><code>--dot</code><br />generate graphviz dot files for the NFA, DFA and minimised DFA. This feature is still in alpha status, and not fully implemented yet.</p>
<p><code>--dump</code><br />display transition tables of NFA, initial DFA, and minimised DFA</p>
<p><code>--legacydot</code><br />dot (<code>.</code>) meta character matches <code>[^\n]</code> instead of<br /><code>[^\n\r\u000B\u000C\u0085\u2028\u2029]</code></p>
<p><code>--verbose</code> or <code>-v</code><br />display generation progress messages (enabled by default)</p>
<p><code>--quiet</code> or <code>-q</code><br />display error messages only (no chatter about what JFlex is currently doing)</p>
<p><code>--warn-unused</code><br />warn about unused macros (by default true in verbose mode and false in quiet mode)</p>
<p><code>--no-warn-unused</code><br />do not warn about unused macros (by default, these warnings are enabled in verbose mode and disabled in quiet mode)</p>
<p><code>--time</code><br />display time statistics about the code generation process (not very accurate)</p>
<p><code>--version</code><br />print version number</p>
<p><code>--info</code><br />print system and JDK information (useful if you’d like to report a problem)</p>
<p><code>--unicodever &lt;ver&gt;</code><br />print all supported properties for Unicode version <code>&lt;ver&gt;</code></p>
<p><code>--help</code> or <code>-h</code><br />print a help message explaining options and usage of JFlex.</p>
<h2 id="maven-plugin">Maven plugin</h2>
<p>The plugin reads JFlex grammar definition files (<code>.jflex</code>) and generates a corresponding Java parser (in <code>target/generated-sources/jflex</code> by default).</p>
<h3 id="usage">Usage</h3>
<h4 id="minimal-configuration">Minimal configuration</h4>
<p>This configuration generates java code of a parser for all grammar files (<code>*.jflex</code>, <code>*.jlex</code>, <code>*.lex</code>, <code>*.flex</code>) found in <code>src/main/jflex/</code> and its sub-directories.</p>
<p>The name and package of the generated Java source code are the ones defined in the grammar. The generated Java source code is placed in <code>target/generated-sources/jflex</code>, in sub-directories following the Java convention on package names.</p>
<p>Update the <code>pom.xml</code> to add the plugin:</p>
<pre><code>&lt;project&gt;
  &lt;!-- ... --&gt;
  &lt;build&gt;
    &lt;plugins&gt;
      &lt;plugin&gt;
        &lt;groupId&gt;de.jflex&lt;/groupId&gt;
        &lt;artifactId&gt;jflex-maven-plugin&lt;/artifactId&gt;
        &lt;version&gt;1.7.0&lt;/version&gt;
        &lt;executions&gt;
          &lt;execution&gt;
            &lt;goals&gt;
              &lt;goal&gt;generate&lt;/goal&gt;
            &lt;/goals&gt;
          &lt;/execution&gt;
        &lt;/executions&gt;
      &lt;/plugin&gt;
    &lt;/plugins&gt;
    &lt;!-- ... --&gt;
  &lt;/build&gt;
  &lt;!-- ... --&gt;
&lt;/project&gt;</code></pre>
<h4 id="more-complex-configuration">More complex configuration</h4>
<p>This example generates the source for the two grammars <code>src/main/lex/preprocessor.jflex</code> and <code>/pub/postprocessor.jflex</code>, as well as all grammar files found in <code>src/main/jflex</code> (and its sub-directories). The generated Java code is placed into <code>src/main/java</code> instead of <code>target/generated-sources/jflex</code>.</p>
<pre><code>      &lt;plugin&gt;
        &lt;groupId&gt;de.jflex&lt;/groupId&gt;
        &lt;artifactId&gt;jflex-maven-plugin&lt;/artifactId&gt;
        &lt;version&gt;1.7.0&lt;/version&gt;
        &lt;executions&gt;
          &lt;execution&gt;
            &lt;goals&gt;
              &lt;goal&gt;generate&lt;/goal&gt;
            &lt;/goals&gt;
            &lt;configuration&gt;
              &lt;outputDirectory&gt;src/main/java&lt;/outputDirectory&gt;
              &lt;lexDefinitions&gt;
                &lt;lexDefinition&gt;src/main/jflex&lt;/lexDefinition&gt;
                &lt;lexDefinition&gt;src/main/lex/preprocessor.jflex&lt;/lexDefinition&gt;
                &lt;lexDefinition&gt;/pub/postprocessor.jflex&lt;/lexDefinition&gt;
              &lt;/lexDefinitions&gt;
            &lt;/configuration&gt;
          &lt;/execution&gt;
        &lt;/executions&gt;
      &lt;/plugin&gt;</code></pre>
<h4 id="even-more-complex-configuration-using-several-executions">Even more complex configuration, using several executions</h4>
<p>This generates the source for</p>
<ul>
<li><p>all files found in <code>src/main/lex/</code>, using strict JLex compatibility.</p></li>
<li><p>and all files found in <code>src/main/jflex</code>, in verbose mode.</p></li>
</ul>
<pre><code>      &lt;plugin&gt;
        &lt;groupId&gt;de.jflex&lt;/groupId&gt;
        &lt;artifactId&gt;jflex-maven-plugin&lt;/artifactId&gt;
        &lt;version&gt;1.7.0&lt;/version&gt;
        &lt;executions&gt;
          &lt;execution&gt;
            &lt;id&gt;strict jlex&lt;/id&gt;
            &lt;goals&gt;
              &lt;goal&gt;generate&lt;/goal&gt;
            &lt;/goals&gt;
            &lt;configuration&gt;
              &lt;lexDefinitions&gt;
                &lt;lexDefinition&gt;src/main/lex&lt;/lexDefinition&gt;
              &lt;/lexDefinitions&gt;
              &lt;jlex&gt;true&lt;/jlex&gt;
            &lt;/configuration&gt;
          &lt;/execution&gt;
          &lt;execution&gt;
            &lt;id&gt;jflex&lt;/id&gt;
            &lt;goals&gt;
              &lt;goal&gt;generate&lt;/goal&gt;
            &lt;/goals&gt;
            &lt;configuration&gt;
              &lt;lexDefinitions&gt;
                &lt;lexDefinition&gt;src/main/jflex&lt;/lexDefinition&gt;
              &lt;/lexDefinitions&gt;
              &lt;verbose&gt;true&lt;/verbose&gt;
            &lt;/configuration&gt;
          &lt;/execution&gt;
        &lt;/executions&gt;
      &lt;/plugin&gt;</code></pre>
<p>More documentation on the configuration options can be found in the description for the mojo:</p>
<pre><code>mvn help:describe  -DgroupId=de.jflex -DartifactId=jflex-maven-plugin -Ddetail</code></pre>
<p>More information in the <a href="http://maven.apache.org/pom.html#Plugins">POM reference guide on plugins</a>.</p>
<h3 id="versions">Versions</h3>
<p>Which version of the plugin is best for you?</p>
<ul>
<li><p>jflex-maven-plugin-1.7.0 depends on JFlex 1.7.0 and requires Java 7 when you <code>mvn jflex:generate</code></p></li>
<li><p>jflex-maven-plugin-1.6.1 depends on JFlex 1.6.1 and requires Java 5 when you <code>mvn jflex:generate</code></p></li>
<li><p>jflex-maven-plugin-1.5.0 depends on JFlex 1.5.0 and requires Java 5 when you <code>mvn jflex:generate</code></p></li>
<li><p>maven-jflex-plugin-1.4.3-r1 depends on JFlex 1.4.3 and requires Java 1.3 when you <code>mvn jflex:generate</code></p></li>
</ul>
<h2 id="jflex-ant-task">JFlex Ant Task</h2>
<p>JFlex can easily be integrated with the <a href="http://ant.apache.org/">Ant</a> build tool. To use JFlex with Ant, simply copy the <code>lib/jflex-1.7.0.jar</code> file to the <code>$ANT_HOME/lib/</code> directory or explicitly set the path to <code>lib/jflex-1.7.0.jar</code> in the task definition (see example below).</p>
<p>The JFlex Ant Task invokes JFlex on a grammar file.</p>
<p>To use the JFlex task, place the following line in the Ant build file:</p>
<pre><code>&lt;taskdef classname=&quot;jflex.anttask.JFlexTask&quot; name=&quot;jflex&quot; /&gt;</code></pre>
<p>Or, setting the path to the JFlex jar explicitly:</p>
<pre><code>&lt;taskdef classname=&quot;jflex.anttask.JFlexTask&quot; name=&quot;jflex&quot;
         classpath=&quot;path-to-jflex.jar&quot; /&gt;</code></pre>
<p>The JFlex task requires the <code>file</code> attribute to be set to the source grammar file (<code>*.flex</code>). Unless the target directory is specified with the <code>destdir</code> option, the generated class will be saved to the same directory where the grammar file resides. Like <code>javac</code>, the JFlex task creates subdirectories in <code>destdir</code> according to the generated class package.</p>
<p>This task only invokes JFlex if the grammar file is newer than the generated files.</p>
<h3 id="parameters">Parameters</h3>
<p>The following attributes are available for invoking the JFlex task.</p>
<ul>
<li><p><code>file=&quot;file&quot;</code><br />The grammar file to process. This attribute is required.</p></li>
<li><p><code>destdir=&quot;dir&quot;</code><br />The directory to write the generated files to. If not set, the files are written to the directory containing the grammar file. Note that unlike JFlex’s <code>-d</code> command line option, <code>destdir</code> causes the generated file to be written to <code>{destdir}/</code><strong><code>{packagename}</code></strong>. This behaviour is similar to <code>javac -d dir</code>.</p></li>
<li><p><code>outdir=&quot;dir&quot;</code> <br />The directory to write the generated files to. If not set, the files are written to the directory containing the grammar file. This option works exactly like JFlex’s <code>-d</code> command line option; it causes the output file to be written to <code>dir</code> regardless of the package name.</p></li>
<li><p><code>verbose</code> (default <code>&quot;off&quot;</code>)<br />Display generation process messages.</p></li>
<li><p><code>encoding</code> (if unset uses the JVM default encoding)<br />The character encoding to use when reading lexer specifications and writing java files.</p></li>
<li><p><code>dump</code> (default <code>&quot;off&quot;</code>) <br />Dump character classes, NFA and DFA tables.</p></li>
<li><p><code>time</code> or <code>timeStatistics</code> (default <code>&quot;off&quot;</code>)<br />Display generation time statistics.</p></li>
<li><p><code>nomin</code> or <code>skipMinimization</code> (default <code>&quot;off&quot;</code>)<br />Skip DFA minimisation step.</p></li>
<li><p><code>skel=&quot;file&quot;</code> or <code>skeleton=&quot;file&quot;</code><br />Use external skeleton file.</p></li>
<li><p><code>dot</code> or <code>generateDot</code> (default <code>&quot;off&quot;</code>)<br />Write graphviz <code>.dot</code> files for the generated automata.</p></li>
<li><p><code>nobak</code> (default <code>&quot;off&quot;</code>)<br />Do not make a backup if the generated file exists.</p></li>
<li><p><code>jlex</code> (default <code>&quot;off&quot;</code>)<br />Use JLex compatibility mode.</p></li>
<li><p><code>legacydot</code> (default <code>&quot;off&quot;</code>)<br />The dot <code>.</code> meta-character matches <code>[^\n]</code> instead of <code>[^\n\r\u000B\u000C\u0085\u2028\u2029]</code></p></li>
<li><p><code>unusedwarning</code> (default <code>&quot;true&quot;</code>)<br />Warn about unused macro definitions in the lexer specification.</p></li>
</ul>
<h3 id="example">Example</h3>
<p>After the task definition, the <code>&lt;jflex ..&gt;</code> task is available in Ant. For example:</p>
<pre><code> &lt;jflex
     file=&quot;src/parser/Scanner.flex&quot;
     destdir=&quot;build/generated/&quot;
 /&gt;</code></pre>
<p>JFlex generates the scanner for <code>src/parser/Scanner.flex</code> and saves the result to <code>build/generated/parser/</code>, providing <code>Scanner.flex</code> is declared to be in package <code>parser</code>.</p>
<pre><code> &lt;jflex
     file=&quot;src/parser/Scanner.flex&quot;
     destdir=&quot;build/generated/&quot;
 /&gt;
 &lt;javac
     srcdir=&quot;build/generated/&quot;
     destdir=&quot;build/classes/&quot;
 /&gt;</code></pre>
<p>The same as above plus compile generated classes to <code>build/classes</code></p>
<h1 id="Example">A simple Example: How to work with JFlex</h1>
<p>To demonstrate what a lexical specification with JFlex looks like, this section presents a part of the specification for the Java language. The example does not describe the whole lexical structure of Java programs, but only a small and simplified part of it (some keywords, some operators, comments and only two kinds of literals). It also shows how to interface with the LALR parser generator CUP <span class="citation">(Hudson 1996)</span> and therefore uses a class <code>sym</code> (generated by CUP), where integer constants for the terminal tokens of the CUP grammar are declared. JFlex comes with a directory <code>examples</code>, where you can find a small standalone scanner that doesn’t need other tools like CUP to give you working example code without dependencies.</p>
<p>The <code>examples</code> directory also contains a <em>complete</em> JFlex specification of the lexical structure of Java programs together with the CUP parser specification for Java by C. Scott Ananian, obtained from the CUP <span class="citation">(Hudson 1996)</span> web site (modified to interface with the JFlex scanner). Both specifications adhere to the Java Language Specification <span class="citation">(Gosling, Joy, and Steele 1996)</span>.</p>
<pre><code>    /* JFlex example: partial Java language lexer specification */
    import java_cup.runtime.*;

    /**
     * This class is a simple example lexer.
     */
    %%

    %class Lexer
    %unicode
    %cup
    %line
    %column

    %{
      StringBuffer string = new StringBuffer();

      private Symbol symbol(int type) {
        return new Symbol(type, yyline, yycolumn);
      }
      private Symbol symbol(int type, Object value) {
        return new Symbol(type, yyline, yycolumn, value);
      }
    %}

    LineTerminator = \r|\n|\r\n
    InputCharacter = [^\r\n]
    WhiteSpace     = {LineTerminator} | [ \t\f]

    /* comments */
    Comment = {TraditionalComment} | {EndOfLineComment} | {DocumentationComment}

    TraditionalComment   = &quot;/*&quot; [^*] ~&quot;*/&quot; | &quot;/*&quot; &quot;*&quot;+ &quot;/&quot;
    // Comment can be the last line of the file, without line terminator.
    EndOfLineComment     = &quot;//&quot; {InputCharacter}* {LineTerminator}?
    DocumentationComment = &quot;/**&quot; {CommentContent} &quot;*&quot;+ &quot;/&quot;
    CommentContent       = ( [^*] | \*+ [^/*] )*

    Identifier = [:jletter:] [:jletterdigit:]*

    DecIntegerLiteral = 0 | [1-9][0-9]*

    %state STRING

    %%

    /* keywords */
    &lt;YYINITIAL&gt; &quot;abstract&quot;           { return symbol(sym.ABSTRACT); }
    &lt;YYINITIAL&gt; &quot;boolean&quot;            { return symbol(sym.BOOLEAN); }
    &lt;YYINITIAL&gt; &quot;break&quot;              { return symbol(sym.BREAK); }

    &lt;YYINITIAL&gt; {
      /* identifiers */ 
      {Identifier}                   { return symbol(sym.IDENTIFIER); }
     
      /* literals */
      {DecIntegerLiteral}            { return symbol(sym.INTEGER_LITERAL); }
      \&quot;                             { string.setLength(0); yybegin(STRING); }

      /* operators */
      &quot;=&quot;                            { return symbol(sym.EQ); }
      &quot;==&quot;                           { return symbol(sym.EQEQ); }
      &quot;+&quot;                            { return symbol(sym.PLUS); }

      /* comments */
      {Comment}                      { /* ignore */ }
     
      /* whitespace */
      {WhiteSpace}                   { /* ignore */ }
    }

    &lt;STRING&gt; {
      \&quot;                             { yybegin(YYINITIAL); 
                                       return symbol(sym.STRING_LITERAL, 
                                       string.toString()); }
      [^\n\r\&quot;\\]+                   { string.append( yytext() ); }
      \\t                            { string.append(&#39;\t&#39;); }
      \\n                            { string.append(&#39;\n&#39;); }

      \\r                            { string.append(&#39;\r&#39;); }
      \\\&quot;                           { string.append(&#39;\&quot;&#39;); }
      \\                             { string.append(&#39;\\&#39;); }
    }

    /* error fallback */
    [^]                              { throw new Error(&quot;Illegal character &lt;&quot;+
                                                        yytext()+&quot;&gt;&quot;); }</code></pre>
<p>From this specification JFlex generates a <code>.java</code> file with one class that contains code for the scanner. The class will have a constructor taking a <code>java.io.Reader</code> from which the input is read. The class will also have a function <code>yylex()</code> that runs the scanner and that can be used to get the next token from the input (in this example the function actually has the name <code>next_token()</code> because the specification uses the <code>%cup</code> switch).</p>
<p>As with JLex, the specification consists of three parts, divided by <code>%%</code>:</p>
<ul>
<li><a href="#ExampleUserCode">usercode</a>,</li>
<li><a href="#ExampleOptions">options and declarations</a> and</li>
<li><a href="#ExampleLexRules">lexical rules</a>.</li>
</ul>
<h2 id="ExampleUserCode">Code to include</h2>
<p>Let’s take a look at the first section, <em>user code</em>: The text up to the first line starting with <code>%%</code> is copied verbatim to the top of the generated lexer class (before the actual class declaration). Apart from <code>package</code> and <code>import</code> statements there is usually not much to do here. If the code ends with a <code>javadoc</code> class comment, the generated class will get this comment, if not, JFlex will generate one automatically.</p>
<h2 id="ExampleOptions">Options and Macros</h2>
<p>The second section <em>options and declarations</em> is more interesting. It consists of a set of options, code that is included inside the generated scanner class, lexical states and macro declarations. Each JFlex option must begin a line of the specification and starts with a <code>%</code>. In our example the following options are used:</p>
<ul>
<li><p><code>%class Lexer</code> tells JFlex to give the generated class the name <code>Lexer</code> and to write the code to a file <code>Lexer.java</code>.</p></li>
<li><p><code>%unicode</code> defines the set of characters the scanner will work on. For scanning text files, <code>%unicode</code> should always be used. The Unicode version may be specified, e.g. <code>%unicode 4.1</code>. If no version is specified, the most recent supported Unicode version will be used - in JFlex 1.7.0, this is Unicode 9.0. See also <a href="#sec:encodings">Encodings</a> for more information on character sets, encodings, and scanning text vs. binary files.</p></li>
<li><p><code>%cup</code> switches to CUP compatibility mode to interface with a CUP generated parser.</p></li>
<li><p><code>%line</code> switches line counting on (the current line number can be accessed via the variable <code>yyline</code>)</p></li>
<li><p><code>%column</code> switches column counting on (the current column is accessed via <code>yycolumn</code>)</p></li>
</ul>
<p>The code between <code>%{</code> and <code>%}</code> is copied verbatim into the generated lexer class source. Here you can declare member variables and functions that are used inside scanner actions. In our example we declare a <code>StringBuffer</code> <code>string</code> in which we will store parts of string literals and two helper functions <code>symbol</code> that create <code>java_cup.runtime.Symbol</code> objects with position information of the current token (see also <a href="#CUPWork">JFlex and CUP</a> for how to interface with the parser generator CUP). As with all JFlex options, both <code>%{</code> and <code>%}</code> must begin a line.</p>
<p>The specification continues with macro declarations. Macros are abbreviations for regular expressions, used to make lexical specifications easier to read and understand. A macro declaration consists of a macro identifier followed by <code>=</code>, then followed by the regular expression it represents. This regular expression may itself contain macro usages. Although this allows a grammar-like specification style, macros are still just abbreviations and not non-terminals – they cannot be recursive. Cycles in macro definitions are detected and reported at generation time by JFlex.</p>
<p>Here are some of the example macros in more detail:</p>
<ul>
<li><p><code>LineTerminator</code> stands for the regular expression that matches an ASCII <code>CR</code>, an ASCII <code>LF</code> or a <code>CR</code> followed by <code>LF</code>.</p></li>
<li><p><code>InputCharacter</code> stands for all characters that are not a <code>CR</code> or <code>LF</code>.</p></li>
<li><p><code>TraditionalComment</code> is the expression that matches the string <code>/*</code> followed by a character that is not a <code>*</code>, followed by anything that does not contain, but ends in <code>*/</code>. As this would not match comments like <code>/****/</code>, we add <code>/*</code> followed by an arbitrary number (at least one) of <code>*</code> followed by the closing <code>/</code>. This is not the only, but one of the simpler expressions matching non-nesting Java comments. It is tempting to just write something like the expression <code>/* .* */</code>, but this would match more than we want. It would for instance match the entire input <code>/* */ x = 0; /* */</code>, instead of two comments and four real tokens. See the macros <code>DocumentationComment</code> and <code>CommentContent</code> for an alternative.</p></li>
<li><p><code>CommentContent</code> matches zero or more occurrences of any character except a <code>*</code> or any number of <code>*</code> followed by a character that is not a <code>/</code></p></li>
<li><p><code>Identifier</code> matches each string that starts with a character of class <code>jletter</code> followed by zero or more characters of class <code>jletterdigit</code>. <code>jletter</code> and <code>jletterdigit</code> are predefined character classes. <code>jletter</code> includes all characters for which the Java function <code>Character.isJavaIdentifierStart</code> returns <code>true</code> and <code>jletterdigit</code> all characters for which <code>Character.isJavaIdentifierPart</code> returns <code>true</code>.</p></li>
</ul>
<p>The last part of the second section in our lexical specification is a lexical state declaration: <code>%state STRING</code> declares a lexical state <code>STRING</code> that can be used in the <em>lexical rules</em> part of the specification. A state declaration is a line starting with <code>%state</code> followed by a space or comma separated list of state identifiers. There can be more than one line starting with <code>%state</code>.</p>
<h2 id="ExampleLexRules">Rules and Actions</h2>
<p>The <em>lexical rules</em> section of a JFlex specification contains regular expressions and actions (Java code) that are executed when the scanner matches the associated regular expression. As the scanner reads its input, it keeps track of all regular expressions and activates the action of the expression that has the longest match. Our specification above for instance would with input <code>breaker</code> match the regular expression for <code>Identifier</code> and not the keyword <code>break</code> followed by the Identifier <code>er</code>, because rule <code>{Identifier}</code> matches more of this input at once than any other rule in the specification. If two regular expressions both have the longest match for a certain input, the scanner chooses the action of the expression that appears first in the specification. In that way, we get for input <code>break</code> the keyword <code>break</code> and not an Identifier <code>break</code>.</p>
<p>In addition to regular expression matches, one can use lexical states to refine a specification. A lexical state acts like a start condition. If the scanner is in lexical state <code>STRING</code>, only expressions that are preceded by the start condition <code>&lt;STRING&gt;</code> can be matched. A start condition of a regular expression can contain more than one lexical state. It is then matched when the lexer is in any of these lexical states. The lexical state <code>YYINITIAL</code> is predefined and is also the state in which the lexer begins scanning. If a regular expression has no start conditions it is matched in <em>all</em> lexical states.</p>
<p>Since there often are sets of expressions with the same start conditions, they can be grouped:</p>
<pre><code>&lt;STRING&gt; {
  expr1   { action1 }
  expr2   { action2 }
}</code></pre>
<p>means that both <code>expr1</code> and <code>expr2</code> have start condition <code>&lt;STRING&gt;</code>.</p>
<p>The first three rules in our example demonstrate the syntax of a regular expression preceded by the start condition <code>&lt;YYINITIAL&gt;</code>.</p>
<pre><code>&lt;YYINITIAL&gt; &quot;abstract&quot;           { return symbol(sym.ABSTRACT); }</code></pre>
<p>matches the input <code>abstract</code> only if the scanner is in its start state <code>YYINITIAL</code>. When the string <code>abstract</code> is matched, the scanner function returns the CUP symbol <code>sym.ABSTRACT</code>. If an action does not return a value, the scanning process is resumed immediately after executing the action.</p>
<p>The rules enclosed in</p>
<pre><code>&lt;YYINITIAL&gt; { ...</code></pre>
<p>demonstrate the abbreviated syntax and are also only matched in state <code>YYINITIAL</code>.</p>
<p>Of these rules, one is of special interest:</p>
<pre><code>\&quot;  { string.setLength(0); yybegin(STRING); }</code></pre>
<p>If the scanner matches a double quote in state <code>YYINITIAL</code> we have recognised the start of a string literal. Therefore we clear our <code>StringBuffer</code> that will hold the content of this string literal and tell the scanner with <code>yybegin(STRING)</code> to switch into the lexical state <code>STRING</code>. Because we do not yet return a value to the parser, our scanner proceeds immediately.</p>
<p>In lexical state <code>STRING</code> another rule demonstrates how to refer to the input that has been matched:</p>
<pre><code>[^\n\r\&quot;\\]+                   { string.append( yytext() ); }</code></pre>
<p>The expression <code>[^\n\r\&quot;\\]+</code> matches all characters in the input up to the next backslash (indicating an escape sequence such as <code>\n</code>), double quote (indicating the end of the string), or line terminator (which must not occur in a Java string literal). The matched region of the input is referred to by <code>yytext()</code> and appended to the content of the string literal parsed so far.</p>
<p>The last lexical rule in the example specification is used as an error fallback. It matches any character in any state that has not been matched by another rule. It doesn’t conflict with any other rule because it has the least priority (because it’s the last rule) and because it matches only one character (so it can’t have longest match precedence over any other rule).</p>
<h2 id="how-to-get-it-building">How to get it building</h2>
<ul>
<li><p><a href="#Installing">Install JFlex</a></p></li>
<li><p>If you have written your specification file (or chosen one from the <code>examples</code> directory), save it (say under the name <code>java-lang.flex</code>).</p></li>
<li><p>Run JFlex with</p>
<p><code>jflex java-lang.flex</code></p></li>
<li><p>JFlex should then show progress messages about generating the scanner and write the generated code to the directory of your specification file.</p></li>
<li><p>Compile the generated <code>.java</code> file and your own classes. (If you use CUP, generate your parser classes first)</p></li>
<li><p>That’s it.</p></li>
</ul>
<h1 id="Specifications">Lexical Specifications</h1>
<p>As shown above, a lexical specification file for JFlex consists of three parts divided by a single line starting with <code>%%</code>:</p>
<pre><code>UserCode
%%
Options and declarations
%%
Lexical rules</code></pre>
<p>In all parts of the specification comments of the form <code>/* comment text */</code> and Java-style end-of-line comments starting with <code>//</code> are permitted. JFlex comments do nest - so the number of <code>/*</code> and <code>*/</code> should be balanced.</p>
<h2 id="user-code">User code</h2>
<p>The first part contains user code that is copied verbatim to the beginning of the generated source file before the scanner class declaration. As shown in the example spec, this is the place to put <code>package</code> declarations and <code>import</code> statements. It is possible, but not considered good Java style to put helper classes, such as token classes, into this section; they are usually better declared in their own <code>.java</code> files.</p>
<h2 id="options-and-declarations">Options and declarations</h2>
<p>The second part of the lexical specification contains options and directives to customise the generated lexer, declarations of <a href="#StateDecl">lexical states</a> and <a href="#MacroDefs">macro definitions</a>.</p>
<p>Each JFlex directive must sit at the beginning of a line and starts with the <code>%</code> character. Directives that have one or more parameters are described as follows.</p>
<pre><code>%class &quot;classname&quot;</code></pre>
<p>means that you start a line with <code>%class</code> followed by a space followed by the name of the class for the generated scanner (the double quotes are <em>not</em> to be entered, see also the <a href="#Example">example specification</a>).</p>
<h3 id="ClassOptions">Class options and user class code</h3>
<p>These options regard name, constructor, API, and related parts of the generated scanner class.</p>
<ul>
<li><p><code>%class &quot;classname&quot;</code></p>
<p>Tells JFlex to give the generated class the name <code>classname</code> and to write the generated code to a file <code>classname.java</code>. If the <code>-d &lt;directory&gt;</code> command line option is not used, the code will be written to the directory where the specification file resides. If no <code>%class</code> directive is present in the specification, the generated class will get the name <code>Yylex</code> and will be written to a file <code>Yylex.java</code>. There should be only one <code>%class</code> directive in a specification.</p></li>
<li><p><code>%implements &quot;interface 1&quot;[, &quot;interface 2&quot;, ..]</code></p>
<p>Makes the generated lexer class implement the specified interfaces. If more than one <code>%implements</code> directive is present, all specified interfaces will be implemented.</p></li>
<li><p><code>%extends &quot;classname&quot;</code></p>
<p>Makes the generated class a subclass of the class <code>classname</code>. There should be only one <code>%extends</code> directive in a specification.</p></li>
<li><p><code>%public</code></p>
<p>Makes the generated class public (the class is only accessible in its own package by default).</p></li>
<li><p><code>%final</code></p>
<p>Makes the generated class final.</p></li>
<li><p><code>%abstract</code></p>
<p>Makes the generated class abstract.</p></li>
<li><p><code>%apiprivate</code></p>
<p>Makes all generated methods and fields of the class private. Exceptions are the constructor, user code in the specification, and, if <code>%cup</code> is present, the method <code>next_token</code>. All occurrences of <code>public</code> (one space character before and after <code>public</code>) in the skeleton file are replaced by <code>private</code> (even if a user-specified skeleton is used). Access to the generated class is expected to be mediated by user class code (see next switch).</p></li>
<li><p><code>%{</code><br /><code>...</code><br /><code>%}</code></p>
<p>The code enclosed in <code>%{</code> and <code>%}</code> is copied verbatim into the generated class. Here you can define your own member variables and functions in the generated scanner. Like all options, both <code>%{</code> and <code>%}</code> must start a line in the specification. If more than one class code directive <code>%{...%}</code> is present, the code is concatenated in order of appearance in the specification.</p></li>
<li><p><code>%init{</code><br /><code>...</code><br /><code>%init}</code></p>
<p>The code enclosed in <code>%init{</code> and <code>%init}</code> is copied verbatim into the constructor of the generated class. Here, member variables declared in the <code>%{...%}</code> directive can be initialised. If more than one initialiser option is present, the code is concatenated in order of appearance in the specification.</p></li>
<li><p><code>%initthrow{</code><br /><code>&quot;exception1&quot;[, &quot;exception2&quot;, ...]</code><br /><code>%initthrow}</code></p>
<p>or (on a single line) just</p>
<p><code>%initthrow &quot;exception1&quot; [, &quot;exception2&quot;, ...]</code></p>
<p>Causes the specified exceptions to be declared in the <code>throws</code> clause of the constructor. If more than one <code>%initthrow{</code> <code>...</code> <code>%initthrow}</code> directive is present in the specification, all specified exceptions will be declared.</p></li>
<li><p><code>%ctorarg &quot;type&quot; &quot;ident&quot;</code></p>
<p>Adds the specified argument to the constructors of the generated scanner. If more than one such directive is present, the arguments are added in order of occurrence in the specification. Note that this option conflicts with the <code>%standalone</code> and <code>%debug</code> directives, because there is no sensible default that can be created automatically for such parameters in the generated <code>main</code> methods. JFlex will warn in this case and generate an additional default constructor without these parameters and without user init code (which might potentially refer to the parameters).</p></li>
<li><p><code>%scanerror &quot;exception&quot;</code></p>
<p>Causes the generated scanner to throw an instance of the specified exception in case of an internal error (default is <code>java.lang.Error</code>). Note that this exception is only for internal scanner errors. With usual specifications it should never occur (i.e. if there is an error fallback rule in the specification and only the documented scanner API is used).</p></li>
<li><p><code>%buffer &quot;size&quot;</code></p>
<p>Set the initial size of the scan buffer to the specified value (decimal, in bytes). The default value is 16384.</p></li>
<li><p><code>%include &quot;filename&quot;</code></p>
<p>Replaces the <code>%include</code> verbatim by the specified file.</p></li>
</ul>
<h3 id="scanning-method">Scanning method</h3>
<p>This section shows how the scanning method can be customised. You can redefine the name and return type of the method and it is possible to declare exceptions that may be thrown in one of the actions of the specification. If no return type is specified, the scanning method will be declared as returning values of class <code>Yytoken</code>.</p>
<ul>
<li><p><code>%function &quot;name&quot;</code></p>
<p>Causes the scanning method to get the specified name. If no <code>%function</code> directive is present in the specification, the scanning method gets the name <code>yylex</code>. This directive overrides settings of the <code>%cup</code> switch. The default name of the scanning method with the <code>%cup</code> switch is <code>next_token</code>. Overriding this name might lead to the generated scanner being implicitly declared as <code>abstract</code>, because it does not provide the method <code>next_token</code> of the interface <code>java_cup.runtime.Scanner</code>. It is of course possible to provide a dummy implementation of that method in the class code section if you still want to override the function name.</p></li>
<li><p><code>%integer</code><br /><code>%int</code></p>
<p>Both cause the scanning method to be declared as returning Java type <code>int</code>. Actions in the specification can then return <code>int</code> values as tokens. The default end of file value under this setting is <code>YYEOF</code>, which is a <code>public static final int</code> member of the generated class.</p></li>
<li><p><code>%intwrap</code></p>
<p>Causes the scanning method to be declared as of the Java wrapper type <code>Integer</code>. Actions in the specification can then return <code>Integer</code> values as tokens. The default end of file value under this setting is <code>null</code>.</p></li>
<li><p><code>%type &quot;typename&quot;</code></p>
<p>Causes the scanning method to be declared as returning values of the specified type. Actions in the specification can then return values of <code>typename</code> as tokens. The default end of file value under this setting is <code>null</code>. If <code>typename</code> is not a subclass of <code>java.lang.Object</code>, you should specify another end of file value using the <code>%eofval{</code> <code>...</code> <code>%eofval}</code> directive or the <a href="#Grammar"><code>&lt;&lt;EOF&gt;&gt;</code> rule</a>. The <code>%type</code> directive overrides settings of the <code>%cup</code> switch.</p></li>
<li><p><code>%yylexthrow{</code><br /><code>&quot;exception1&quot; [, &quot;exception2&quot;, ... ]</code><br /><code>%yylexthrow}</code></p>
<p>or, on a single line, just</p>
<p><code>%yylexthrow &quot;exception1&quot; [, &quot;exception2&quot;, ...]</code></p>
<p>The exceptions listed inside <code>%yylexthrow{</code> <code>...</code> <code>%yylexthrow}</code> will be declared in the throws clause of the scanning method. If there is more than one <code>%yylexthrow{</code> <code>...</code> <code>%yylexthrow}</code> clause in the specification, all specified exceptions will be declared.</p></li>
</ul>
<h3 id="the-end-of-file">The end of file</h3>
<p>There is always a default value that the scanning method will return when the end of file has been reached. You may however define a specific value to return and a specific piece of code that should be executed when the end of file is reached.</p>
<p>The default end of file value depends on the return type of the scanning method:</p>
<ul>
<li><p>For <code>%integer</code>, the scanning method will return the value <code>YYEOF</code>, which is a <code>public static final int</code> member of the generated class.</p></li>
<li><p>For <code>%intwrap</code>,</p></li>
<li><p>for no specified type at all, or</p></li>
<li><p>for a user defined type, declared using <code>%type</code>, the value is <code>null</code>.</p></li>
<li><p>In CUP compatibility mode, using <code>%cup</code>, the value is</p>
<p><code>new java_cup.runtime.Symbol(sym.EOF)</code></p></li>
</ul>
<p>User values and code to be executed at the end of file can be defined using these directives:</p>
<ul>
<li><p><code>%eofval{</code><br /><code>...</code><br /><code>%eofval}</code></p>
<p>The code included in <code>%eofval{</code> <code>...</code> <code>%eofval}</code> will be copied verbatim into the scanning method and will be executed <em>each time</em> the end of file is reached (more than once is possible when the scanning method is called again after the end of file has been reached). The code should return the value that indicates the end of file to the parser. There should be only one <code>%eofval{</code> <code>...</code> <code>%eofval}</code> clause in the specification. The <code>%eofval{ ... %eofval}</code> directive overrides settings of the <code>%cup</code> switch and <code>%byaccj</code> switch. There is also an alternative, more readable way to specify the end of file value using the <a href="#Grammar"><code>&lt;&lt;EOF&gt;&gt;</code> rule</a>.</p></li>
<li><p><code>%eof{</code><br /><code>...</code><br /><code>%eof}</code></p>
<p>The code included in <code>%eof{ ... %eof}</code> will be executed exactly once, when the end of file is reached. The code is included inside a method <code>void yy_do_eof()</code> and should not return any value (use <code>%eofval{...%eofval}</code> or <code>&lt;&lt;EOF&gt;&gt;</code> for this purpose). If more than one end of file code directive is present, the code will be concatenated in order of appearance in the specification.</p></li>
<li><p><code>%eofthrow{</code><br /><code>&quot;exception1&quot; [,&quot;exception2&quot;, ... ]</code><br /><code>%eofthrow}</code></p>
<p>or, on a single line:</p>
<p><code>%eofthrow &quot;exception1&quot; [, &quot;exception2&quot;, ...]</code></p>
<p>The exceptions listed inside <code>%eofthrow{...%eofthrow}</code> will be declared in the throws clause of the method <code>yy_do_eof()</code>. If there is more than one <code>%eofthrow{...%eofthrow}</code> clause in the specification, all specified exceptions will be declared.</p></li>
<li><p><code>%eofclose</code></p>
<p>Causes JFlex to close the input stream at the end of file. The code <code>yyclose()</code> is appended to the method <code>yy_do_eof()</code> (together with the code specified in <code>%eof{...%eof}</code>) and the exception <code>java.io.IOException</code> is declared in the throws clause of this method (together with those of <code>%eofthrow{...%eofthrow}</code>)</p></li>
<li><p><code>%eofclose false</code></p>
<p>Turns the effect of <code>%eofclose</code> off again (e.g. in case closing of input stream is not wanted after <code>%cup</code>).</p></li>
</ul>
<h3 id="standalone-scanners">Standalone scanners</h3>
<ul>
<li><p><code>%debug</code></p>
<p>Creates a main function in the generated class that expects the name of an input file on the command line and then runs the scanner on this input file by printing information about each returned token to the Java console until the end of file is reached. The information includes: line number (if line counting is enabled), column (if column counting is enabled), the matched text, and the executed action (with line number in the specification).</p></li>
<li><p><code>%standalone</code></p>
<p>Creates a main function in the generated class that expects the name of an input file on the command line and then runs the scanner on this input file. The values returned by the scanner are ignored, but any unmatched text is printed to the Java console instead. To avoid having to use an extra token class, the scanning method will be declared as having default type <code>int</code>, not <code>Yytoken</code> (if there isn’t any other type explicitly specified). This is in most cases irrelevant, but could be useful to know when making another scanner standalone for some purpose. You should consider using the <code>%debug</code> directive, if you just want to be able to run the scanner without a parser attached for testing etc.</p></li>
</ul>
<h3 id="cup-compatibility">CUP compatibility</h3>
<p>You may also want to read the <a href="#CUPWork">CUP section</a> if you are interested in how to interface your generated scanner with CUP.</p>
<ul>
<li><p><code>%cup</code></p>
<p>The <code>%cup</code> directive enables CUP compatibility mode and is equivalent to the following set of directives:</p>
<pre><code>%implements java_cup.runtime.Scanner
%function next_token
%type java_cup.runtime.Symbol
%eofval{
  return new java_cup.runtime.Symbol(&lt;CUPSYM&gt;.EOF);
%eofval}
%eofclose</code></pre>
<p>The value of <code>&lt;CUPSYM&gt;</code> defaults to <code>sym</code> and can be changed with the <code>%cupsym</code> directive. In JLex compatibility mode (<code>--jlex</code> switch on the command line), <code>%eofclose</code> will not be turned on.</p></li>
<li><p><code>%cup2</code></p>
<p>The <code>%cup2</code> directive is similar to CUP mode, just for the CUP2 generator from TU Munich at <a href="http://www2.in.tum.de/cup2" class="uri">http://www2.in.tum.de/cup2</a>. It does the following:</p>
<ul>
<li>adds CUP2 package import declarations</li>
<li>implements the CUP2 scanner interface</li>
<li>switches on line and column count</li>
<li>sets the scanner function to <code>readNextTerminal</code></li>
<li>sets the token type to <code>ScannerToken&lt;? extends Object&gt;</code></li>
<li>returns the special CUP2 EOF token at end of file</li>
<li>switches on unicode</li>
</ul></li>
<li><p><code>%cupsym &quot;classname&quot;</code></p>
<p>Customises the name of the CUP generated class/interface containing the names of terminal tokens. Default is <code>sym</code>. The directive should not be used after <code>%cup</code>, only before. <!-- FIXME: check if this can be relaxed --></p></li>
<li><p><code>%cupdebug</code></p>
<p>Creates a main function in the generated class that expects the name of an input file on the command line and then runs the scanner on this input file. Prints line, column, matched text, and CUP symbol name for each returned token to standard out.</p></li>
</ul>
<h3 id="byaccj-compatibility">BYacc/J compatibility</h3>
<p>You may also want to read <a href="#BYaccJ">JFlex and BYacc/J</a> if you are interested in how to interface your generated scanner with Byacc/J.</p>
<ul>
<li><p><code>%byacc</code></p>
<p>The <code>%byacc</code> directive enables BYacc/J compatibility mode and is equivalent to the following set of directives:</p>
<pre><code>%integer
%eofval{
  return 0;
%eofval}
%eofclose</code></pre></li>
</ul>
<h3 id="input-character-sets">Input Character sets</h3>
<ul>
<li><p><code>%7bit</code></p>
<p>Causes the generated scanner to use a 7 bit input character set (character codes 0-127). If an input character with a code greater than 127 is encountered in an input at runtime, the scanner will throw an <code>ArrayIndexOutOfBoundsException</code>. Not only because of this, you should consider using the <code>%unicode</code> directive. See also <a href="#sec:encodings">Encodings</a> for information about character encodings. This is the default in JLex compatibility mode.</p></li>
<li><p><code>%full</code><br /><code>%8bit</code></p>
<p>Both options cause the generated scanner to use an 8 bit input character set (character codes 0-255). If an input character with a code greater than 255 is encountered in an input at runtime, the scanner will throw an <code>ArrayIndexOutOfBoundsException</code>. Note that even if your platform uses only one byte per character, the Unicode value of a character may still be greater than 255. If you are scanning text files, you should consider using the <code>%unicode</code> directive. See also section <a href="#sec:encodings">Encodings</a> for more information about character encodings.</p></li>
<li><p><code>%unicode</code><br /><code>%16bit</code></p>
<p>Both options cause the generated scanner to use the full Unicode input character set, including supplementary code points: 0-0x10FFFF. <code>%unicode</code> does not mean that the scanner will read two bytes at a time. What is read and what constitutes a character depends on the runtime platform. See also section <a href="#sec:encodings">Encodings</a> for more information about character encodings. This is the default unless the JLex compatibility mode is used (command line option <code>--jlex</code>).</p></li>
<li><p><code>%caseless</code><br /><code>%ignorecase</code></p>
<p>This option causes JFlex to handle all characters and strings in the specification as if they were specified in both uppercase and lowercase form. This enables an easy way to specify a scanner for a language with case insensitive keywords. The string <code>break</code> in a specification is for instance handled like the expression <code>[bB][rR][eE][aA][kK]</code>. The <code>%caseless</code> option does not change the matched text and does not affect character classes. So <code>[a]</code> still only matches the character <code>a</code> and not <code>A</code>. Which letters are uppercase and which are lowercase is defined by the Unicode standard. In JLex compatibility mode (<code>--jlex</code> switch on the command line), <code>%caseless</code> and <code>%ignorecase</code> also affect character classes.</p></li>
</ul>
<h3 id="line-character-and-column-counting">Line, character and column counting</h3>
<ul>
<li><p><code>%char</code></p>
<p>Turns character counting on. The <code>int</code> member variable <code>yychar</code> contains the number of characters (starting with 0) from the beginning of input to the beginning of the current token.</p></li>
<li><p><code>%line</code></p>
<p>Turns line counting on. The <code>int</code> member variable <code>yyline</code> contains the number of lines (starting with 0) from the beginning of input to the beginning of the current token.</p></li>
<li><p><code>%column</code></p>
<p>Turns column counting on. The <code>int</code> member variable <code>yycolumn</code> contains the number of characters (starting with 0) from the beginning of the current line to the beginning of the current token.</p></li>
</ul>
<h3 id="obsolete-jlex-options">Obsolete JLex options</h3>
<ul>
<li><p><code>%notunix</code></p>
<p>This JLex option is obsolete in JFlex but still recognised as a valid directive. It used to switch between Windows and Unix kind of line terminators (<code>\r\n</code> and <code>\n</code>) for the <code>$</code> operator in regular expressions. JFlex always recognises both styles of platform dependent line terminators.</p></li>
<li><p><code>%yyeof</code></p>
<p>This JLex option is obsolete in JFlex but still recognised as a valid directive. In JLex it declares a public member constant <code>YYEOF</code>. JFlex declares it in any case.</p></li>
</ul>
<h3 id="StateDecl">State declarations</h3>
<p>State declarations have the following form:</p>
<p><code>%s[tate] &quot;state identifier&quot; [, &quot;state identifier&quot;, ... ]</code> for inclusive or<br /><code>%x[state] &quot;state identifier&quot; [, &quot;state identifier&quot;, ... ]</code> for exclusive states</p>
<p>There may be more than one line of state declarations, each starting with <code>%state</code> or <code>%xstate</code>. State identifiers are letters followed by a sequence of letters, digits or underscores. State identifiers can be separated by white-space or comma.</p>
<p>The sequence</p>
<pre><code>%state STATE1
%xstate STATE3, XYZ, STATE_10
%state ABC STATE5</code></pre>
<p>declares the set of identifiers <code>STATE1, STATE3, XYZ, STATE_10, ABC, STATE5</code> as lexical states, <code>STATE1</code>, <code>ABC</code>, <code>STATE5</code> as inclusive, and <code>STATE3</code>, <code>XYZ</code>, <code>STATE_10</code> as exclusive. See also <a href="#HowMatched">How the Input is Matched</a> on the way lexical states influence how the input is matched.</p>
<h3 id="MacroDefs">Macro definitions</h3>
<p>A macro definition has the form</p>
<pre><code>macroidentifier = regular expression</code></pre>
<p>That means, a macro definition is a macro identifier (letter followed by a sequence of letters, digits or underscores), that can later be used to reference the macro, followed by optional white-space, followed by an <code>=</code>, followed by optional white-space, followed by a regular expression (see <a href="#LexRules">Lexical Rules</a> for more information about the regular expression syntax).</p>
<p>The regular expression on the right hand side must be well formed and must not contain the <code>^</code>, <code>/</code> or <code>$</code> operators. <em>Differently to JLex, macros are not just pieces of text that are expanded by copying</em> - they are parsed and must be well formed.</p>
<p><strong>This is a feature.</strong> It eliminates some very hard to find bugs in lexical specifications (such as not having parentheses around more complicated macros - which is not necessary with JFlex). See <a href="#Porting">Porting from JLex</a> for more details on the problems of JLex style macros.</p>
<p>Since it is allowed to have macro usages in macro definitions, it is possible to use a grammar-like notation to specify the desired lexical structure. However, macros remain just abbreviations of the regular expressions they represent. They are not non-terminals of a grammar and cannot be used recursively. JFlex detects cycles in macro definitions and reports them at generation time. JFlex also warns you about macros that have been defined but never used in the <em>lexical rules</em> section of the specification.</p>
<h2 id="LexRules">Lexical rules</h2>
<p>The <em>lexical rules</em> section of a JFlex specification contains a set of regular expressions and actions (Java code) that are executed when the scanner matches the associated regular expression.</p>
<p>The <code>%include</code> directive may be used in this section to include lexical rules from a separate file. The directive will be replaced verbatim by the contents of the specified file.</p>
<h3 id="Grammar">Syntax</h3>
<p>The syntax of the <em>lexical rules</em> section is described by the following EBNF grammar (terminal symbols are enclosed in ’quotes’):</p>
<pre><code>LexicalRules ::= (Include|Rule)+
Include      ::= &#39;%include&#39; (&#39; &#39;|&#39;\t&#39;|&#39;\b&#39;)+ File
Rule         ::= [StateList] [&#39;^&#39;] RegExp [LookAhead] Action 
               | [StateList] &#39;&lt;&lt;EOF&gt;&gt;&#39; Action
               | StateGroup 
StateGroup   ::= StateList &#39;{&#39; Rule+ &#39;}&#39; 
StateList    ::= &#39;&lt;&#39; Identifier (&#39;,&#39; Identifier)* &#39;&gt;&#39; 
LookAhead    ::= &#39;$&#39; | &#39;/&#39; RegExp
Action       ::= &#39;{&#39; JavaCode &#39;}&#39; | &#39;|&#39;

RegExp       ::= RegExp &#39;|&#39; RegExp 
               | RegExp RegExp 
               | &#39;(&#39; RegExp &#39;)&#39;
               | (&#39;!&#39;|&#39;~&#39;) RegExp
               | RegExp (&#39;*&#39;|&#39;+&#39;|&#39;?&#39;)
               | RegExp &quot;{&quot; Number [&quot;,&quot; Number] &quot;}&quot; 
               | CharClass
               | PredefinedClass 
               | MacroUsage 
               | &#39;&quot;&#39; StringCharacter+ &#39;&quot;&#39; 
               | Character 

CharClass    ::= &#39;[&#39; [&#39;^&#39;] CharClassContent* &#39;]&#39;
               | &#39;[&#39; [&#39;^&#39;] CharClassContent+ 
                     CharClassOperator CharClassContent+ &#39;]&#39;
                 
CharClassContent    ::= CharClass | Character |
                        Character&#39;-&#39;Character | 
                        MacroUsage | PredefinedClass

CharClassOperator   ::= &#39;||&#39; | &#39;&amp;&amp;&#39; | &#39;--&#39; | &#39;~~&#39;

MacroUsage          ::= &#39;{&#39; Identifier &#39;}&#39;

PredefinedClass     ::= &#39;[:jletter:]&#39; 
                      | &#39;[:jletterdigit:]&#39; 
                      | &#39;[:letter:]&#39; 
                      | &#39;[:digit:]&#39;
                      | &#39;[:uppercase:]&#39; 
                      | &#39;[:lowercase:]&#39;
                      | &#39;\d&#39; | &#39;\D&#39;
                      | &#39;\s&#39; | &#39;\S&#39;
                      | &#39;\w&#39; | &#39;\W&#39;
                      | &#39;\p{&#39; UnicodePropertySpec &#39;}&#39;
                      | &#39;\P{&#39; UnicodePropertySpec &#39;}&#39;
                      | &#39;\R&#39;
                      | &#39;.&#39;          
                            
UnicodePropertySpec ::= BinaryProperty | 
                        EnumeratedProperty (&#39;:&#39; | &#39;=&#39;) PropertyValue

BinaryProperty      ::= Identifier

EnumeratedProperty  ::= Identifier

PropertyValue       ::= Identifier</code></pre>
<p>The grammar uses the following terminal symbols:</p>
<ul>
<li><p><code>File</code><br />a file name, either absolute or relative to the directory containing the lexical specification.</p></li>
<li><p><code>JavaCode</code><br />a sequence of <code>BlockStatements</code> as described in the Java Language Specification <span class="citation">(Gosling, Joy, and Steele 1996)</span>, section 14.2.</p></li>
<li><p><code>Number</code><br />a non negative decimal integer.</p></li>
<li><p><code>Identifier</code><br />a letter <code>[a-zA-Z]</code> followed by a sequence of zero or more letters, digits or underscores <code>[a-zA-Z0-9_]</code></p></li>
<li><p><code>Character</code><br />an escape sequence or any unicode character that is not one of these meta characters: <code>|  (  )  {  }  [  ]  &lt; &gt;  \  .  *  +  ?  ^  $  / &quot; ~ !</code></p></li>
<li><p><code>StringCharacter</code><br />an escape sequence or any unicode character that is not one of these meta characters: <code>\  &quot;</code></p></li>
<li><p>An escape sequence</p>
<ul>
<li><p><code>\n</code> <code>\r</code> <code>\t</code> <code>\f</code> <code>\b</code></p></li>
<li><p>a <code>\x</code> followed by two hexadecimal digits <code>[a-fA-F0-9]</code> (denoting an ASCII escape sequence);</p></li>
<li><p>a <code>\u</code> followed by four hexadecimal digits <code>[a-fA-F0-9]</code>, denoting a unicode escape sequence. Note that these are precisely four digits, i.e. <code>\u12345</code> is the character <code>\u1234</code> followed by the character <code>5</code>.</p></li>
<li><p>a <code>\U</code> (note that the ’U’ is uppercase) followed by six hexadecimal digits <code>[a-fA-F0-9]</code>, denoting a unicode code point escape sequence;</p></li>
<li><p><code>\u{H+( H+)*}</code>, where <code>H+</code> is one or more hexadecimal digits <code>[a-fA-F0-9]</code>, each <code>H+</code> denotes a code point - note that in character classes, only one code point is allowed;</p></li>
<li><p>a backslash followed by a three digit octal number from 000 to 377, denoting an ASCII escape sequence; or</p></li>
<li><p>a backslash followed by any other unicode character that stands for this character.</p></li>
</ul></li>
</ul>
<p>Please note that the <code>\n</code> escape sequence stands for the ASCII LF character - not for the end of line. If you would like to match the line terminator, you should use the expression <code>\r|\n|\r\n</code> if you want the Java conventions, or <code>\r\n|[\r\n\u2028\u2029\u000B\u000C\u0085]</code> (provided as predefined class <code>\R</code>) if you want to be fully Unicode compliant (see also <span class="citation">(Davis and Heninger 2013)</span>).</p>
<p>The white-space characters <code>&quot; &quot;</code> (space) and <code>\t</code> (tab) can be used to improve the readability of regular expressions. They will be ignored by JFlex. In character classes and strings, however, white-space characters keep standing for themselves (so the string <code>&quot; &quot;</code> still matches exactly one space character and <code>[ \n]</code> still matches an ASCII LF or a space character).</p>
<p>JFlex applies the following standard operator precedences in regular expression (from highest to lowest):</p>
<ul>
<li><p>unary postfix operators (<code>*</code>, <code>+</code>, <code>?</code>, <code>{n}</code>, <code>{n,m}</code>)</p></li>
<li><p>unary prefix operators (<code>!</code>, <code>~</code>)</p></li>
<li><p>concatenation (<code>RegExp::= RegExp RegExp</code>)</p></li>
<li><p>union (<code>RegExp::= RegExp '|' RegExp</code>)</p></li>
</ul>
<p>So the expression <code>a | abc | !cd*</code> for instance is parsed as <code>(a|(abc)) | ((!c)(d*))</code>.</p>
<h3 id="Semantics">Semantics</h3>
<p>This section gives an informal description of which text is matched by a regular expression, i.e. an expression described by the <code>RegExp</code> production of the grammar <a href="#Grammar">above</a>.</p>
<p>A regular expression that consists solely of</p>
<ul>
<li><p>a <code>Character</code> matches this character.</p></li>
<li><p>a character class <code>[...]</code> matches any character in that class. A <code>Character</code> is considered an element of a class if it is listed in the class, if its code lies within a listed character range <code>Character'-'Character</code>, or if it is contained in a listed macro or predefined character class. So <code>[a0-3\n]</code> for instance matches the characters</p>
<p><code>a 0 1 2 3 \n</code></p>
<p>If the list of characters is empty (i.e. just <code>[]</code>), the expression matches nothing at all (the empty set), not even the empty string. This can be useful in combination with the negation operator <code>!</code>.</p>
<p>Character sets may be nested, e.g. <code>[[[abc]d[e]]fg]</code> is equivalent to <code>[abcdefg]</code>.</p>
<p>Supported character set operations:</p>
<ul>
<li><p>Union (<code>||</code>), e.g. <code>[[a-c]||[d-f]]</code>, equivalent to <code>[a-cd-f]</code>: this is the default character set operation when no operator is specified.</p></li>
<li><p>Intersection (<code>&amp;&amp;</code>), e.g. <code>[[a-f]&amp;&amp;[f-m]]</code>, equivalent to <code>[f]</code>.</p></li>
<li><p>Set difference (<code>--</code>), e.g. <code>[[a-z]--m]</code>, equivalent to <code>[a-ln-z]</code>.</p></li>
<li><p>Symmetric difference (<code>~~</code>): the union of two classes minus their intersection. For instance</p>
<pre><code>[\p{Letter}~~\p{ASCII}] </code></pre>
<p>is equivalent to</p>
<pre><code>[[\p{Letter}||\p{ASCII}]--[\p{Letter}&amp;&amp;\p{ASCII}]]</code></pre>
<p>the set of characters that are present in either <code>\p{Letter}</code> or in <code>\p{ASCII}</code>, but not in both.</p></li>
</ul></li>
<li><p>a negated character class <code>'[^...]'</code> matches all characters not listed in the class. If the list of characters is empty (i.e. <code>[^]</code>), the expression matches any character of the input character set.</p></li>
<li><p>a string <code>'&quot;' StringCharacter+ '&quot;'</code> matches the exact text enclosed in double quotes. All meta characters apart from <code>\</code> and <code>&quot;</code> lose their special meaning inside a string. See also the <code>%ignorecase</code> switch.</p></li>
<li><p>a macro usage <code>'{' Identifier '}'</code> matches the input that is matched by the right hand side of the macro with name <code>Identifier</code>.</p></li>
<li><p>a predefined character class matches any of the characters in that class. There are the following predefined character classes:</p>
<ul>
<li><p>two predefined character classes that are determined by Java library functions in class <code>java.lang.Character</code>:</p>
<pre><code>    [:jletter:]       isJavaIdentifierStart()
    [:jletterdigit:]  isJavaIdentifierPart()</code></pre></li>
<li><p>four predefined character classes equivalent to the following Unicode properties (described <a href="#unipropsyntax">below</a>):</p>
<pre><code>    [:letter:]     \p{Letter}
    [:digit:]      \p{Digit}
    [:uppercase:]  \p{Uppercase}
    [:lowercase:]  \p{Lowercase}</code></pre></li>
<li><p>the following meta characters, equivalent to these (sets of) Unicode Properties (described <a href="#unipropsyntax">below</a>):</p>
<pre><code>    \d  \p{Digit}
    \D  \P{Digit}
    \s  \p{Whitespace}
    \S  \P{Whitespace}
    \w  [\p{Alpha}\p{Digit}\p{Mark}
         \p{Connector Punctuation}\p{Join Control}]
    \W  [^\p{Alpha}\p{Digit}\p{Mark}
          \p{Connector Punctuation}\p{Join Control}]</code></pre></li>
<li><p> <!-- FIXME: inline refs don't link properly in pdf --> <a name="unipropsyntax"></a>Unicode Properties are character classes specified by each version of the Unicode Standard. JFlex supports a subset of all defined Properties for each supported Unicode version. To see the full list of supported Properties, give the <code>--uniprops &lt;ver&gt;</code> option on the JFlex command line, where <code>&lt;ver&gt;</code> is the Unicode version. Some Properties have aliases; JFlex recognizes all aliases for all supported properties. JFlex supports loose matching of Properties: case distinctions, whitespace, hyphens, and underscores are ignored.</p>
<p>To refer to a Unicode Property, use the <code>\p{...}</code> syntax, e.g. the Greek Block can be referred to as <code>\p{Block:Greek}</code>. To match all characters not included in a property, use the <code>\P{...}</code> syntax (note that the ’<code>P</code>’ is uppercase), e.g. to match all characters that are <strong>not</strong> letters, use <code>\P{Letter}</code>.</p>
<p>See UTS#18 <span class="citation">(Davis and Heninger 2013)</span> for a description of and links to definitions of some supported Properties. UnicodeSet <span class="citation">(“Unicode Utilities: UnicodeSet” 2015)</span> is an online utility to show the character sets corresponding to Unicode Properties and set operations on them, but only for the most recent Unicode version.</p></li>
<li><p>Dot (<code>.</code>) matches <code>[^\r\n\u2028\u2029\u000B\u000C\u0085]</code>.<br />Use the <code>--legacydot</code> option to instead match <code>[^\n]</code>.</p></li>
<li><p><code>\R</code> matches any newline: <code>\r\n|[\r\n\u2028\u2029\u000B\u000C\u0085]</code>.</p></li>
</ul></li>
</ul>
<p>If <code>a</code> and <code>b</code> are regular expressions, then</p>
<ul>
<li><p><code>a | b</code> (union)</p>
<p>is the regular expression that matches all input matched by <code>a</code> or by <code>b</code>.</p></li>
<li><p><code>a b</code> (concatenation)</p>
<p>is the regular expression that matches the input matched by <code>a</code> followed by the input matched by <code>b</code>.</p></li>
<li><p><code>a*</code> (Kleene closure)</p>
<p>matches zero or more repetitions of the input matched by <code>a</code></p></li>
<li><p><code>a+</code> (iteration)</p>
<p>is equivalent to <code>aa*</code></p></li>
<li><p><code>a?</code> (option)</p>
<p>matches the empty input or the input matched by <code>a</code></p></li>
<li><p><code>!a</code> (negation)</p>
<p>matches everything but the strings matched by <code>a</code>. Use with care: the construction of <code>!a</code> involves an additional, possibly exponential NFA to DFA transformation on the NFA for <code>a</code>. Note that with negation and union you also have (by applying DeMorgan) intersection and set difference: the intersection of <code>a</code> and <code>b</code> is <code>!(!a|!b)</code>, the expression that matches everything of <code>a</code> not matched by <code>b</code> is <code>!(!a|b)</code></p></li>
<li><p><code>~a</code> (upto)</p>
<p>matches everything up to (and including) the first occurrence of a text matched by <code>a</code>. The expression <code>~a</code> is equivalent to <code>!([^]* a [^]*) a</code>. A traditional C-style comment is matched by <code>&quot;/*&quot; ~&quot;*/&quot;</code></p></li>
<li><p><code>a {n}</code> (repeat)</p>
<p>is equivalent to <code>n</code> times the concatenation of <code>a</code>. So <code>a{4}</code> for instance is equivalent to the expression <code>a a a a</code>. The decimal integer <code>n</code> must be positive.</p></li>
<li><p><code>a {n,m}</code></p>
<p>is equivalent to at least <code>n</code> times and at most <code>m</code> times the concatenation of <code>a</code>. So <code>a{2,4}</code> for instance is equivalent to the expression <code>a a a? a?</code>. Both <code>n</code> and <code>m</code> are non-negative decimal integers and <code>m</code> must not be smaller than <code>n</code>.</p></li>
<li><p><code>(a)</code></p>
<p>matches the same input as <code>a</code>.</p></li>
</ul>
<p>In a lexical rule, a regular expression <code>r</code> may be preceded by a <code>^</code> (the beginning of line operator). <code>r</code> is then only matched at the beginning of a line in the input. A line begins after each occurrence of <code>\r|\n|\r\n|\u2028|\u2029|\u000B|\u000C|\u0085</code> (see also <span class="citation">(Davis and Heninger 2013)</span>) and at the beginning of input. The preceding line terminator in the input is not consumed and can be matched by another rule.</p>
<p>In a lexical rule, a regular expression <code>r</code> may be followed by a look-ahead expression. A look-ahead expression is either <code>$</code> (the end of line operator) or <code>/</code> followed by an arbitrary regular expression. In both cases the look-ahead is not consumed and not included in the matched text region, but it <strong>is</strong> considered while determining which rule has the longest match (see also <a href="#HowMatched">How the input is matched</a>).</p>
<p>In the <code>$</code> case, <code>r</code> is only matched at the end of a line in the input. The end of a line is denoted by the regular expression <code>\r|\n|\r\n|\u2028|\u2029|\u000B|\u000C|\u0085</code>. So <code>a$</code> is equivalent to <code>a / \r|\n|\r\n|\u2028|\u2029|\u000B|\u000C|\u0085</code>. This is different to the situation described in <span class="citation">(Davis and Heninger 2013)</span>: since in JFlex <code>$</code> is a true trailing context, the end of file does <strong>not</strong> count as end of line.</p>
<p>For arbitrary look-ahead (also called <em>trailing context</em>) the expression is matched only when followed by input that matches the trailing context.</p>
<p>JFlex allows lex/flex style <code>&lt;&lt;EOF&gt;&gt;</code> rules in lexical specifications. A rule</p>
<pre><code>[StateList]  &lt;&lt;EOF&gt;&gt;    { action code }</code></pre>
<p>is very similar to the <code>%eofval</code> directive. The difference lies in the optional <code>StateList</code> that may precede the <code>&lt;&lt;EOF&gt;&gt;</code> rule. The action code will only be executed when the end of file is read and the scanner is currently in one of the lexical states listed in <code>StateList</code>. The same <code>StateGroup</code> (see section <a href="#HowMatched">How the input is matched</a>) and precedence rules as in the “normal” rule case apply (i.e. if there is more than one <code>&lt;&lt;EOF&gt;&gt;</code> rule for a certain lexical state, the action of the one appearing earlier in the specification will be executed). <code>&lt;&lt;EOF&gt;&gt;</code> rules override settings of the <code>%cup</code> and <code>%byaccj</code> options and should not be mixed with the <code>%eofval</code> directive.</p>
<p>An <code>Action</code> consists either of a piece of Java code enclosed in curly braces or is the special <code>|</code> action. The <code>|</code> action is an abbreviation for the action of the following expression.</p>
<p>Example:</p>
<pre><code>expression1   |
expression2   |
expression3   { some action }</code></pre>
<p>is equivalent to the expanded form</p>
<pre><code>expression1   { some action }
expression2   { some action }
expression3   { some action }</code></pre>
<p>The <code>|</code> action is useful when working with trailing context expressions. The expression <code>a | (c / d) | b</code> is not a syntactically legal regular expression, but can be expressed using the <code>|</code> action:</p>
<pre><code>a       |
c / d   |
b       { some action }</code></pre>
<h3 id="HowMatched">How the input is matched</h3>
<p>When consuming its input, the scanner determines the regular expression that matches the longest portion of the input (longest match rule). If there is more than one regular expression that matches the longest portion of input (i.e. they all match the same input), the generated scanner chooses the expression that appears first in the specification. After determining the active regular expression, the associated action is executed. If there is no matching regular expression, the scanner terminates the program with an error message (if the <code>%standalone</code> directive has been used, the scanner prints the unmatched input to <code>java.lang.System.out</code> instead and resumes scanning).</p>
<p>Lexical states can be used to further restrict the set of regular expressions that match the current input.</p>
<ul>
<li><p>A regular expression can only be matched when its associated set of lexical states includes the currently active lexical state of the scanner or if the set of associated lexical states is empty and the currently active lexical state is inclusive. Exclusive and inclusive states only differ in this one point: rules with an empty set of associated states.</p></li>
<li><p>The currently active lexical state of the scanner can be changed from within an action of a regular expression using the method <code>yybegin()</code>.</p></li>
<li><p>The scanner starts in the inclusive lexical state <code>YYINITIAL</code>, which is always declared by default.</p></li>
<li><p>The set of lexical states associated with a regular expression is the <code>StateList</code> that precedes the expression. If a rule is contained in one or more <code>StateGroups</code>, then the states of these are also associated with the rule, i.e. they accumulate over <code>StateGroups</code>.</p>
<p>Example:</p>
<pre><code>%states A, B
%xstates C
%%
expr1                   { yybegin(A); action }
&lt;YYINITIAL, A&gt; expr2    { action }
&lt;A&gt; {
  expr3                 { action }
  &lt;B,C&gt; expr4           { action }
}</code></pre>
<p>The first line declares two (inclusive) lexical states <code>A</code> and <code>B</code>, the second line an exclusive lexical state <code>C</code>. The default (inclusive) state <code>YYINITIAL</code> is always implicitly there and doesn’t need to be declared. The rule with <code>expr1</code> has no states listed, and is thus matched in all states but the exclusive ones, i.e. <code>A</code>, <code>B</code>, and <code>YYINITIAL</code>. In its action, the scanner is switched to state <code>A</code>. The second rule <code>expr2</code> can only match when the scanner is in state <code>YYINITIAL</code> or <code>A</code>. The rule <code>expr3</code> can only be matched in state <code>A</code> and <code>expr4</code> in states <code>A</code>, <code>B</code>, and <code>C</code>.</p></li>
<li><p>Lexical states are declared and used as Java <code>int</code> constants in the generated class under the same name as they are used in the specification. There is no guarantee that the values of these integer constants are distinct. They are pointers into the generated DFA table, and if JFlex recognises two states as lexically equivalent (if they are used with the exact same set of regular expressions), then the two constants will get the same value.</p></li>
</ul>
<h3 id="the-generated-class">The generated class</h3>
<p>JFlex generates exactly one file containing one class from the specification (unless you have declared another class in the first specification section).</p>
<p>The generated class contains (among other things) the DFA tables, an input buffer, the lexical states of the specification, a constructor, and the scanning method with the user supplied actions.</p>
<p>The name of the class is by default <code>Yylex</code>. The name is customisable with the <code>%class</code> directive. The input buffer of the lexer is connected with external input through the <code>java.io.Reader</code> object which is passed to the lexer in the generated constructor. If you provide your own constructor for the lexer, you should always chain-call the generated one to initialise the input buffer. The input buffer should not be accessed directly, but only through the advertised API (see also <a href="#ScannerMethods">Scanner Methods</a>). Its internal implementation may change between releases or skeleton files without notice.</p>
<p>The main interface to the outside world is the generated scanning method (default name <code>yylex</code>, default return type <code>Yytoken</code>). Most of its aspects are customisable (name, return type, declared exceptions etc.). If it is called, it will consume input until one of the expressions in the specification is matched or an error occurs. If an expression is matched, the corresponding action is executed. It may return a value of the specified return type (in which case the scanning method returns with this value), or, if it does not return a value, the scanner resumes consuming input until the next expression is matched. If the end of file is reached, the scanner executes the <code>EOF</code> action, and (also upon each further call to the scanning method) returns the specified <code>EOF</code> value.</p>
<h3 id="ScannerMethods">Scanner methods and fields accessible in actions (API)</h3>
<p>Generated methods and member fields in JFlex scanners are prefixed with <code>yy</code> to indicate that they are generated and to avoid name conflicts with user code copied into the class. Since user code is part of the same class, JFlex has no language means like the <code>private</code> modifier to indicate which members and methods are internal and which ones belong to the API. Instead, JFlex follows a naming convention: everything starting with <code>zz</code>, such as <code>zzStartRead</code>, is internal and subject to change without notice between JFlex releases. Methods and members of the generated class that do not have a <code>zz</code> prefix, such as <code>yycharat</code>, belong to the API that the scanner class provides to users in action code of the specification. They will remain stable and supported between JFlex releases as long as possible.</p>
<p>Currently, the API consists of the following methods and member fields:</p>
<ul>
<li><p><code>String yytext()</code></p>
<p>returns the matched input text region</p></li>
<li><p><code>int yylength()</code></p>
<p>returns the length of the matched input text region as number of Java <code>chars</code> (as opposed to Unicode code points). Does not require a <code>String</code> object to be created.</p></li>
<li><p><code>char yycharat(int pos)</code></p>
<p>returns the Java <code>char</code> at position <code>pos</code> from the matched text. It is equivalent to <code>yytext().charAt(pos)</code>, but faster. <code>pos</code> must be a value from <code>0</code> to <code>yylength()-1</code>.</p></li>
<li><p><code>void yyclose()</code></p>
<p>closes the input stream. All subsequent calls to the scanning method will return the end of file value</p></li>
<li><p><code>void yyreset(java.io.Reader reader)</code></p>
<p>closes the current input stream, and resets the scanner to read from a new Reader. All internal variables are reset, the old Reader <em>cannot</em> be reused (content of the internal buffer is discarded and lost). The lexical state is set to <code>YYINITIAL</code>. The <code>%{init</code> code is <em>not</em> included in <code>yyreset</code>, because it is assumed to run in the context of a constructor, not a normal method. If <code>%{init</code> does need to be repeated, consider constructing a new lexer object instead, or calling a custom function that performs any additional user-level state reset.</p></li>
<li><p><code>void yypushStream(java.io.Reader reader)</code></p>
<p>Stores the current input stream on a stack, and reads from a new stream. Lexical state, line, char, and column counting remain untouched. The current input stream can be restored with <code>yypopStream</code> (usually in an <code>&lt;&lt;EOF&gt;&gt;</code> action).</p>
<p>A typical example for this are include files in style of the C pre-processor. The corresponding JFlex specification could look like this:</p>
<pre><code>&quot;#include&quot; {FILE}  { yypushStream(new FileReader(getFile(yytext()))); }
...
&lt;&lt;EOF&gt;&gt;            { if (yymoreStreams()) yypopStream(); else return EOF; }</code></pre>
<p>This method is only available in the skeleton file <code>skeleton.nested</code>. You can find it in the <code>src</code> directory of the JFlex distribution.</p></li>
<li><p><code>void yypopStream()</code></p>
<p>Closes the current input stream and continues to read from the one on top of the stream stack.</p>
<p>This method is only available in the skeleton file <code>skeleton.nested</code>. You can find it in the <code>src</code> directory of the JFlex distribution.</p></li>
<li><p><code>boolean yymoreStreams()</code></p>
<p>Returns true iff there are still streams for <code>yypopStream</code> left to read from on the stream stack.</p>
<p>This method is only available in the skeleton file <code>skeleton.nested</code>. You can find it in the <code>src</code> directory of the JFlex distribution.</p></li>
<li><p><code>int yystate()</code></p>
<p>returns the current lexical state of the scanner.</p></li>
<li><p><code>void yybegin(int lexicalState)</code></p>
<p>enters the lexical state <code>lexicalState</code></p></li>
<li><p><code>void yypushback(int number)</code></p>
<p>pushes <code>number</code> Java <code>char</code>s (as opposed to Unicode code points) of the matched text back into the input stream. They will be read again in the next call of the scanning method. The number of chars to be read again must not be greater than the length of the matched text. The pushed back characters will not be included in <code>yylength()</code> and <code>yytext()</code>. Note that in Java strings are unchangeable, i.e. an action code like</p>
<pre><code>    String matched = yytext();
    yypushback(1);
    return matched;</code></pre>
<p>will return the whole matched text, while</p>
<pre><code>    yypushback(1);
    return yytext();</code></pre>
<p>will return the matched text minus the last character.</p>
<p>Note that with Unicode surrogate characters it is possible that expressions such as <code>[^]</code> match more than one <code>char</code>.</p></li>
<li><p><code>int yyline</code></p>
<p>contains the current line of input (starting with 0, only active with the <code>%line</code> directive)</p></li>
<li><p><code>int yychar</code></p>
<p>contains the current character count in the input (starting with 0, only active with the <code>%char</code> directive)</p></li>
<li><p><code>int yycolumn</code></p>
<p>contains the current column of the current line (starting with 0, only active with the <code>%column</code> directive)</p></li>
</ul>
<h1 id="sec:encodings">Encodings, Platforms, and Unicode</h1>
<p>This section discusses Unicode and encodings, cross platform scanning, and how to deal with binary data.</p>
<h2 id="the-problem">The Problem</h2>
<p>Java aims to be implementation platform independent, yet different platforms use different ways to encode characters. Moreover, a file written on one platform, say Windows, may later be read by a scanner on another platform, for instance Linux.</p>
<p>If a program reads a file from disk, what it really reads is a stream of bytes. These bytes can be mapped to characters in different ways. For instance, in standard ASCII, the byte value 65 stands for the character <code>A</code>, and in the encoding <code>iso-latin-1</code>, the byte value 213 stands for the character <code>Õ</code>, but in the encoding <code>iso-latin-2</code> the value 213 is <code>Ő</code> instead. As long as one encoding is used consistently, this is no problem. Some characters may not be available in the encoding you are using, but at least the interpretation of the mapping between bytes and characters agrees between different programs.</p>
<p>When your program runs on more than one platform, however, as is often the case with Java, things become more complex. Java’s solution to this is to use Unicode internally. Unicode aims to be able to represent all known character sets and is therefore a perfect base for encoding things that might get used all over the world and on different platforms. To make things work correctly, you still have to know where you are and how to map byte values to Unicode characters and vice versa, but the important thing is that this mapping is at least possible (you can map Kanji characters to Unicode, but you cannot map them to ASCII or <code>iso-latin-1</code>).</p>
<h2 id="scanning-text-files">Scanning text files</h2>
<p>Scanning text files is the standard application for scanners like JFlex. Therefore it should also be the most convenient one. Most times it is.</p>
<p>The following scenario works fine: You work on a platform X, write your lexer specification there, can use any obscure Unicode character in it as you like, and compile the program. Your users work on any platform Y (possibly but not necessarily something different from X), they write their input files on Y and they run your program on Y. No problems.</p>
<p>Java does this as follows: If you want to read anything in Java that is supposed to contain text, you use a <code>FileReader</code>, which converts the bytes of the file into Unicode characters with the platform’s default encoding. If a text file is produced on the same platform, the platform’s default encoding should do the mapping correctly. Since JFlex also uses readers and Unicode internally, this mechanism also works for the scanner specifications. If you write an <code>A</code> in your text editor and the editor uses the platform’s encoding (say <code>A</code> is 65), then Java translates this into the logical Unicode <code>A</code> internally. If a user writes an <code>A</code> on a completely different platform (say <code>A</code> is 237 there), then Java also translates this into the logical Unicode <code>A</code> internally. Scanning is performed after that translation and both match.</p>
<p>Note that because of this mapping from bytes to characters, you should always use the <code>%unicode</code> switch in your lexer specification if you want to scan text files. <code>%8bit</code> may not be enough, even if you know that your platform only uses one byte per character. The encoding <code>Cp1252</code> used on many Windows machines for instance knows 256 characters, but the character <code>’</code> with <code>Cp1252</code> code <code>\x92</code> has the Unicode value <code>\u2019</code>, which is larger than 255 and which would make your scanner throw an <code>ArrayIndexOutOfBoundsException</code> if it is encountered.</p>
<p>So for the usual case you don’t have to do anything but use the <code>%unicode</code> switch in your lexer specification.</p>
<p>Things may break when you produce a text file on platform X and consume it on a different platform Y. Let’s say you have a file written on a Windows PC using the encoding <code>Cp1252</code>. Then you move this file to a Linux PC with encoding <code>ISO 8859-1</code> and there you run your scanner on it. Java now thinks the file is encoded in <code>ISO 8859-1</code> (the platform’s default encoding) while it really is encoded in <code>Cp1252</code>. For most characters <code>Cp1252</code> and <code>ISO 8859-1</code> are the same, but for the byte values <code>\x80</code> to <code>\x9f</code> they disagree: <code>ISO 8859-1</code> is undefined there. You can fix the problem by telling Java explicitly which encoding to use. When constructing the <code>InputStreamReader</code>, you can give the encoding as argument. The line</p>
<pre><code>Reader r = new InputStreamReader(input, &quot;Cp1252&quot;);</code></pre>
<p>will do the trick.</p>
<p>Of course the encoding to use can also come from the data itself: for instance, when you scan an HTML page, it may have embedded information about its character encoding in the headers.</p>
<p>More information about encodings, which ones are supported, how they are called, and how to set them may be found in the official Java documentation in the chapter about internationalisation. The link <a href="http://docs.oracle.com/javase/7/docs/technotes/guides/intl/" class="uri">http://docs.oracle.com/javase/7/docs/technotes/guides/intl/</a> leads to an online version of this for Oracle’s JDK 1.7.</p>
<h2 id="scanning-binaries">Scanning binaries</h2>
<p>Scanning binaries is both easier and more difficult than scanning text files. It’s easier because you want the raw bytes and not their meaning, i.e. you don’t want any translation. It’s more difficult because it’s not so easy to get “no translation” when you use Java readers.</p>
<p>The problem (for binaries) is that JFlex scanners are designed to work on text. Therefore the interface is the <code>Reader</code> class. You can still get a binary scanner when you write your own custom <code>InputStreamReader</code> class that explicitly does no translation, but just copies byte values to character codes instead. It sounds quite easy, and actually it is no big deal, but there are a few pitfalls on the way. In the scanner specification you can only enter positive character codes (for bytes that is <code>\x00</code> to <code>\xFF</code>). Java’s <code>byte</code> type on the other hand is a signed 8 bit integer (-128 to 127), so you have to convert them accordingly in your custom <code>Reader</code>. Also, you should take care when you write your lexer spec: if you use text in there, it gets interpreted by an encoding first, and what scanner you get as result might depend on which platform you run JFlex on when you generate the scanner (this is what you want for text, but for binaries it gets in the way). If you are not sure, or if the development platform might change, it’s probably best to use character code escapes in all places, since they don’t change their meaning.</p>
<h1 id="unicoderegexconformance">Conformance with Unicode Regular Expressions UTS#18</h1>
<p>This section gives details about JFlex 1.7.0’s conformance with the requirements for Basic Unicode Support Level 1 given in UTS#18 <span class="citation">(Davis and Heninger 2013)</span>.</p>
<h2 id="rl1.1-hex-notation">RL1.1 Hex Notation</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall supply a mechanism for specifying any Unicode code point (from U+0000 to U+10FFFF), using the hexadecimal code point representation.</em></p>
</blockquote>
<p>JFlex conforms. Syntax is provided to express values across the whole range, via <code>\uXXXX</code>, where <code>XXXX</code> is a 4-digit hex value; <code>\Uyyyyyy</code>, where <code>yyyyyy</code> is a 6-digit hex value; and <code>\u{X+( X+)*}</code>, where <code>X+</code> is a 1-6 digit hex value.</p>
<h2 id="rl1.2-properties">RL1.2 Properties</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall provide at least a minimal list of properties, consisting of the following: General_Category, Script and Script_Extensions, Alphabetic, Uppercase, Lowercase, White_Space, Noncharacter_Code_Point, Default_Ignorable_Code_Point, ANY, ASCII, ASSIGNED.</em></p>
<p><em>The values for these properties must follow the Unicode definitions, and include the property and property value aliases from the UCD. Matching of Binary, Enumerated, Catalog, and Name values, must follow the Matching Rules from [UAX44].</em></p>
</blockquote>
<p>JFlex conforms. The minimal set of properties is supported, as well as a few others. To see the full list of supported properties, use the JFlex command line option <code>--uniprops &lt;ver&gt;</code>, where <code>&lt;ver&gt;</code> is the Unicode version. Loose matching is performed: case distinctions, whitespace, underscores and hyphens in property names and values are ignored.</p>
<h2 id="rl1.2a-compatibility-properties">RL1.2a Compatibility Properties</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall provide the properties listed in Annex C: Compatibility Properties, with the property values as listed there. Such an implementation shall document whether it is using the Standard Recommendation or POSIX-compatible properties.</em></p>
</blockquote>
<p>JFlex does not fully conform. The Standard Recommendation version of the Annex C Compatibility Properties is provided, with two exceptions: <code>\X</code> Extended Grapheme Clusters; and <code>\b</code> Default Word Boundaries.</p>
<h2 id="rl1.3-subtraction-and-intersection">RL1.3 Subtraction and Intersection</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall supply mechanisms for union, intersection and set-difference of Unicode sets.</em></p>
</blockquote>
<p>JFlex conforms by providing these mechanisms, as well as symmetric difference.</p>
<h2 id="rl1.4-simple-word-boundaries">RL1.4 Simple Word Boundaries</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall extend the word boundary mechanism so that:</em></p>
<ol style="list-style-type: decimal">
<li><p><em>The class of <code>&lt;word_character&gt;</code> includes all the Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the decimals (General_Category = Decimal_Number, or equivalently Numeric_Type = Decimal), and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER (Join_Control=True). See also Annex C: Compatibility Properties.</em></p></li>
<li><p><em>Nonspacing marks are never divided from their base characters, and otherwise ignored in locating boundaries.</em></p></li>
</ol>
</blockquote>
<p>JFlex does not conform: <code>\b</code> does not match simple word boundaries.</p>
<h2 id="rl1.5-simple-loose-matches">RL1.5 Simple Loose Matches</h2>
<blockquote>
<p><em>To meet this requirement, if an implementation provides for case-insensitive matching, then it shall provide at least the simple, default Unicode case-insensitive matching, and specify which properties are closed and which are not.</em></p>
<p><em>To meet this requirement, if an implementation provides for case conversions, then it shall provide at least the simple, default Unicode case folding.</em></p>
</blockquote>
<p>JFlex conforms. All supported Unicode Properties are closed.</p>
<h2 id="rl1.6-line-boundaries">RL1.6 Line Boundaries</h2>
<blockquote>
<p><em>To meet this requirement, if an implementation provides for line-boundary testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085), PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028).</em></p>
</blockquote>
<p>JFlex conforms.</p>
<h2 id="rl1.7-supplementary-code-points">RL1.7 Supplementary Code Points</h2>
<blockquote>
<p><em>To meet this requirement, an implementation shall handle the full range of Unicode code points, including values from U+FFFF to U+10FFFF. In particular, where UTF-16 is used, a sequence consisting of a leading surrogate followed by a trailing surrogate shall be handled as a single code point in matching.</em></p>
</blockquote>
<p>JFlex conforms.</p>
<h1 id="performance">A few words on performance</h1>
<p>This section gives tips on how to make your specification produce a faster scanner.</p>
<p>In general, the regular expression matching generated by JFlex has very good performance. It is DFA-based (deterministic finite automata) and does not require backtracking over alternatives as, for instance, Perl-style regular expression matching does. In the optimal case, each character is only examined once; in some situations explained below, a small amount of backtracking is necessary to determine the longest match.</p>
<p>Even within the class of DFA-based scanners, JFlex generated scanners usually show very good performance without special optimisations. The following lists a few heuristics that can make a lexical specification produce an even faster scanner. Those are (roughly in order of performance gain):</p>
<ul>
<li><p>Avoid rules that require backtracking</p>
<p>While there is no backtracking for expressions like <code>a|b</code> in JFlex, some backtracking is still introduced by the longest match rule and occurs for instance on this set of expressions:</p>
<pre><code>averylongkeyword
.</code></pre>
<p>With input <code>averylongjoke</code> the scanner has to read all characters up to <code>'j'</code> to decide that rule <code>.</code> should be matched. All characters of <code>verylong</code> have to be read again for the next matching process.</p>
<p>From the C/C++ flex <span class="citation">(Paxson 1995)</span> man page: <em>Getting rid of backtracking is messy and often may be an enormous amount of work for a complicated scanner.</em> Backtracking can be avoided in general by adding error rules that match those error conditions</p>
<pre><code>&quot;av&quot;|&quot;ave&quot;|&quot;avery&quot;|&quot;averyl&quot;|..</code></pre>
<p>While this is impractical in most scanners, there is still the possibility to add a <em>catch all</em> rule for a lengthy list of keywords</p>
<pre><code>&quot;keyword1&quot;  { return symbol(KEYWORD1); } 
.. 
&quot;keywordn&quot;  { return symbol(KEYWORDn); }
[a-z]+      { error(&quot;not a keyword&quot;); }</code></pre>
<p>Most programming language scanners already have a rule like this for some kind of variable length identifiers, which means this kind of backtracking for programming language scanners often concerns at most a single character.</p></li>
<li><p>Avoid line and column counting</p>
<p>It costs multiple additional comparisons per input character and the matched text has to be re-scanned for counting. In most scanners it is possible to do the line counting in the specification by incrementing <code>yyline</code> each time a line terminator has been matched. Column counting could also be included in actions. This will be faster, but can in some cases become quite messy.</p></li>
<li><p>Avoid look-ahead expressions and the end of line operator <code>$</code></p>
<p>In the best case, the trailing context will first have to be read and then (because it is not to be consumed) re-read. The cases of fixed-length look-ahead and fixed-length base expressions are handled efficiently by matching the concatenation and then pushing back the required amount of characters. This extends to the case of a disjunction of fixed-length look-ahead expressions such as <code>r1 / \r|\n|\r\n</code>. All other cases <code>r1 / r2</code> are handled by first scanning the concatenation of <code>r1</code> and <code>r2</code>, and then finding the correct end of <code>r1</code>. The end of <code>r1</code> is found by scanning forwards in the match again, marking all possible <code>r1</code> terminations, and then scanning the reverse of <code>r2</code> backwards from the end until a start of <code>r2</code> intersects with an end of <code>r1</code>. This algorithm is linear in the size of the input (not quadratic or worse as backtracking is), but about a factor of 2 slower than normal scanning. It also consumes memory proportional to the size of the matched input for <code>r1 r2</code>.</p></li>
<li><p>Avoid the beginning of line operator <code>^</code></p>
<p>It costs multiple additional comparisons per match. In some cases one extra look-ahead character is needed (when the last character read is <code>\r</code>, the scanner has to read one character ahead to check if the next one is an <code>\n</code> or not).</p></li>
<li><p>Match as much text as possible in a rule.</p>
<p>One rule is matched in the innermost loop of the scanner. After each action, setting up the internal state of the scanner is necessary and induces a small overhead.</p></li>
</ul>
<p>Note that writing more rules in a specification does <em>not</em> make the generated scanner slower.</p>
<p>The two main rules of optimisation apply also for lexical specifications:</p>
<ol style="list-style-type: decimal">
<li><strong>don’t do it</strong></li>
<li><strong>(for experts only) don’t do it yet</strong></li>
</ol>
<p>Some of the performance tips above contradict a readable and compact specification style. When in doubt or when requirements are not or not yet fixed: don’t use them — the specification can always be optimised in a later stage of the development process.</p>
<h1 id="Porting">Porting Issues</h1>
<h2 id="porting-from-jlex">Porting from JLex</h2>
<p>JFlex was designed to read old JLex specifications unchanged and to generate a scanner which behaves exactly the same as the one generated by JLex with the only difference of being faster.</p>
<p>This works as expected on all well formed JLex specifications.</p>
<p>Since the statement above is somewhat absolute, let’s take a look at what <em>well formed</em> means for this purpose. A JLex specification is well formed, when it</p>
<ul>
<li><p>generates a working scanner with JLex</p></li>
<li><p>doesn’t contain the unescaped characters <code>!</code> and <code>~</code></p>
<p>They are operators in JFlex while JLex treats them as normal input characters. You can easily port such a JLex specification to JFlex by replacing every <code>!</code> with <code>\!</code> and every <code>~</code> with <code>\~</code> in all regular expressions.</p></li>
<li><p>has only complete regular expressions surrounded by parentheses in macro definitions</p>
<p>This may sound a bit harsh, but is usually not a big problem – it can also help you find some disgusting bugs in your specification that went unnoticed so far. In JLex, the right hand side of a macro is just a piece of text that is copied to the point where the macro is used. With this, things like</p>
<pre><code>  macro1 = (&quot;hello&quot;
  macro2 = {macro1})*</code></pre>
<p>were possible (with <code>macro2</code> expanding to <code>(&quot;hello&quot;)*</code>). This is not allowed in JFlex and you will have to transform such definitions. There are more subtle kinds of errors that can be introduced by JLex macros. Consider a definition such as <code>macro = a|b</code> and a usage like <code>{macro}*</code>. This expands in JLex to <code>a|b*</code> and not to the probably intended <code>(a|b)*</code>.</p>
<p>Basically, JLex uses C-preprocessor style macros, whereas JFlex uses grammar definitions.</p>
<p>Most specifications shouldn’t suffer from this problem, because macros often only contain (harmless) character classes like <code>alpha = [a-zA-Z]</code> and more dangerous definitions like</p>
<p><code>ident = {alpha}({alpha}|{digit})*</code></p>
<p>are only used to write rules like</p>
<p><code>{ident}       { .. action .. }</code></p>
<p>and not more complex expressions like</p>
<p><code>{ident}*      { .. action .. }</code></p>
<p>where the kind of error presented above would show up.</p></li>
</ul>
<h2 id="porting-from-lexflex">Porting from lex/flex</h2>
<p>This section gives an incomplete overview of potential pitfalls and steps for porting a lexical specification from the C/C++ tools <code>lex</code> and <code>flex</code> <span class="citation">(Paxson 1995)</span> available on most Unix systems to JFlex.</p>
<p>Most of the C/C++ specific features are naturally not present in JFlex, but most “clean” lex/flex lexical specifications can be ported to JFlex without too much work.</p>
<h3 id="basic-structure">Basic structure</h3>
<p>A lexical specification for flex has the following basic structure:</p>
<pre><code>definitions
%%
rules
%%
user code</code></pre>
<p>The <code>user code</code> section usually contains C code that is used in actions of the <code>rules</code> part of the specification. For JFlex, this code will have to be translated to Java, and most of it will then go into the class code <code>%{..%}</code> directive in the <code>options and declarations</code> section.</p>
<h3 id="macros-and-regular-expression-syntax">Macros and Regular Expression Syntax</h3>
<p>The <code>definitions</code> section of a flex specification is quite similar to the <code>options and declarations</code> part of JFlex specs.</p>
<p>Macro definitions in flex have the form:</p>
<pre><code>&lt;identifier&gt;  &lt;expression&gt;</code></pre>
<p>To port them to JFlex macros, just insert a <code>=</code> between <code>&lt;identifier&gt;</code> and <code>&lt;expression&gt;</code>.</p>
<p>The syntax and semantics of regular expressions in flex are pretty much the same as in JFlex. Some attention is needed for escape sequences present in flex (such as <code>\a</code>) that are not supported in JFlex. These escape sequences should be transformed into their Unicode equivalents.</p>
<h3 id="character-classes">Character Classes</h3>
<p>Flex offers the character classes directly supported by C, JFlex offers the ones supported by Java. These classes will sometimes have to be listed manually.</p>
<p>In flex more special characters lose their meaning in character classes. In particular<a href="#fn2" class="footnoteRef" id="fnref2"><sup>2</sup></a>:</p>
<ul>
<li><p>in flex <code>[][]</code> is the character class containing <code>]</code> and <code>[</code>, whereas in JFlex, the expression means “empty expression” followed by “empty expression”. To get <code>]</code> and <code>[</code> in JFlex, use for instance <code>[\]\[]</code>.</p></li>
<li><p>the classes <code>[]</code> and <code>[^]</code> are illegal in flex, but have meaning in JFlex.</p></li>
<li><p>in flex <code>[&quot;]</code> is legal, in JFlex you need <code>[\&quot;]</code>.</p></li>
</ul>
<h3 id="lexical-rules">Lexical Rules</h3>
<p>Since flex is mostly Unix based, the ’<code>^</code>’ (beginning of line) and ’<code>$</code>’ (end of line) operators consider the <code>\n</code> character as the only line terminator. This should usually not cause many problems, but you should be prepared for occurrences of <code>\r</code> or <code>\r\n</code> or one of the characters <code>\u2028</code>, <code>\u2029</code>, <code>\u000B</code>, <code>\u000C</code>, or <code>\u0085</code>. They are considered to be line terminators in Unicode and therefore may not be consumed when <code>^</code> or <code>$</code> is present in a rule.</p>
<h1 id="WorkingTog">Working together</h1>
<h2 id="CUPWork">JFlex and CUP</h2>
<p>One of the design goals of JFlex was to make interfacing with the parser generators CUP <span class="citation">(Hudson 1996)</span> and CUP2 <span class="citation">(Petter 2008)</span> as easy as possible. This has been done by providing the <code>%cup</code> and <code>%cup2</code> directives in JFlex. However, each interface has two sides. This section concentrates on the CUP side of the story.</p>
<h3 id="cup2">CUP2</h3>
<p>Please refer to the CUP2 <span class="citation">(Petter 2008)</span> documentation, which provides instructions on how to interface with JFlex. The CUP2 JFlex patch provided there is not necessary any more for JFlex versions greater than 1.5.0.</p>
<h3 id="cup-version-0.10j-and-above">CUP version 0.10j and above</h3>
<p>Since CUP version 0.10j, interfacing with JFlex has been simplified greatly by the new CUP scanner interface <code>java_cup.runtime.Scanner</code>. JFlex lexers now implement this interface automatically when the <code>%cup</code> switch is used. There are no special <code>parser code</code>, <code>init code</code> or <code>scan with</code> options any more that you have to provide in your CUP parser specification. You can just concentrate on your grammar.</p>
<p>If your generated lexer has the class name <code>Scanner</code>, the parser is started from the main program like this:</p>
<pre><code>...
  try {
    parser p = new parser(new Scanner(new FileReader(fileName)));
    Object result = p.parse().value;
  }
  catch (Exception e) {
...</code></pre>
<h3 id="custom-symbol-interface">Custom symbol interface</h3>
<p>If you have used the <code>-symbol</code> command line switch of CUP to change the name of the generated symbol interface, you have to tell JFlex about this change of interface so that correct end-of-file code is generated. You can do so either by using an <code>%eofval{</code> directive or by using an <code>&lt;&lt;EOF&gt;&gt;</code> rule.</p>
<p>If your new symbol interface is called <code>mysym</code> for example, the corresponding code in the jflex specification would be either</p>
<pre><code>%eofval{
  return mysym.EOF;
%eofval}</code></pre>
<p>in the macro/directives section of the spec, or it would be</p>
<pre><code>  &lt;&lt;EOF&gt;&gt;  { return mysym.EOF; }</code></pre>
<p>in the rules section of your spec.</p>
<h3 id="using-existing-jflexcup-specifications-with-cup-0.10j-and-above">Using existing JFlex/CUP specifications with CUP 0.10j and above</h3>
<p>If you already have an existing specification and you would like to upgrade both JFlex and CUP to their newest version, you will probably have to adjust your specification.</p>
<p>The main difference between the <code>%cup</code> switch in JFlex 1.2.1 and lower, and more recent versions is that JFlex scanners now automatically implement the <code>java_cup.runtime.Scanner</code> interface. This means the scanning function changes its name from <code>yylex()</code> to <code>next_token()</code>.</p>
<p>The main difference from older CUP versions to 0.10j is that CUP now has a default constructor that accepts a <code>java_cup.runtime.Scanner</code> as argument and that uses this scanner as default (so no <code>scan with</code> code is necessary any more).</p>
<p>If you have an existing CUP specification, it will probably look somewhat like this:</p>
<pre><code>parser code {:
  Lexer lexer;

  public parser (java.io.Reader input) {
    lexer = new Lexer(input);
  }
:};

scan with {: return lexer.yylex(); :};</code></pre>
<p>To upgrade to CUP 0.10j, you could change it to look like this:</p>
<pre><code>parser code {:
  public parser (java.io.Reader input) {
    super(new Lexer(input));
  }
:};</code></pre>
<p>If you don’t mind changing the method that is calling the parser, you could remove the constructor entirely (and if there is nothing else in it, the whole <code>parser code</code> section). The main method calling the parser would then construct the parser as shown in the section above.</p>
<p>The JFlex specification does not need to be changed.</p>
<h2 id="BYaccJ">JFlex and BYacc/J</h2>
<p>JFlex has built-in support for the Java extension <a href="http://byaccj.sourceforge.net/">BYacc/J</a> <span class="citation">(Jamison, n.d.)</span> by Bob Jamison to the classical Berkeley Yacc parser generator. This section describes how to interface BYacc/J with JFlex. It builds on many helpful suggestions and comments from Larry Bell.</p>
<p>Since Yacc’s architecture is a bit different from CUP’s, the interface setup also works in a slightly different manner. BYacc/J expects a function <code>int yylex()</code> in the parser class that returns each next token. Semantic values are expected in a field <code>yylval</code> of type <code>parserval</code> where <code>parser</code> is the name of the generated parser class.</p>
<p>For a small calculator example, one could use a setup like the following on the JFlex side:</p>
<pre><code>%%

%byaccj

%{
  /* store a reference to the parser object */
  private parser yyparser;

  /* constructor taking an additional parser object */
  public Yylex(java.io.Reader r, parser yyparser) {
    this(r);
    this.yyparser = yyparser;
  }
%}

NUM = [0-9]+ (&quot;.&quot; [0-9]+)?
NL  = \n | \r | \r\n

%%

/* operators */
&quot;+&quot; | 
..
&quot;(&quot; | 
&quot;)&quot;    { return (int) yycharat(0); }

/* newline */
{NL}   { return parser.NL; }

/* float */
{NUM}  { yyparser.yylval = new parserval(Double.parseDouble(yytext()));
         return parser.NUM; }</code></pre>
<p>The lexer expects a reference to the parser in its constructor. Since Yacc allows direct use of terminal characters like <code>'+'</code> in its specifications, we just return the character code for single char matches (e.g. the operators in the example). Symbolic token names are stored as <code>public static int</code> constants in the generated parser class. They are used as in the <code>NL</code> token above. Finally, for some tokens, a semantic value may have to be communicated to the parser. The <code>NUM</code> rule demonstrates how.</p>
<p>A matching BYacc/J parser specification would look like this:</p>
<pre><code>%{
  import java.io.*;
%}
      
%token NL          /* newline  */
%token &lt;dval&gt; NUM  /* a number */

%type &lt;dval&gt; exp

%left &#39;-&#39; &#39;+&#39;
..
%right &#39;^&#39;         /* exponentiation */
      
%%

..
      
exp:     NUM          { $$ = $1; }
       | exp &#39;+&#39; exp  { $$ = $1 + $3; }
       ..
       | exp &#39;^&#39; exp  { $$ = Math.pow($1, $3); }
       | &#39;(&#39; exp &#39;)&#39;  { $$ = $2; }
       ;

%%
  /* a reference to the lexer object */
  private Yylex lexer;

  /* interface to the lexer */
  private int yylex () {
    int yyl_return = -1;
    try {
      yyl_return = lexer.yylex();
    }
    catch (IOException e) {
      System.err.println(&quot;IO error :&quot;+e);
    }
    return yyl_return;
  }

  /* error reporting */
  public void yyerror (String error) {
    System.err.println (&quot;Error: &quot; + error);
  }

  /* lexer is created in the constructor */
  public parser(Reader r) {
    lexer = new Yylex(r, this);
  }

  /* that&#39;s how you use the parser */
  public static void main(String args[]) throws IOException {
    parser yyparser = new parser(new FileReader(args[0]));
    yyparser.yyparse();    
  }</code></pre>
<p>Here, the customised part is mostly in the user code section. We create the lexer in the constructor of the parser and store a reference to it for later use in the parser’s <code>int yylex()</code> method. This <code>yylex</code> in the parser only calls <code>int yylex()</code> of the generated lexer and passes the result on. If something goes wrong, it returns -1 to indicate an error.</p>
<p>Runnable versions of the specifications above are located in the <code>examples/byaccj</code> directory of the JFlex distribution.</p>
<h2 id="jflex-and-jay">JFlex and Jay</h2>
<p>Combining JFlex with the <a href="http://www.cs.rit.edu/~ats/projects/lp/doc/jay/package-summary.html">Jay Parser Generator</a> <span class="citation">(Schreiner 2006)</span> is quite simple. The Jay Parser Generator defines an interface called <code>&lt;parsername&gt;.yyInput</code>. In the JFlex source the directive</p>
<pre><code>%implements &lt;parsername&gt;.yyInput</code></pre>
<p>tells JFlex to generate the corresponding class declaration.</p>
<p>The three interface methods to implement are</p>
<ul>
<li><p><code>advance()</code> which should return a boolean that is <code>true</code> if there is more work to do and <code>false</code> if the end of input has been reached,</p></li>
<li><p><code>token()</code> which returns the last scanned token, and</p></li>
<li><p><code>value()</code> which returns an Object that contains the (optional) value of the last read token.</p></li>
</ul>
<p>The following shows a small example with Jay parser specification and corresponding JFlex code. First of all the Jay code (in a file <code>MiniParser.jay</code>):</p>
<pre><code>%{
//
// Prefix Code like Package declaration, 
// imports, variables and the parser class declaration
// 

import java.io.*;
import java.util.*;

public class MiniParser 
{

%}

// Token declarations, and types of non-terminals

%token DASH COLON
%token &lt;Integer&gt; NUMBER

%token &lt;String&gt; NAME

%type &lt;Gameresult&gt; game
%type &lt;Vector&lt;Gameresult&gt;&gt; gamelist

// start symbol
%start gamelist

%%

gamelist: game        { $$ = new Vector&lt;Gameresult&gt;();
                        $&lt;Vector&lt;Gameresult&gt;&gt;$.add($1);
                      }
  |  gamelist game    { $1.add($2); }

game: NAME DASH NAME NUMBER COLON NUMBER {
      $$ = new Gameresult($1, $3, $4, $6); }

%%

  // supporting methods part of the parser class
  public static void main(String argv[])
  {
    MiniScanner scanner = new MiniScanner(new InputStreamReader(System.in));
    MiniParser parser = new MiniParser();
    try {
      parser.yyparse (scanner);
    } catch (final IOException ioe) {
      System.out.println(&quot;I/O Exception : &quot; + ioe.toString());
    } catch (final MiniParser.yyException ye) {
      System.out.println (&quot;Oops : &quot; + ye.toString());
    }
  }

} // closing brace for the parser class

class Gameresult {
  String homeTeam;
  String outTeam;
  Integer homeScore;
  Integer outScore;

  public Gameresult(String ht, String ot, Integer hs, Integer os)
  {
    homeTeam = ht;
    outTeam = ot;
    homeScore = hs;
    outScore = os;
  }
}</code></pre>
<p>The corresponding JFlex code (in a file <code>MiniScanner.jflex</code>) could be:</p>
<pre><code>%%

%public
%class MiniScanner
%implements MiniParser.yyInput
%integer

%line
%column
%unicode

%{
private int token;
private Object value;

// the next 3 methods are required to implement the yyInput interface

public boolean advance() throws java.io.IOException {
  value = new String(&quot;&quot;);
  token = yylex();
  return (token != YYEOF);
}

public int token() {
  return token;
}

public Object value() {
  return value;
}

%}

nl =     [\n\r]+
ws =     [ \t\b\015]+
number = [0-9]+
name =   [a-zA-Z]+
dash =   &quot;-&quot;
colon =  &quot;:&quot;

%%

{nl}      { /* do nothing */ }
{ws}      { /* happy meal */ }
{name}    { value = yytext(); return MiniParser.NAME; }
{dash}    { return MiniParser.DASH; }
{colon}   { return MiniParser.COLON; }
{number}  { try  {
              value = new Integer(Integer.parseInt(yytext()));
            } catch (NumberFormatException nfe) {
              // shouldn&#39;t happen
              throw new Error();
            }
            return MiniParser.NUMBER;
          }</code></pre>
<p>This small example reads input like the following:</p>
<pre><code>Borussia - Schalke 3:2
ACMilano - Juventus 1:4</code></pre>
<h1 id="Bugs">Bugs and Deficiencies</h1>
<h2 id="deficiencies">Deficiencies</h2>
<p>JFlex 1.7.0 conforms with Unicode Regular Expressions UTS #18 <span class="citation">(Davis and Heninger 2013)</span> Basic Unicode Support Level 1, with a few exceptions – for details, see <a href="#unicoderegexconformance">UTS # 18 Conformance</a>.</p>
<h2 id="bugs">Bugs</h2>
<p>As of 21 September 2018, no major open problems are known for JFlex version 1.7.0.</p>
<p>Please consult the JFlex <a href="https://github.com/jflex-de/jflex/labels/bug">GitHub issue tracker</a> for any problems that have been reported since then.</p>
<h1 id="Copyright">Copying and License</h1>
<p>JFlex is free software, published under a BSD-style license.</p>
<p>There is <strong>NO WARRANTY</strong> for JFlex, its code and its documentation.</p>
<p>See the file <a href="COPYRIGHT"><code>COPYRIGHT</code></a> for more information.</p>
<div id="refs" class="references">
<h1 id="references" class="unnumbered">References</h1>
<div id="ref-Aho_SU_86">
<p>Aho, Alfred V., Ravi Sethi, and Jeffrey D. Ullman. 1986. <em>Compilers – Principles, Techniques, and Tools</em>. Addison-Wesley.</p>
</div>
<div id="ref-Appel_98">
<p>Appel, Andrew W. 1998. <em>Modern Compiler Implementation in Java: Basic Techniques</em>. Cambridge University Press.</p>
</div>
<div id="ref-JLex">
<p>Berk, Elliot. 1996. “JLex: A Lexical Analyzer Generator for Java.” <a href="http://www.cs.princeton.edu/~appel/modern/java/JLex/" class="uri">http://www.cs.princeton.edu/~appel/modern/java/JLex/</a>.</p>
</div>
<div id="ref-unicode_rep">
<p>Davis, Mark, and Andy Heninger. 2013. “Unicode Regular Expressions.” <a href="http://www.unicode.org/reports/tr18/tr18-17.html" class="uri">http://www.unicode.org/reports/tr18/tr18-17.html</a>.</p>
</div>
<div id="ref-LangSpec">
<p>Gosling, James, Bill Joy, and Guy Steele. 1996. <em>The Java Language Specification</em>. Addison-Wesley. <a href="https://docs.oracle.com/javase/specs/" class="uri">https://docs.oracle.com/javase/specs/</a>.</p>
</div>
<div id="ref-CUP">
<p>Hudson, Scott E. 1996. “CUP LALR Parser Generator for Java.” <a href="http://www2.cs.tum.edu/projects/cup/" class="uri">http://www2.cs.tum.edu/projects/cup/</a>.</p>
</div>
<div id="ref-BYaccJ">
<p>Jamison, Bob. n.d. “BYacc/J.” <a href="http://byaccj.sourceforge.net" class="uri">http://byaccj.sourceforge.net</a>.</p>
</div>
<div id="ref-flex">
<p>Paxson, Vern. 1995. “Flex - the Fast Lexical Analyzer Generator.” <a href="http://flex.sourceforge.net" class="uri">http://flex.sourceforge.net</a>.</p>
</div>
<div id="ref-CUP2">
<p>Petter, Michael. 2008. “CUP2 User Manual.” <a href="http://www2.in.tum.de/cup2" class="uri">http://www2.in.tum.de/cup2</a>.</p>
</div>
<div id="ref-Jay">
<p>Schreiner, Axel T. 2006. “Jay Parser Generator.” <a href="http://www.cs.rit.edu/~ats/projects/lp/doc/jay/package-summary.html" class="uri">http://www.cs.rit.edu/~ats/projects/lp/doc/jay/package-summary.html</a>.</p>
</div>
<div id="ref-UnicodeSet">
<p>“Unicode Utilities: UnicodeSet.” 2015. <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp" class="uri">http://unicode.org/cldr/utility/list-unicodeset.jsp</a>.</p>
</div>
</div>
<div class="footnotes">
<hr />
<ol>
<li id="fn1"><p>Java is a trademark of Sun Microsystems, Inc., and refers to Sun’s Java programming language. JFlex is not sponsored by or affiliated with Sun Microsystems, Inc.<a href="#fnref1">↩</a></p></li>
<li id="fn2"><p>Thanks to Dimitri Maziuk for pointing these out.<a href="#fnref2">↩</a></p></li>
</ol>
</div>
</body>
</html>