1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
|
#!/usr/bin/env perl
#############################################################################
# Name: regex.pl
# Purpose: Generate test code for wxRegEx from 'reg.test'
# Author: Mike Wetherell
# Copyright: (c) Mike Wetherell
# Licence: wxWindows licence
#############################################################################
#
# Notes:
# See './regex.pl -h' for usage
#
# Output at the moment is C++ using the cppunit testing framework. The
# language/framework specifics are separated, with the following 5
# subs as an interface: 'begin_output', 'begin_section', 'write_test',
# 'end_section' and 'end_output'. So for a different language/framework,
# implement 5 new similar subs.
#
# I've avoided using 'use encoding "UTF-8"', since this wasn't available
# in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
# earler than perl 5.6.0 aren't going to work.
#
use strict;
use warnings;
use File::Basename;
#use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
# if 0 output is wide characters, if 1 output is utf8 encoded
my $utf = 1;
# quote a parameter (C++ helper)
#
sub quotecxx {
my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
"\n" => "n", "\r" => "r", "\t" => "t",
"\013" => "v", '"' => '"', "\\" => "\\" );
# working around lack of 'use encoding'
if (!$utf) {
$_ = pack "U0C*", unpack "C*", $_;
use utf8;
}
s/[\000-\037"\\\177-\x{ffff}]/
if ($esc{$&}) {
"\\$esc{$&}";
} elsif (ord($&) > 0x9f && !$utf) {
sprintf "\\u%04x", ord($&);
} else {
sprintf "\\%03o", ord($&);
}
/ge;
# working around lack of 'use encoding'
if (!$utf) {
no utf8;
$_ = pack "C*", unpack "C*", $_;
}
return ($utf ? '"' : 'L"') . $_ . '"'
}
# start writing the output code (C++ interface)
#
sub begin_output {
my ($from, $instructions) = @_;
# embed it in the comment
$from = "\n$from";
$from =~ s/^(?: )?/ * /mg;
# $instructions contains information about the flags etc.
if ($instructions) {
$instructions = "\n$instructions";
$instructions =~ s/^(?: )?/ * /mg;
}
my $u = $utf ? " (UTF-8 encoded)" : "";
print <<EOT;
/*
* Test data for wxRegEx$u
$from$instructions */
EOT
}
my @classes;
# start a new section (C++ interface)
#
sub begin_section {
my ($id, $title) = @_;
my $class = "regextest_$id";
$class =~ s/\W/_/g;
push @classes, [$id, $class];
print <<EOT;
/*
* $id $title
*/
class $class : public RegExTestSuite
{
public:
$class() : RegExTestSuite("regex.$id") { }
static Test *suite();
};
Test *$class\::suite()
{
RegExTestSuite *suite = new $class;
EOT
}
# output a test line (C++ interface)
#
sub write_test {
my @args = @_;
$_ = quotecxx for @args;
print " suite->add(" . (join ', ', @args) . ", NULL);\n";
}
# end a section (C++ interface)
#
sub end_section {
my ($id, $class) = @{$classes[$#classes]};
print <<EOT;
return suite;
}
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
EOT
}
# finish off the output (C++ interface)
#
sub end_output {
print <<EOT;
/*
* A suite containing all the above suites
*/
class regextest : public TestSuite
{
public:
regextest() : TestSuite("regex") { }
static Test *suite();
};
Test *regextest::suite()
{
TestSuite *suite = new regextest;
EOT
print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
print <<EOT;
return suite;
}
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
EOT
}
# Parse a tcl string. Handles curly quoting and double quoting.
#
sub parsetcl {
my ($curly, $quote);
# recursively defined expression that can parse balanced braces
# warning: uses experimental features of perl, see perlop(1)
$curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
$quote = qr/"(?:\\"|[^"])*"/;
my @tokens = shift =~ /($curly|$quote|\S+)/g;
# now remove braces/quotes and unescape any escapes
for (@tokens) {
if (s/^{(.*)}$/$1/) {
# for curly quoting, only unescape \{ and \}
s/\\([{}])/$1/g;
} else {
s/^"(.*)"$/$1/;
# unescape any escapes
my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
"n" => "\n", "r" => "\r", "t" => "\t",
"v" => "\013" );
my $x = qr/[[:xdigit:]]/;
s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
if ($1 =~ m{^([0-7]+)}) {
chr(oct($1));
} elsif ($1 =~ m{^x($x+)}) {
pack("C0U", hex($1) & 0xff);
} elsif ($1 =~ m{^u($x+)}) {
pack("C0U", hex($1));
} elsif ($esc{$1}) {
$esc{$1};
} else {
$1;
}
/ge;
}
}
return @tokens;
}
# helpers which keep track of whether begin_section has been called, so that
# end_section can be called when appropriate
#
my @doing = ("0", "");
my $in_section = 0;
sub handle_doing {
end_section if $in_section;
$in_section = 0;
@doing = @_;
}
sub handle_test {
begin_section(@doing) if !$in_section;
$in_section = 1;
write_test @_;
}
sub handle_end {
end_section if $in_section;
$in_section = 0;
end_output;
}
# 'main' - start by parsing the command lines options.
#
my $badoption = !@ARGV;
my $utfdefault = $utf;
my $outputname;
for (my $i = 0; $i < @ARGV; ) {
if ($ARGV[$i] !~ m{^-.}) {
$i++;
next;
}
if ($ARGV[$i] eq '--') {
splice @ARGV, $i, 1;
last;
}
if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
$outputname = $2 || splice @ARGV, $i + 1, 1;
}
for (split //, substr($ARGV[$i], 1)) {
if (/u/i) { # -u : utf-8 output
$utf = 1;
} elsif (/w/i) { # -w : wide char output
$utf = 0;
} else {
$badoption = 1;
}
}
splice @ARGV, $i, 1;
}
# Display help
#
if ($badoption) {
my $prog = basename $0;
my ($w, $u) = (" (default)", " ");
($w, $u) = ($u, $w) if $utfdefault;
print <<EOT;
Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
Generate test code for wxRegEx from 'reg.test'
Example: $prog -o regex.inc reg.test wxreg.test
-w$w Output will be wide characters.
-u$u Output will be UTF-8 encoded.
Input files should be in UTF-8. If no input files are specified input is
read from stdin. If no output file is specified output is written to stdout.
See the comments in reg.test for details of the input file format.
EOT
exit 0;
}
# Open the output file
#
open STDOUT, ">$outputname" if $outputname;
# Read in the files and initially parse just the comments for copyright
# information and instructions on the tests
#
my @input; # slurped input files stripped of comments
my $files = ""; # copyright info from the input comments
my $instructions = ""; # test instructions from the input comments
do {
my $inputname = basename $ARGV[0] if @ARGV;
# slurp input
undef $/;
my $in = <>;
# remove escaped newlines
$in =~ s/(?<!\\)\\\n//g;
# record the copyrights of the input files
for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
s/[\s:]+/ /g;
$files .= " ";
$files .= $inputname . ": " if $inputname && $inputname ne '-';
$files .= "$_\n";
}
# Parse the comments for instructions on the tests, which look like this:
# i successful match with -indices (used in checking things like
# nonparticipating subexpressions)
if (!$instructions) {
my $sp = qr{\t| +}; # tab or three or more spaces
my @instructions = $in =~
/\n(
(?:
\#$sp\S?$sp\S[^\n]+\n # instruction line
(?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
)+
)/gx;
if (@instructions) {
$instructions[0] = "Test types:\n$instructions[0]";
if (@instructions > 1) {
$instructions[1] = "Flag characters:\n$instructions[1]";
}
$instructions = join "\n", @instructions;
$instructions =~ s/^#([^\t]?)/ $1/mg;
}
}
# @input is the input of all files (stipped of comments)
$in =~ s/^#.*$//mg;
push @input, $in;
} while $ARGV[0];
# Make a string naming the generator, the input files and copyright info
#
my $from = "Generated " . localtime() . " by " . basename $0;
$from =~ s/[\s]+/ /g;
if ($files) {
if ($files =~ /:/) {
$from .= " from the following files:";
} else {
$from .= " from work with the following copyright:";
}
}
$from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
$from .= "\n$files" if $files;
# Now start to print the code
#
begin_output $from, $instructions;
# numbers for 'extra' sections
my $extra = 1;
for (@input)
{
# Print the main tests
#
# Test lines look like this:
# m 3 b {\(a\)b} ab ab a
#
# Also looks for heading lines, e.g.:
# doing 4 "parentheses"
#
for (split "\n") {
if (/^doing\s+(\S+)\s+(\S.*)/) {
handle_doing parsetcl "$1 $2";
} elsif (/^[efimp]\s/) {
handle_test parsetcl $_;
}
}
# Extra tests
#
# The expression below matches something like this:
# test reg-33.8 {Bug 505048} {
# regexp -inline {\A\s*[^b]*b} ab
# } ab
#
# The three subexpressions then return these parts:
# $extras[$i] = '{Bug 505048}',
# $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
# $extras[$i + 2] = 'ab'
#
my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
\s*regexp\s+([^\n]+)\n # line 2
\}\s*(\S[^\n]*)/gx; # line 3
handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
for (my $i = 0; $i < @extras; $i += 3) {
my $id = $extras[$i];
# further parse the middle line into options and the rest (i.e. $args)
my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
my @args = parsetcl $args;
$#args = 1; # only want the first two
# now handle the options
my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
# get them all in the right order and print
unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
push @args, parsetcl(parsetcl($results)) if $results;
handle_test @args;
}
}
# finish
#
handle_end;
|