File: unicode_short_charclass.t

package info (click to toggle)
libppix-regexp-perl 0.090-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,524 kB
  • sloc: perl: 8,022; makefile: 8
file content (56 lines) | stat: -rw-r--r-- 1,361 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package main;

use 5.006;

use strict;
use warnings;

use PPIx::Regexp;
use Test::More 0.88;	# Because of done_testing();

# Emit, as a test note, the author's rationale for scraping perluniprops
# and a pointer to the regular expression this test is guarding. The
# here-document is single-quoted, so its text is not interpolated.
note <<'EOD';

Scraping perluniprops seems fragile because it is, but I can not think
of better way to find out what all the single-character property names
are. If this breaks too often I may end up going to just matching
[[:upper:]] or something like that. Note to self: the relevant regular
expression is in PPIx::Regexp::Token::CharClass::Simple method
__PPIX_TOKENIZER__regexp()

EOD

# Map each single-character property name to the first multi-character
# property name that follows it in the same line of perldoc output.
my %prop;

{
    # Capture the output and the exit status together, before anything
    # else has a chance to overwrite $?.
    my @lines = `perldoc -oText perluniprops`;
    my $status = $?;

    # A perldoc that can not be run (or that fails) would otherwise only
    # show up later as an unexplained mismatch against the expected
    # property list. Report it here; the scrape is still attempted on
    # whatever output was produced.
    $status
        and diag "'perldoc -oText perluniprops' exited with status ",
            "$status; the scraped property list may be empty or incomplete";

    foreach my $line ( @lines ) {
        # In boolean context the list assignment yields the number of
        # captures, which is zero when the line does not match.
        my ( $short, $long ) = $line =~
            m/ \\p [{] ( . ) [}] .*? \\p [{] ( .{2,}? ) [}] /smx
            or next;

        # Only the first long name seen for a given short name is kept.
        $prop{$short} ||= $long;
    }
}

# The scrape has to produce exactly these single-character property
# names, each paired with the long name recorded for it; anything
# missing, extra or different fails the comparison.
my %want = (
    C => 'Other',
    L => 'Letter',
    M => 'Mark',
    N => 'Number',
    P => 'Punct',
    S => 'Symbol',
    Z => 'Separator',
);

is_deeply \%prop, \%want,
    'All single-character properties are accounted for';

# Every scraped single-character property must also work without curly
# brackets: the regular expression /\pX/ has to yield exactly one child,
# a PPIx::Regexp::Token::CharClass::Simple whose content is \pX.
foreach my $letter ( sort keys %prop ) {
    my $token = "\\p$letter";
    my $text = "/$token/";

    # Neither call is taken on trust. If no object comes back, record a
    # failed test for this letter and carry on, rather than dying on a
    # method call against an undefined value and abandoning the
    # remaining letters.
    my $pre = PPIx::Regexp->new( $text );
    my $re = defined $pre ? $pre->regular_expression() : undef;
    unless ( defined $re ) {
        fail "'$text' parsed to a regular expression";
        next;
    }

    my @kids = $re->children();
    cmp_ok scalar( @kids ), '==', 1, "'$text' parsed to a single token";

    # With no children there is no token to examine; the count check
    # above has already recorded the failure.
    @kids
        or next;

    cmp_ok $kids[0]->content(), 'eq', $token, "'$text' contains token $token";
    isa_ok $kids[0], 'PPIx::Regexp::Token::CharClass::Simple',
        "Token $token";
}

done_testing;

1;

# ex: set textwidth=72 :