File: Lite.pm

package info (click to toggle)
movabletype-opensource 4.2.3-1%2Blenny3
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 21,268 kB
  • ctags: 15,862
  • sloc: perl: 178,892; php: 26,178; sh: 161; makefile: 82
file content (203 lines) | stat: -rw-r--r-- 6,497 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# ======================================================================
#
# Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com)
# SOAP::Lite is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# $Id: Lite.pm,v 1.2 2005/02/22 01:47:42 byrnereese Exp $
#
# ======================================================================

package XML::Parser::Lite;

use strict;
use vars qw($VERSION);
#$VERSION = sprintf("%d.%s", map {s/_//g; $_} q$Name:  $ =~ /-(\d+)_([\d_]+)/);
$VERSION = '0.65_3';

sub new { 
  my $self = shift;
  my $class = ref($self) || $self;
  return $self if ref $self;

  $self = bless {} => $class;
  my %parameters = @_;
  $self->setHandlers(); # clear first 
  $self->setHandlers(%{$parameters{Handlers} || {}});
  return $self;
}

sub setHandlers {
  my $self = shift; 
  no strict 'refs'; local $^W;
  # clear all handlers if called without parameters
  unless (@_) { foreach (qw(Start End Char Final Init)) { *$_ = sub {} } }
  while (@_) { my($name => $func) = splice(@_, 0, 2); *$name = defined $func ? $func : sub {} }
  return $self;
}

sub regexp {
  my $patch = shift || '';
  my $package = __PACKAGE__;

  # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html 

  # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
  # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
  # Copyright (c) 1998, Robert D. Cameron. 
  # The following code may be freely used and distributed provided that
  # this copyright and citation notice remains intact and that modifications
  # or additions are clearly identified.

  my $TextSE = "[^<]+";
  my $UntilHyphen = "[^-]*-";
  my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
  my $CommentCE = "$Until2Hyphens>?";
  my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
  my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
  my $S = "[ \\n\\t\\r]+";
  my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
  my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
  my $Name = "(?:$NameStrt)(?:$NameChar)*";
  my $QuoteSE = "\"[^\"]*\"|'[^']*'";
  my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
  my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
  my $S1 = "[\\n\\r\\t ]";
  my $UntilQMs = "[^?]*\\?+";
  my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
  my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
  my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
  my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
  my $PI_CE = "$Name(?:$PI_Tail)?";

  # these expressions were modified for backtracking and events
  my $EndTagCE = "($Name)(?{${package}::end(\$2)})(?:$S)?>";
  my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
  my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$3)})";
  my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";

  # Next expression is under "black magic".
  # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
  # but it doesn't work under Perl 5.005 and only magic with
  # (?:....)?? solved the problem. 
  # I would appreciate if someone let me know what is the right thing to do 
  # and what's the reason for all this magic. 
  # Seems like a problem related to (?:....)? rather than to ?{} feature.
  # Tests are in t/31-xmlparserlite.t if you decide to play with it.
  "(?:($TextSE)(?{${package}::char(\$1)}))$patch|$MarkupSPE";
}

sub compile { local $^W; 
  # try regexp as it should be, apply patch if doesn't work
  foreach (regexp(), regexp('??')) {
    eval qq{sub parse_re { use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die;
    last if eval { parse_re('<foo>bar</foo>'); 1 }
  };

  *compile = sub {};
}

setHandlers();
compile();

sub parse { 
  init(); 
  parse_re($_[1]);
  final(); 
}

my(@stack, $level);

sub init { 
  @stack = (); $level = 0;
  Init(__PACKAGE__, @_);  
}

sub final { 
  die "not properly closed tag '$stack[-1]'\n" if @stack;
  die "no element found\n" unless $level;
  Final(__PACKAGE__, @_) 
} 

sub start { 
  die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack;
  push(@stack, $_[0]);
  Start(__PACKAGE__, @_); 
}

sub char { 
  Char(__PACKAGE__, $_[0]), return if @stack;

  # check for junk before or after element
  # can't use split or regexp due to limitations in ?{} implementation, 
  # will iterate with loop, but we'll do it no more than two times, so
  # it shouldn't affect performance
  for (my $i=0; $i < length $_[0]; $i++) {
    die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n"
      if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
  }
}

sub end { 
  pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n";
  End(__PACKAGE__, $_[0]);
}

# ======================================================================

1;

__END__

=head1 NAME

XML::Parser::Lite - Lightweight regexp-based XML parser

=head1 SYNOPSIS

  use XML::Parser::Lite;
  
  $p1 = new XML::Parser::Lite;
  $p1->setHandlers(
    Start => sub { shift; print "start: @_\n" },
    Char => sub { shift; print "char: @_\n" },
    End => sub { shift; print "end: @_\n" },
  );
  $p1->parse('<foo id="me">Hello World!</foo>');

  $p2 = new XML::Parser::Lite
    Handlers => {
      Start => sub { shift; print "start: @_\n" },
      Char => sub { shift; print "char: @_\n" },
      End => sub { shift; print "end: @_\n" },
    }
  ;
  $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>');

=head1 DESCRIPTION

This Perl module gives you access to XML parser with interface similar to
XML::Parser interface. Though only basic calls are supported (init, final,
start, char, and end) you should be able to use it in the same way you use
XML::Parser. Due to using experimantal regexp features it'll work only on
Perl 5.6 and may behave differently on different platforms.
 
=head1 SEE ALSO

 XML::Parser

=head1 COPYRIGHT

Copyright (C) 2000-2001 Paul Kulchenko. All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
Copyright (c) 1998, Robert D. Cameron.

=head1 AUTHOR

Paul Kulchenko (paulclinger@yahoo.com)

=cut