File: Compiler.pm

package info (click to toggle)
libmarc-charset-perl 0.95-1etch1
  • links: PTS
  • area: main
  • in suites: etch
  • size: 2,444 kB
  • ctags: 75
  • sloc: xml: 98,939; perl: 612; makefile: 52
file content (145 lines) | stat: -rw-r--r-- 2,913 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
package MARC::Charset::Compiler; 

=head1 NAME

MARC::Charset::Compiler - compile XML mapping rules from LoC

=head1 SYNOPSIS

    $compiler = MARC::Charset::Compiler->new();
    $table = $compiler->compile('codetables.xml');

=head1 DESCRIPTION

MARC::Charset uses mapping rules from the Library of Congress for
generating a MARC::Charset::Table for looking up utf8 values based on the 
source MARC-8 character set and the character.

=head1 METHODS

=cut

use strict;
use warnings;

use base qw( XML::SAX::Base );
use XML::SAX::ParserFactory;
use Unicode::UCD qw(charinfo);
use MARC::Charset::Table;
use MARC::Charset::Code;


=head1 new()

The constructor.

=cut

sub new 
{
    my $self = bless {}, 'MARC::Charset::Compiler';
    $self->{table} = MARC::Charset::Table->brand_new();
    $self->{current_code} = undef;
    $self->{text} = '';
    return $self;
}


=head1 compile()

Pass in the path to an XML file to compile.

=cut

sub compile 
{
    my ($self, $file) = @_;

    my $factory = XML::SAX::ParserFactory->new();
    my $parser = $factory->parser(Handler => $self);
    $parser->parse_uri($file);
}


## SAX event handlers are below

sub start_element 
{
    my ($self, $data) = @_;
    my $name = $data->{Name};
    if ($name eq 'code')
    {
        $self->{current_code} = MARC::Charset::Code->new();
    }
    elsif ($name eq 'characterSet')
    {
        my $charset = $data->{Attributes}{'{}ISOcode'}{Value};
        warn('missing ISOcode in characterSet element') unless $charset;
        $self->{current_charset} = $charset;
    }
}


sub end_element
{
    my ($self, $data) = @_;
    my $name = $data->{Name};

    # normalize some names for method lookup
    $name = 'is_combining' if $name eq 'isCombining';

    # get the existing code if we have one
    my $code = $self->{current_code};

    # if we're ending a code element
    if ($code and $name eq 'code')
    {
        # set the charset code
        $code->charset($self->{current_charset});

        # lookup the name from perl's character db
        my $info = charinfo(hex($code->ucs()));
        $code->name($info->{name}) if $info;

        # add it to the table
        $self->{table}->add_code($code);

        # start with a clean slate
        $self->{current_code} = undef;
    }
   
    # add these elements
    elsif ($code and $name =~ /marc|ucs|is_combining/)
    {
        $code->$name($self->text());
    }

    # ending an element so forget all text
    $self->{text} = '';
}


sub characters 
{
    my ($self, $data) = @_;
    return unless $self->{current_code};
    my $text = $data->{Data};
    $self->{text} .= $data->{Data};
}


sub text 
{
    my $text = shift->{text};
    # collapse whitespace
    $text =~ s/\s\s+/ /g;
    # strip new lines
    $text =~ s/[\r\n]//g;
    # strip leading whitespace
    $text =~ s/^\s+//;
    # strip trailing whitespace
    $text =~ s/\s+$//;
    return $text;
}

1;