1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
|
package MARC::Charset::Code;
use strict;
use warnings;
use base qw(Class::Accessor);
use Carp qw(croak);
use Encode qw(encode_utf8);
use MARC::Charset::Constants qw(:all);
MARC::Charset::Code
->mk_accessors(qw(marc ucs name charset is_combining));
=head1 NAME
MARC::Charset::Code - represents a MARC-8/UTF-8 mapping
=head1 SYNOPSIS
=head1 DESCRIPTION
Each mapping from a MARC-8 value to a UTF-8 value is represented by
a MARC::Charset::Code object in a MARC::Charset::Table.
=head1 METHODS
=head2 new()
The constructor.
=head2 name()
A descriptive name for the code point.
=head2 marc()
A string representing the MARC-8 bytes codes.
=head2 ucs()
A string representing the UCS code point in hex.
=head2 charset_code()
The MARC-8 character set code.
=head2 is_combining()
Returns true/false to tell if the character is a combining character.
=head2 to_string()
A stringified version of the object suitable for pretty printing.
=head2 char_value()
Returns the unicode character. Essentially just a helper around
ucs().
=cut
sub char_value()
{
return chr(hex(shift->ucs()));
}
=head2 marc_value()
The string representing the MARC-8 encoding.
=cut
sub marc_value
{
my $code = shift;
my $marc = $code->marc();
return chr(hex($marc)) unless $code->charset_name eq 'CJK';
return
chr(hex(substr($marc,0,2))) .
chr(hex(substr($marc,2,2))) .
chr(hex(substr($marc,4,2)));
}
=head2 charset_name()
Returns the name of the character set, instead of the code.
=cut
sub charset_name()
{
return MARC::Charset::Constants::charset_name(shift->charset_value());
}
=head2 to_string()
Returns a stringified version of the object.
=cut
sub to_string
{
my $self = shift;
my $str =
$self->name() . ': ' .
'charset_code=' . $self->charset() . ' ' .
'marc=' . $self->marc() . ' ' .
'ucs=' . $self->ucs() . ' ';
$str .= ' combining' if $self->is_combining();
return $str;
}
=head2 marc8_hash_code()
Returns a hash code for this Code object for looking up the object using
MARC8. First portion is the character set code and the second is the
MARC-8 value.
=cut
sub marc8_hash_code
{
my $self = shift;
return sprintf('%s:%s', $self->charset_value(), $self->marc_value());
}
=head2 utf8_hash_code()
Returns a hash code for uniquely identifying a Code by it's UCS value.
=cut
sub utf8_hash_code
{
return int(hex(shift->ucs()));
}
=head2 default_charset_group
Returns 'G0' or 'G1' indicating where the character is typicalling used
in the MARC-8 environment.
=cut
sub default_charset_group
{
my $charset = shift->charset_value();
return 'G0'
if $charset eq ASCII_DEFAULT
or $charset eq GREEK_SYMBOLS
or $charset eq SUBSCRIPTS
or $charset eq SUPERSCRIPTS
or $charset eq BASIC_LATIN
or $charset eq BASIC_ARABIC
or $charset eq BASIC_CYRILLIC
or $charset eq BASIC_GREEK
or $charset eq BASIC_HEBREW
or $charset eq CJK;
return 'G1';
}
=head2 get_marc8_escape
Returns an escape sequence to move to the Code from another marc-8 character
set.
=cut
sub get_escape
{
my $charset = shift->charset_value();
return ESCAPE . $charset
if $charset eq ASCII_DEFAULT
or $charset eq GREEK_SYMBOLS
or $charset eq SUBSCRIPTS
or $charset eq SUPERSCRIPTS;
return ESCAPE . SINGLE_G0_A . $charset
if $charset eq ASCII_DEFAULT
or $charset eq BASIC_LATIN
or $charset eq BASIC_ARABIC
or $charset eq BASIC_CYRILLIC
or $charset eq BASIC_GREEK
or $charset eq BASIC_HEBREW;
return ESCAPE . SINGLE_G1_A . $charset
if $charset eq EXTENDED_ARABIC
or $charset eq EXTENDED_LATIN
or $charset eq EXTENDED_CYRILLIC;
return ESCAPE . MULTI_G0_A . CJK
if $charset eq CJK;
}
=head2 charset_value
Returns the charset value, not the hex sequence.
=cut
sub charset_value
{
return chr(hex(shift->charset()));
}
1;
|