File: utf8.t

package info (click to toggle)
libmarc-charset-perl 1.35-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 2,476 kB
  • sloc: xml: 99,038; perl: 774; makefile: 9
file content (126 lines) | stat: -rw-r--r-- 2,846 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use strict;
use warnings;
use Test::More qw(no_plan);

use MARC::Charset::Constants ':all';
use MARC::Charset 'utf8_to_marc8';

## MAKE SURE ALL THE CHARACTER SETS ARE THERE
##
## One single-character conversion per character set exercised here.
## Each row is [ Unicode code point, expected MARC-8 string, test name ].
{
    ## Sequences the expected output ends with once the character itself
    ## has been emitted.
    my $back_to_ascii     = ESCAPE . ASCII_DEFAULT;
    my $back_to_ext_latin = ESCAPE . SINGLE_G1_A . EXTENDED_LATIN;

    my @charset_cases = (
        [ 0x0041, chr(0x41), 'ASCII' ],
        [ 0x0131, chr(0xB8), 'Ansel' ],
        [
            0x0628,
            ESCAPE . SINGLE_G0_A . BASIC_ARABIC . chr(0x48) . $back_to_ascii,
            'Basic Arabic',
        ],
        [
            0x068D,
            ESCAPE . SINGLE_G1_A . EXTENDED_ARABIC . chr(0xB9)
                . $back_to_ext_latin,
            'Extended Arabic',
        ],
        [
            0x0440,
            ESCAPE . SINGLE_G0_A . BASIC_CYRILLIC . chr(0x52) . $back_to_ascii,
            'Basic Cyrillic',
        ],
        [
            0x0408,
            ESCAPE . SINGLE_G1_A . EXTENDED_CYRILLIC . chr(0xE8)
                . $back_to_ext_latin,
            'Extended Cyrillic',
        ],
        [
            0x0398,
            ESCAPE . SINGLE_G0_A . BASIC_GREEK . chr(0x4B) . $back_to_ascii,
            'Greek',
        ],

        ## note: we skip Greek Symbols since when mapping from utf8 to marc8
        ## we always use the Greek character set instead

        [
            0x05E0,
            ESCAPE . SINGLE_G0_A . BASIC_HEBREW . chr(0x70) . $back_to_ascii,
            'Hebrew',
        ],
        [
            0x2083,
            ESCAPE . SUBSCRIPTS . chr(0x33) . $back_to_ascii,
            'Subscripts',
        ],
        [
            0x2074,
            ESCAPE . SUPERSCRIPTS . chr(0x34) . $back_to_ascii,
            'Superscripts',
        ],
        [
            0x71AC,
            ESCAPE . MULTI_G0_A . CJK . chr(0x21) . chr(0x49) . chr(0x7C)
                . $back_to_ascii,
            'East Asian',
        ],
    );

    for my $case (@charset_cases) {
        my ($code_point, $expected_marc8, $test_name) = @{$case};
        is(utf8_to_marc8(chr($code_point)), $expected_marc8, $test_name);
    }
}

## COMBINING CHARACTERS
##
## Each row is [ input string, expected MARC-8 string, test name ].
## In the expected strings the bytes for the combining marks sit in front
## of the base letter they followed in the input, keeping their order.
for my $case (
    [
        "c\x{0327}edilla",
        "\x{F0}cedilla",
        'string with interior combining character',
    ],
    [
        "abc\x{0327}\x{0300}\x{0301}def",
        "ab\x{F0}\x{E1}\x{E2}cdef",
        'string with multiple interior combining characters',
    ],
) {
    my ($input, $expected_marc8, $test_name) = @{$case};
    is(utf8_to_marc8($input), $expected_marc8, $test_name);
}


## ESCAPING TO OTHER CHARACTER SETS
##
## Each row is [ input string, expected MARC-8 string, test name ].
{
    ## Sequences that set G0 to a given character set, and the one that
    ## goes back to ASCII at the end of the expected output.
    my $g0_cyrillic   = ESCAPE . SINGLE_G0_A . BASIC_CYRILLIC;
    my $g0_hebrew     = ESCAPE . SINGLE_G0_A . BASIC_HEBREW;
    my $g0_greek      = ESCAPE . SINGLE_G0_A . BASIC_GREEK;
    my $back_to_ascii = ESCAPE . ASCII_DEFAULT;

    my @escaping_cases = (
        [
            chr(0x043A),
            $g0_cyrillic . chr(0x4B) . $back_to_ascii,
            'CYRILLIC SMALL LETTER KA',
        ],
        [
            ## Two sets back to back: the expected string switches G0
            ## straight from Hebrew to Cyrillic, with a single return to
            ## ASCII at the very end.
            chr(0x05D0) . chr(0x043B),
            $g0_hebrew . chr(0x60) . $g0_cyrillic . chr(0x4C) . $back_to_ascii,
            'string with multiple character sets',
        ],
        [
            chr(0x0396) . ' ' . chr(0x0398),
            join(
                '',
                $g0_greek,         ## set G0 to Greek
                chr(0x49),         ## ZETA
                ' ',               ## SPACE
                chr(0x4B),         ## THETA
                $back_to_ascii,    ## Back to ASCII
            ),
            'greek utf8 with an internal space',
        ],
    );

    for my $case (@escaping_cases) {
        my ($input, $expected_marc8, $test_name) = @{$case};
        is(utf8_to_marc8($input), $expected_marc8, $test_name);
    }
}