File: String.pm

package info (click to toggle)
libunicode-string-perl 2.10-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, stretch
  • size: 716 kB
  • ctags: 59
  • sloc: perl: 17,186; makefile: 3
file content (998 lines) | stat: -rw-r--r-- 22,561 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
package Unicode::String;

# Copyright 1997-1999, Gisle Aas.

use strict;
use vars qw($VERSION @ISA @EXPORT_OK $UTF7_OPTIONAL_DIRECT_CHARS);
use Carp;

require Exporter;
require DynaLoader;
@ISA = qw(Exporter DynaLoader);

@EXPORT_OK = qw(
    utf16 utf16le utf16be ucs2
    utf8
    utf7
    ucs4 utf32 utf32be utf32le
    latin1
    uchr uhex

    byteswap2 byteswap4
);

$VERSION = '2.10';

$UTF7_OPTIONAL_DIRECT_CHARS ||= 1;

bootstrap Unicode::String $VERSION;

use overload '""'   => \&as_string,
	     'bool' => \&as_bool,
	     '0+'   => \&as_num,
	     '.='   => \&append,
             '.'    => \&concat,
             'x'    => \&repeat,
	     '='    => \&copy,
             'fallback' => 1;

my %stringify = (
   unicode => \&utf16,
   utf16   => \&utf16,
   utf16be => \&utf16,
   utf16le => \&utf16le,
   ucs2    => \&utf16,
   utf8    => \&utf8,
   utf7    => \&utf7,
   ucs4    => \&ucs4,
   utf32   => \&ucs4,
   utf32be => \&ucs4,
   utf32le => \&utf32le,
   latin1  => \&latin1,
  'hex'    => \&hex,
);

my $stringify_as = \&utf8;

# some aliases
*ucs2 = \&utf16;
*utf16be = \&utf16;
*utf32 = \&ucs4;
*utf32be = \&ucs4;
*uhex = \&hex;
*uchr = \&chr;

sub new
{
    #_dump_arg("new", @_);
    my $class = shift;
    my $str;
    my $self = bless \$str, $class;
    &$stringify_as($self, shift) if @_;
    $self;
}


sub repeat
{
    my($self, $count) = @_;
    my $class = ref($self);
    my $str = $$self x $count;
    bless \$str, $class;
}


sub _dump_arg
{
    my $func = shift;
    print "$func(";
    print join(",", map { if (defined $_) {
                             my $x = overload::StrVal($_);
			     $x =~ s/\n/\\n/g;
			     $x = '""' unless length $x;
			     $x;
			 } else {
			     "undef"
			 }
                        } @_);
    print ")\n";
}


sub concat
{
    #_dump_arg("concat", @_);
    my($self, $other, $reversed) = @_;
    my $class = ref($self);
    unless (UNIVERSAL::isa($other, 'Unicode::String')) {
	$other = Unicode::String->new($other);
    }
    my $str = $reversed ? $$other . $$self : $$self . $$other;
    bless \$str, $class;
}


sub append
{
    #_dump_arg("append", @_);
    my($self, $other) = @_;
    unless (UNIVERSAL::isa($other, 'Unicode::String')) {
	$other = Unicode::String->new($other);
    }
    $$self .= $$other;
    $self;
}


sub copy
{
    my($self) = @_;
    my $class = ref($self);
    my $copy = $$self;
    bless \$copy, $class;
}


sub as_string
{
    #_dump_arg("as_string", @_);
    &$stringify_as($_[0]);
}


sub as_bool
{
    # This is different from perl's normal behaviour by not letting
    # a U+0030  ("0") be false.
    my $self = shift;
    $$self ? 1 : "";
}


sub as_num
{
    # Should be able to use the numeric property from Unidata
    # in order to parse a large number of numbers.  Currently we
    # only convert it to a plain string and let perl's normal
    # num-converter do the job.
    my $self = shift;
    my $str = $self->utf8;
    $str + 0;
}


sub stringify_as
{
    my $class;
    if (@_ > 1) {
	$class = shift;
	$class = ref($class) if ref($class);
    } else {
	$class = "Unicode::String";
    }
    my $old = $stringify_as;
    if (@_) {
	my $as = shift;
	croak("Don't know how to stringify as '$as'")
	    unless exists $stringify{$as};
	$stringify_as = $stringify{$as};
    }
    $old;
}


sub utf16
{
    my $self = shift;
    unless (ref $self) {
	my $u = new Unicode::String;
	$u->utf16($self);
	return $u;
    }
    my $old = $$self;
    if (@_) {
	$$self = shift;
	if ((length($$self) % 2) != 0) {
	    warn "Uneven UTF16 data" if $^W;
	    $$self .= "\0";
	}
	if ($$self =~ /^\xFF\xFE/) {
	    # the string needs byte swapping
	    $$self = byteswap2($$self);
	}
    }
    $old;
}


sub utf16le
{
    my $self = shift;
    unless (ref $self) {
	my $u = new Unicode::String;
	$u->utf16(byteswap2($self));
	return $u;
    }
    my $old = byteswap2($$self);
    if (@_) {
        $self->utf16(byteswap2(shift));
    }
    $old;
}


sub utf32le
{
    my $self = shift;
    unless (ref $self) {
	my $u = new Unicode::String;
	$u->ucs4(byteswap4($self));
	return $u;
    }
    my $old = byteswap4($self->ucs4);
    if (@_) {
        $self->ucs4(byteswap4(shift));
    }
    $old;
}


sub utf7   # rfc1642
{
    my $self = shift;
    unless (ref $self) {
	# act as ctor
	my $u = new Unicode::String;
	$u->utf7($self);
	return $u;
    }
    my $old;
    if (defined wantarray) {
	# encode into $old
	$old = "";
	pos($$self) = 0;
	my $len = length($$self);
	while (pos($$self) < $len) {
            if (($UTF7_OPTIONAL_DIRECT_CHARS &&
		 $$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gc)
	        || $$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gc)
            {
		#print "Plain ", utf16($1)->latin1, "\n";
		$old .= utf16($1)->latin1;
	    }
            elsif (($UTF7_OPTIONAL_DIRECT_CHARS &&
                    $$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gsc)
                   || $$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gsc)
            {
		#print "Unplain ", utf16($1)->hex, "\n";
		if ($1 eq "\0+") {
		    $old .= "+-";
		} else {
		    require MIME::Base64;
		    my $base64 = MIME::Base64::encode($1, '');
		    $base64 =~ s/=+$//;
		    $old .= "+$base64-";
		    # XXX should we determine when the final "-" is
		    # unnecessary? depends on next char not being part
		    # of the base64 char set.
		}
	    } else {
		die "This should not happen, pos=" . pos($$self) .
                                            ":  "  . $self->hex . "\n";
	    }
	}
    }

    if (@_) {
	# decode
	my $len = length($_[0]);
	$$self = "";
	pos($_[0]) = 0;
	while (pos($_[0]) < $len) {
	    if ($_[0] =~ /\G([^+]+)/gc) {
		$self->append(latin1($1));
	    } elsif ($_[0] =~ /\G\+-/gc) {
		$$self .= "\0+";
	    } elsif ($_[0] =~ /\G\+([A-Za-z0-9+\/]+)-?/gc) {
		my $base64 = $1;
		my $pad = length($base64) % 4;
		$base64 .= "=" x (4 - $pad) if $pad;
		require MIME::Base64;
		$$self .= MIME::Base64::decode($base64);
		if ((length($$self) % 2) != 0) {
		    warn "Uneven UTF7 base64-data" if $^W;
		    chop($$self); # correct it
		}
            } elsif ($_[0] =~ /\G\+/gc) {
		warn "Bad UTF7 data escape" if $^W;
		$$self .= "\0+";
	    } else {
		die "This should not happen " . pos($_[0]);
	    }
	}
    }
    $old;
}


sub hex
{
    my $self = shift;
    unless (ref $self) {
	my $u = new Unicode::String;
	$u->hex($self);
	return $u;
    }
    my $old;
    if (defined($$self) && defined wantarray) {
	$old = unpack("H*", $$self);
	$old =~ s/(....)/U+$1 /g;
	$old =~ s/\s+$//;
    }
    if (@_) {
	my $new = shift;
	$new =~ tr/0-9A-Fa-f//cd;  # leave only hex chars
	croak("Hex string length must be multiple of four")
	    unless (length($new) % 4) == 0;
	$$self = pack("H*", $new);
    }
    $old;
}


sub length
{
    my $self = shift;
    int(length($$self) / 2);
}

sub byteswap
{
   my $self = shift;
   byteswap2($$self);
   $self;
}

sub unpack
{
    my $self = shift;
    unpack("n*", $$self)
}


sub pack
{
    my $self = shift;
    $$self = pack("n*", @_);
    $self;
}


sub ord
{
    my $self = shift;
    return () unless defined $$self;

    my $array = wantarray;
    my @ret;
    my @chars;
    if ($array) {
        @chars = CORE::unpack("n*", $$self);
    } else {
	@chars = CORE::unpack("n2", $$self);
    }

    while (@chars) {
	my $first = shift(@chars);
	if ($first >= 0xD800 && $first <= 0xDFFF) { 	# surrogate
	    my $second = shift(@chars);
	    #print "F=$first S=$second\n";
	    if ($first >= 0xDC00 || $second < 0xDC00 || $second > 0xDFFF) {
		carp(sprintf("Bad surrogate pair (U+%04x U+%04x)",
			     $first, $second));
		unshift(@chars, $second);
		next;
	    }
	    push(@ret, ($first-0xD800)*0x400 + ($second-0xDC00) + 0x10000);
	} else {
	    push(@ret, $first);
	}
	last unless $array;
    }
    $array ? @ret : $ret[0];
}


sub name
{
    my $self = shift;
    require Unicode::CharName;
    if (wantarray) {
	return map { Unicode::CharName::uname($_) } $self->ord;
    } else {
        return Unicode::CharName::uname(scalar($self->ord));
    }
}


sub chr
{
    my($self,$val) = @_;
    unless (ref $self) {
	# act as ctor
	my $u = new Unicode::String;
	return $u->uchr($self);
    }
    if ($val > 0xFFFF) {
	# must be represented by a surrogate pair
	return undef if $val > 0x10FFFF;  # Unicode limit
	$val -= 0x10000;
	my $h = int($val / 0x400) + 0xD800;
	my $l = ($val % 0x400) + 0xDC00;
	$$self = CORE::pack("n2", $h, $l);
    } else {
	$$self = CORE::pack("n", $val);
    }
    $self;
}


sub substr
{
    my($self, $offset, $length, $substitute) = @_;
    $offset ||= 0;
    $offset *= 2;
    my $substr;
    if (defined $substitute) {
	unless (UNIVERSAL::isa($substitute, 'Unicode::String')) {
	    $substitute = Unicode::String->new($substitute);
	}
	if (defined $length) {
	    $substr = substr($$self, $offset, $length*2) = $$substitute;
	} else {
	    $substr = substr($$self, $offset) = $$substitute;
	}
    } else {
	if (defined $length) {
	    $substr = substr($$self, $offset, $length*2);
	} else {
	    $substr = substr($$self, $offset);
	}
    }
    bless \$substr, ref($self);
}


sub index
{
    my($self, $other, $pos) = @_;
    $pos ||= 0;
    $pos *= 2;
    $other = Unicode::String->new($other) unless ref($other);
    $pos++ while ($pos = index($$self, $$other, $pos)) > 0 && ($pos%2) != 0;
    $pos /= 2 if $pos > 0;
    $pos;
}


sub rindex
{
    my($self, $other, $pos) = @_;
    $pos ||= 0;
    die "NYI";
}


sub chop
{
    my $self = shift;
    if (CORE::length $$self) {
	my $chop = chop($$self);
	$chop = chop($$self) . $chop;
	return bless \$chop, ref($self);
    }
    undef;
}


# XXX: Ideas to be implemented
sub scan;
sub reverse;

sub lc;
sub lcfirst;
sub uc;
sub ucfirst;

sub split;
sub sprintf;
sub study;
sub tr;


1;

__END__

=head1 NAME

Unicode::String - String of Unicode characters (UTF-16BE)

=head1 SYNOPSIS

 use Unicode::String qw(utf8 latin1 utf16be);

 $u = utf8("string");
 $u = latin1("string");
 $u = utf16be("\0s\0t\0r\0i\0n\0g");

 print $u->utf32be;   # 4 byte characters
 print $u->utf16le;   # 2 byte characters + surrogates
 print $u->utf8;      # 1-4 byte characters

=head1 DESCRIPTION

A C<Unicode::String> object represents a sequence of Unicode
characters.  Methods are provided to convert between various external
formats (encodings) and C<Unicode::String> objects, and methods are
provided for common string manipulations.

The functions utf32be(), utf32le(), utf16be(), utf16le(), utf8(),
utf7(), latin1(), uhex(), uchr() can be imported from the
C<Unicode::String> module and will work as constructors initializing
strings of the corresponding encoding.

The C<Unicode::String> objects overload various operators, which means
that they in most cases can be treated like plain strings.

Internally a C<Unicode::String> object is represented by a string of 2
byte numbers in network byte order (big-endian). This representation
is not visible by the API provided, but it might be useful to know in
order to predict the efficiency of the provided methods.

=head2 METHODS

=head2 Class methods

The following class methods are available:

=over 4

=item Unicode::String->stringify_as

=item Unicode::String->stringify_as( $enc )

This method is used to specify which encoding will be used when
C<Unicode::String> objects are implicitly converted to and from plain
strings.

If an argument is provided it sets the current encoding.  The argument
should have one of the following: "ucs4", "utf32", "utf32be",
"utf32le", "ucs2", "utf16", "utf16be", "utf16le", "utf8", "utf7",
"latin1" or "hex".  The default is "utf8".

The stringify_as() method returns a reference to the current encoding
function.

=item $us = Unicode::String->new

=item $us = Unicode::String->new( $initial_value )

This is the object constructor.  Without argument, it creates an empty
C<Unicode::String> object.  If an $initial_value argument is given, it
is decoded according to the specified stringify_as() encoding, UTF-8
by default.

In general it is recommended to import and use one of the encoding
specific constructor functions instead of invoking this method.

=back

=head2 Encoding methods

These methods get or set the value of the C<Unicode::String> object by
passing strings in the corresponding encoding.  If a new value is
passed as argument it will set the value of the C<Unicode::String>,
and the previous value is returned.  If no argument is passed then the
current value is returned.

=encoding UTF-16

To illustrate the encodings we show how the 2 character sample string
of "m" (micro meter) is encoded for each one.

=over 4

=item $us->utf32be

=item $us->utf32be( $newval )

The string passed should be in the UTF-32 encoding with bytes in big
endian order.  The sample "m" is "\0\0\0\xB5\0\0\0m" in this encoding.

Alternative names for this method are utf32() and ucs4().

=item $us->utf32le

=item $us->utf32le( $newval )

The string passed should be in the UTF-32 encoding with bytes in little
endian order.  The sample "m" is is "\xB5\0\0\0m\0\0\0" in this encoding.

=item $us->utf16be

=item $us->utf16be( $newval )

The string passed should be in the UTF-16 encoding with bytes in big
endian order. The sample "m" is "\0\xB5\0m" in this encoding.

Alternative names for this method are utf16() and ucs2().

If the string passed to utf16be() starts with the Unicode byte order
mark in little endian order, the result is as if utf16le() was called
instead.

=item $us->utf16le

=item $us->utf16le( $newval )

The string passed should be in the UTF-16 encoding with bytes in
little endian order.  The sample "m" is is "\xB5\0m\0" in this
encoding.  This is the encoding used by the Microsoft Windows API.

If the string passed to utf16le() starts with the Unicode byte order
mark in big endian order, the result is as if utf16le() was called
instead.

=item $us->utf8

=item $us->utf8( $newval )

The string passed should be in the UTF-8 encoding. The sample "m" is
"\xC2\xB5m" in this encoding.

=item $us->utf7

=item $us->utf7( $newval )

The string passed should be in the UTF-7 encoding. The sample "m" is
"+ALU-m" in this encoding.


The UTF-7 encoding only use plain US-ASCII characters for the
encoding.  This makes it safe for transport through 8-bit stripping
protocols.  Characters outside the US-ASCII range are base64-encoded
and '+' is used as an escape character.  The UTF-7 encoding is
described in RFC 1642.

If the (global) variable $Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS
is TRUE, then a wider range of characters are encoded as themselves.
It is even TRUE by default.  The characters affected by this are:

   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }

=item $us->latin1

=item $us->latin1( $newval )

The string passed should be in the ISO-8859-1 encoding. The sample "m" is
"\xB5m" in this encoding.

Characters outside the "\x00" .. "\xFF" range are simply removed from
the return value of the latin1() method.  If you want more control
over the mapping from Unicode to ISO-8859-1, use the C<Unicode::Map8>
class.  This is also the way to deal with other 8-bit character sets.

=item $us->hex

=item $us->hex( $newval )

The string passed should be plain ASCII where each Unicode character
is represented by the "U+XXXX" string and separated by a single space
character.  The "U+" prefix is optional when setting the value.  The
sample "m" is "U+00b5 U+006d" in this encoding.

=back

=head2 String Operations

The following methods are available:

=over 4

=item $us->as_string

Converts a C<Unicode::String> to a plain string according to the
setting of stringify_as().  The default stringify_as() encoding is
"utf8".

=item $us->as_num

Converts a C<Unicode::String> to a number.  Currently only the digits
in the range 0x30 .. 0x39 are recognized.  The plan is to eventually
support all Unicode digit characters.

=item $us->as_bool

Converts a C<Unicode::String> to a boolean value.  Only the empty
string is FALSE.  A string consisting of only the character U+0030 is
considered TRUE, even if Perl consider "0" to be FALSE.

=item $us->repeat( $count )

Returns a new C<Unicode::String> where the content of $us is repeated
$count times.  This operation is also overloaded as:

  $us x $count

=item $us->concat( $other_string )

Concatenates the string $us and the string $other_string.  If
$other_string is not an C<Unicode::String> object, then it is first
passed to the Unicode::String->new constructor function.  This
operation is also overloaded as:

  $us . $other_string


=item $us->append( $other_string )

Appends the string $other_string to the value of $us.  If
$other_string is not an C<Unicode::String> object, then it is first
passed to the Unicode::String->new constructor function.  This
operation is also overloaded as:

  $us .= $other_string

=item $us->copy

Returns a copy of the current C<Unicode::String> object.  This
operation is overloaded as the assignment operator.

=item $us->length

Returns the length of the C<Unicode::String>.  Surrogate pairs are
still counted as 2.

=item $us->byteswap

This method will swap the bytes in the internal representation of the
C<Unicode::String> object.

Unicode reserve the character U+FEFF character as a byte order mark.
This works because the swapped character, U+FFFE, is reserved to not
be valid.  For strings that have the byte order mark as the first
character, we can guaranty to get the byte order right with the
following code:

   $ustr->byteswap if $ustr->ord == 0xFFFE;

=item $us->unpack

Returns a list of integers each representing an UCS-2 character code.

=item $us->pack( @uchr )

Sets the value of $us as a sequence of UCS-2 characters with the
characters codes given as parameter.

=item $us->ord

Returns the character code of the first character in $us.  The ord()
method deals with surrogate pairs, which gives us a result-range of
0x0 .. 0x10FFFF.  If the $us string is empty, undef is returned.

=item $us->chr( $code )

Sets the value of $us to be a string containing the character assigned
code $code.  The argument $code must be an integer in the range 0x0
.. 0x10FFFF.  If the code is greater than 0xFFFF then a surrogate pair
created.

=item $us->name

In scalar context returns the official Unicode name of the first
character in $us.  In array context returns the name of all characters
in $us.  Also see L<Unicode::CharName>.

=item $us->substr( $offset )

=item $us->substr( $offset, $length )

=item $us->substr( $offset, $length, $subst )

Returns a sub-string of $us.  Works similar to the builtin substr()
function.

=item $us->index( $other )

=item $us->index( $other, $pos )

Locates the position of $other within $us, possibly starting the
search at position $pos.

=item $us->chop

Chops off the last character of $us and returns it (as a
C<Unicode::String> object).

=back

=head1 FUNCTIONS

The following functions are provided.  None of these are exported by default.

=over 4

=item byteswap2( $str, ... )

This function will swap 2 and 2 bytes in the strings passed as
arguments.  If this function is called in void context,
then it will modify its arguments in-place.  Otherwise, the swapped
strings are returned.

=item byteswap4( $str, ... )

The byteswap4 function works similar to byteswap2, but will reverse
the order of 4 and 4 bytes.

=item latin1( $str )

=item utf7( $str )

=item utf8( $str )

=item utf16le( $str )

=item utf16be( $str )

=item utf32le( $str )

=item utf32be( $str )

Constructor functions for the various Unicode encodings.  These return
new C<Unicode::String> objects.  The provided argument should be
encoded correspondingly.

=item uhex( $str )

Constructs a new C<Unicode::String> object from a string of hex
values.  See hex() method above for description of the format.

=item uchar( $num )

Constructs a new one character C<Unicode::String> object from a
Unicode character code.  This works similar to perl's builtin chr()
function.

=back

=head1 SEE ALSO

L<Unicode::CharName>,
L<Unicode::Map8>

L<http://www.unicode.org/>

L<perlunicode>

=head1 COPYRIGHT

Copyright 1997-2000,2005 Gisle Aas.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut


#
# Some old code that is not used any more (because the methods are
# now implemented as XS) and which I did not want to throw away yet.
#

sub ucs4_inperl
{
    my $self = shift;
    unless (ref $self) {
	my $u = new Unicode::String;
	$u->ucs4($self);
	return $u;
    }
    my $old = pack("N*", $self->ord);
    if (@_) {
	$$self = "";
	for (unpack("N*", shift)) {
	    $self->append(uchr($_));
	}
    }
    $old;
}


sub utf8_inperl
{
    my $self = shift;
    unless (ref $self) {
	# act as ctor
	my $u = new Unicode::String;
	$u->utf8($self);
	return $u;
    }

    my $old;
    if (defined($$self) && defined wantarray) {
	# encode UTF-8
	my $uc;
	for $uc (unpack("n*", $$self)) {
	    if ($uc < 0x80) {
		# 1 byte representation
		$old .= chr($uc);
	    } elsif ($uc < 0x800) {
		# 2 byte representation
		$old .= chr(0xC0 | ($uc >> 6)) .
                        chr(0x80 | ($uc & 0x3F));
	    } else {
		# 3 byte representation
		$old .= chr(0xE0 | ($uc >> 12)) .
		        chr(0x80 | (($uc >> 6) & 0x3F)) .
			chr(0x80 | ($uc & 0x3F));
	    }
	}
    }

    if (@_) {
	if (defined $_[0]) {
	    $$self = "";
	    my $bytes = shift;
	    $bytes =~ s/^[\200-\277]+//;  # can't start with 10xxxxxx
	    while (length $bytes) {
		if ($bytes =~ s/^([\000-\177]+)//) {
		    $$self .= pack("n*", unpack("C*", $1));
		} elsif ($bytes =~ s/^([\300-\337])([\200-\277])//) {
		    my($b1,$b2) = (ord($1), ord($2));
		    $$self .= pack("n", (($b1 & 0x1F) << 6) | ($b2 & 0x3F));
		} elsif ($bytes =~ s/^([\340-\357])([\200-\277])([\200-\277])//) {
		    my($b1,$b2,$b3) = (ord($1), ord($2), ord($3));
		    $$self .= pack("n", (($b1 & 0x0F) << 12) |
                                        (($b2 & 0x3F) <<  6) |
				         ($b3 & 0x3F));
		} else {
		    croak "Bad UTF-8 data";
		}
	    }
	} else {
	    $$self = undef;
	}
    }

    $old;
}




sub latin1_inperl
{
    my $self = shift;
    unless (ref $self) {
	# act as ctor
	my $u = new Unicode::String;
	$u->latin1($self);
	return $u;
    }

    my $old;
    # XXX: should really check that none of the chars > 256
    $old = pack("C*", unpack("n*", $$self)) if defined $$self;

    if (@_) {
	# set the value
	if (defined $_[0]) {
	    $$self = pack("n*", unpack("C*", $_[0]));
	} else {
	    $$self = undef;
	}
    }
    $old;
}