File: Punycode.pm

package info (click to toggle)
libidna-punycode-perl 0.03-1
links: PTS
area: main
in suites: lenny, squeeze, wheezy
size: 68 kB
ctags: 22
sloc: perl: 140; makefile: 42
file content (240 lines) | stat: -rw-r--r-- 5,486 bytes
package IDNA::Punycode;

use strict;
our $VERSION = 0.03;

require Exporter;
our @ISA	= qw(Exporter);
our @EXPORT = qw(encode_punycode decode_punycode idn_prefix);

use integer;

our $DEBUG = 0;
our $PREFIX = 'xn--';

use constant BASE => 36;
use constant TMIN => 1;
use constant TMAX => 26;
use constant SKEW => 38;
use constant DAMP => 700;
use constant INITIAL_BIAS => 72;
use constant INITIAL_N => 128;

my $Delimiter = chr 0x2D;
my $BasicRE   = qr/[\x00-\x7f]/;

sub _croak { require Carp; Carp::croak(@_); }

sub idn_prefix {
	$PREFIX = shift;
}

sub digit_value {
	my $code = shift;
	return ord($code) - ord("A") if $code =~ /[A-Z]/;
	return ord($code) - ord("a") if $code =~ /[a-z]/;
	return ord($code) - ord("0") + 26 if $code =~ /[0-9]/;
	return;
}

sub code_point {
	my $digit = shift;
	return $digit + ord('a') if 0 <= $digit && $digit <= 25;
	return $digit + ord('0') - 26 if 26 <= $digit && $digit <= 36;
	die 'NOT COME HERE';
}

sub adapt {
	my($delta, $numpoints, $firsttime) = @_;
	$delta = $firsttime ? $delta / DAMP : $delta / 2;
	$delta += $delta / $numpoints;
	my $k = 0;
	while ($delta > ((BASE - TMIN) * TMAX) / 2) {
		$delta /= BASE - TMIN;
		$k += BASE;
	}
	return $k + (((BASE - TMIN + 1) * $delta) / ($delta + SKEW));
}

sub decode_punycode {
	my $code = shift;

	my $n	  = INITIAL_N;
	my $i	  = 0;
	my $bias   = INITIAL_BIAS;
	my @output;

	if ($PREFIX) {
		if ($code !~ /^$PREFIX/) {
			return $code;
		}
		$code =~ s/^$PREFIX//;
	}

	if ($code =~ s/(.*)$Delimiter//o) {
		push @output, map ord, split //, $1;
		return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o;
	}

	while ($code) {
		my $oldi = $i;
		my $w	= 1;
		LOOP:
		for (my $k = BASE; 1; $k += BASE) {
			my $cp = substr($code, 0, 1, '');
			my $digit = digit_value($cp);
			defined $digit or return _croak("invalid punycode input");
			$i += $digit * $w;
			my $t = ($k <= $bias) ? TMIN
			: ($k >= $bias + TMAX) ? TMAX : $k - $bias;
			last LOOP if $digit < $t;
			$w *= (BASE - $t);
		}
		$bias = adapt($i - $oldi, @output + 1, $oldi == 0);
		warn "bias becomes $bias" if $DEBUG;
		$n += $i / (@output + 1);
		$i = $i % (@output + 1);
		splice(@output, $i, 0, $n);
		warn join " ", map sprintf('%04x', $_), @output if $DEBUG;
		$i++;
	}
	return join '', map chr, @output;
}

sub encode_punycode {
	my $input = shift;
	# my @input = split //, $input; # doesn't work in 5.6.x!
	my @input = map substr($input, $_, 1), 0..length($input)-1;

	my $n	 = INITIAL_N;
	my $delta = 0;
	my $bias  = INITIAL_BIAS;

	my @output;
	my @basic = grep /$BasicRE/, @input;
	my $h = my $b = @basic;
	#push @output, @basic, $Delimiter if $b > 0;
	push @output, @basic if $b > 0;
	warn "basic codepoints: (@output)" if $DEBUG;

	if ($h < @input) {
		$PREFIX && unshift(@output, $PREFIX);
		push(@output, $Delimiter);
	} else {
		return join '', @output;
	}

	while ($h < @input) {
		my $m = min(grep { $_ >= $n } map ord, @input);
		warn sprintf "next code point to insert is %04x", $m if $DEBUG;
		$delta += ($m - $n) * ($h + 1);
		$n = $m;
		for my $i (@input) {
			my $c = ord($i);
			$delta++ if $c < $n;
			if ($c == $n) {
				my $q = $delta;
				LOOP:
				for (my $k = BASE; 1; $k += BASE) {
					my $t = ($k <= $bias) ? TMIN :
					($k >= $bias + TMAX) ? TMAX : $k - $bias;
					last LOOP if $q < $t;
					my $cp = code_point($t + (($q - $t) % (BASE - $t)));
					push @output, chr($cp);
					$q = ($q - $t) / (BASE - $t);
				}
				push @output, chr(code_point($q));
				$bias = adapt($delta, $h + 1, $h == $b);
				warn "bias becomes $bias" if $DEBUG;
				$delta = 0;
				$h++;
			}
		}
		$delta++;
		$n++;
	}
	return join '', @output;
}

sub min {
	my $min = shift;
	for (@_) { $min = $_ if $_ <= $min }
	return $min;
}

1;
__END__

=head1 NAME

IDNA::Punycode - encodes Unicode string in Punycode

=head1 SYNOPSIS

  use IDNA::Punycode;
  idn_prefix('xn--');
  $punycode = encode_punycode($unicode);
  $unicode  = decode_punycode($punycode);

=head1 DESCRIPTION

IDNA::Punycode is a module to encode / decode Unicode strings into
Punycode, an efficient encoding of Unicode for use with IDNA.

This module requires Perl 5.6.0 or over to handle UTF8 flagged Unicode
strings.

=head1 FUNCTIONS

This module exports following functions by default.

=over 4

=item encode_punycode

  $punycode = encode_punycode($unicode);

takes Unicode string (UTF8-flagged variable) and returns Punycode
encoding for it.

=item decode_punycode

  $unicode = decode_punycode($punycode)

takes Punycode encoding and returns original Unicode string.

=item idn_prefix

  idn_prefix($prefix);

causes encode_punycode() to add $prefix to ACE-string after conversion.
As a side-effect decode_punycode() will only consider strings
beginning with $prefix as punycode representations.

According to RFC 3490 the ACE prefix "xn--" had been chosen as the
standard.  Thus, "xn--" is also the default ACE prefix.  For compatibility
I'm leaving idn_prefix() in the module.  Use C<idn_prefix(undef)> to
get the old behaviour.

=back

These functions throws exceptionsn on failure. You can catch 'em via
C<eval>.

=head1 AUTHORS

Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt> is the original
author and wrote almost all the code.

Robert Urban E<lt>urban@UNIX-Beratung.deE<gt> added C<idn_prefix()>.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

http://www.ietf.org/internet-drafts/draft-ietf-idn-punycode-01.txt

L<Encode::Punycode>

=cut