1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
|
Author: Brett Wuth <wuth@castrov.cuug.ab.ca>
Last-Update: 2013-07-09
Bug-Debian: https://bugs.debian.org/715418
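Description: Fix missing character encoding by reading the charset from the HTTP Content-Type response header before falling back to the HTML meta tag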
--- a/lib/WWW/Mediawiki/Client.pm
+++ b/lib/WWW/Mediawiki/Client.pm
@@ -1477,19 +1477,36 @@ sub _get_wiki_text {
}
sub _get_server_encoding {
+ # Determine the character set used by the server.
+ # Assumes the same character set is used on all pages.
+
my ($self) = @_;
+
+ # Get a sample page
my $url = $self->_get_version_url;
my $res = $self->{ua}->get($url);
+
+ # Use the character set in the response header, if provided
+ # (when present it looks like "text/html; charset=UTF-8").
+ # List-context capture: a failed match gives undef, never a stale $1.
+ my $contenttypeheader = $res->header( 'content-type' );
+ $contenttypeheader = '' unless defined $contenttypeheader;
+ my ($charsetheader) = $contenttypeheader =~ m/charset=(.*)/;
+ return $charsetheader if ($charsetheader);
+
+ # No character set defined in the header. Look instead in the content.
my $doc = $res->decoded_content;
my $p = HTML::TokeParser->new(\$doc);
while ( my $t = $p->get_tag("meta") ) {
next unless defined $t->[1]->{'http-equiv'}
- and ($t->[1]->{'http-equiv'} eq 'Content-Type'
- or $t->[1]->{'http-equiv'} eq 'Content-type');
+ and ($t->[1]->{'http-equiv'} eq 'Content-Type'
+ or $t->[1]->{'http-equiv'} eq 'Content-type');
my $cont = $t->[1]->{'content'};
- $cont =~ m/charset=(.*)/;
- return $1;
+ next unless defined $cont and $cont =~ m/charset=(.*)/; # no charset in this tag: try the next one
+ return $1;
}
+ # No character set found anywhere. Return nothing.
+ return;
}
sub _get_page_headline {
|