Author: Brett Wuth <wuth@castrov.cuug.ab.ca>
Last-Update: 2013-07-09
Bug-Debian: https://bugs.debian.org/715418
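Description: Fix missing character encoding
 Determine the server character set from the charset parameter of the
 HTTP Content-Type response header, falling back to the
 <meta http-equiv="Content-Type"> tag in the page content when the
 header does not declare one.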

--- a/lib/WWW/Mediawiki/Client.pm
+++ b/lib/WWW/Mediawiki/Client.pm
@@ -1477,19 +1477,46 @@ sub _get_wiki_text {
 }
 
 sub _get_server_encoding {
+    # Determine the character set used by the server.
+    # Assumes the same character set is used on all pages.
+    # Returns the character set name (e.g. "UTF-8"), or nothing if
+    # neither the response header nor the page content declares one.
+
     my ($self) = @_;
+
+    # Get a sample page
     my $url = $self->_get_version_url;
     my $res = $self->{ua}->get($url);
+
+    # Matches the charset parameter of a Content-Type value such as
+    # "text/html; charset=UTF-8".  The parameter name is matched
+    # case-insensitively, optional quotes are skipped, and the value
+    # stops at whitespace, "," or ";" so trailing parameters are not captured.
+    my $charsetre = qr/charset\s*=\s*["']?([^\s;,"']+)/i;
+
+    # Use the character set in the response header, if provided.
+    # The header may be missing entirely, or may lack a charset parameter;
+    # $1 is only meaningful when the match actually succeeded.
+    my $contenttypeheader = $res->header( 'content-type' );
+    return $1
+      if defined $contenttypeheader and $contenttypeheader =~ $charsetre;
+
+    # No character set defined in the header.  Look instead in the content.
     my $doc = $res->decoded_content;
     my $p = HTML::TokeParser->new(\$doc);
     while ( my $t = $p->get_tag("meta") ) {
         next unless defined $t->[1]->{'http-equiv'}
-     and ($t->[1]->{'http-equiv'} eq 'Content-Type'
-     or $t->[1]->{'http-equiv'} eq 'Content-type');
+          and ($t->[1]->{'http-equiv'} eq 'Content-Type'
+               or $t->[1]->{'http-equiv'} eq 'Content-type');
         my $cont = $t->[1]->{'content'};
-        $cont =~ m/charset=(.*)/;
+        # Skip tags with no content attribute or no charset parameter,
+        # rather than returning an undefined or stale $1.
+        next unless defined $cont and $cont =~ $charsetre;
         return $1;
     }
+
+    # No character set found anywhere.  Return nothing.
+    return;
 }
 
 sub _get_page_headline {
