File: 0009-wuth06-get_server_encoding.patch

package info (click to toggle)
libwww-mediawiki-client-perl 0.31-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,132 kB
  • sloc: perl: 1,672; makefile: 14
file content (46 lines) | stat: -rw-r--r-- 1,573 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
Author: Brett Wuth <wuth@castrov.cuug.ab.ca>
Last-Update: 2013-07-09
Bug-Debian: https://bugs.debian.org/715418
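Description: Fix missing character encoding
 Take the charset from the HTTP Content-Type response header when present,
 and only fall back to the <meta http-equiv> tag in the page content.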

--- a/lib/WWW/Mediawiki/Client.pm
+++ b/lib/WWW/Mediawiki/Client.pm
@@ -1477,19 +1477,44 @@ sub _get_wiki_text {
 }
 
 sub _get_server_encoding {
+    # Determine the character set used by the server.
+    # Assumes the same character set is used on all pages.
+    # Returns the charset name, or nothing if none can be determined.
+
     my ($self) = @_;
+
+    # Get a sample page
     my $url = $self->_get_version_url;
     my $res = $self->{ua}->get($url);
+
+    # Matches the charset parameter of a Content-Type value.  The name is
+    # case-insensitive, the value may be quoted, and the capture stops at
+    # whitespace or ';' so that any following parameter is left out.
+    my $charset_re = qr/charset\s*=\s*["']?([^\s;"']+)/i;
+
+    # Use the character set in the response header, if provided.
+    # If defined it will be like: "text/html; charset=UTF-8"
+    my $contenttypeheader = $res->header( 'content-type' );
+    if ( defined $contenttypeheader
+         and $contenttypeheader =~ $charset_re ) {
+        return $1;
+    }
+
+    # No character set defined in the header.  Look instead in the content.
     my $doc = $res->decoded_content;
     my $p = HTML::TokeParser->new(\$doc);
     while ( my $t = $p->get_tag("meta") ) {
         next unless defined $t->[1]->{'http-equiv'}
-     and ($t->[1]->{'http-equiv'} eq 'Content-Type'
-     or $t->[1]->{'http-equiv'} eq 'Content-type');
+          and ($t->[1]->{'http-equiv'} eq 'Content-Type'
+               or $t->[1]->{'http-equiv'} eq 'Content-type');
         my $cont = $t->[1]->{'content'};
-        $cont =~ m/charset=(.*)/;
-        return $1;
+        # $1 is only meaningful after a successful match; a tag without
+        # a charset is skipped rather than returning a stale capture.
+        return $1 if defined $cont and $cont =~ $charset_re;
     }
+
+    # No character set found anywhere.  Return nothing.
+    return;
 }
 
 sub _get_page_headline {