1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
|
##
# This file is part of WhatWeb and may be subject to
# redistribution and commercial restrictions. Please see the WhatWeb
# web site for more information on licensing and terms of use.
# http://www.morningstarsecurity.com/research/whatweb
##
## Version 0.2
# added example sites
Plugin.define "Charset" do
author "Andrew Horton"
version "0.2"
description "Detects the character set of a page, this is required for MongoDB output. Only checks the meta content tag, not the HTTP header. It tries the specified charset, then ascii, then utf-8 then uses CharDet from the rchardet gem which is CPU intensive / slow. Tests conversion to UTF-8 using Iconv. Returns Failed if unsuccessful."
# requires rchardet
# https://github.com/jmhodges/rchardet
# http://www.meeho.net/blog/2010/03/ruby-how-to-detect-the-encoding-of-a-string/
# Startup hook: reports an error through the framework's `error` helper when
# the CharDet constant (provided by the rchardet gem) has not been loaded.
# Nothing is reported when CharDet is available.
def startup
  return nil if defined?(CharDet)

  error("ERROR: You need the rchardet gem to use the CharDet plugin to detect anything but ASCII or UTF-8.")
end
# Extracts the character set declared in the page's <meta> tags.
#
# Two declaration styles are recognised, in this order:
#   1. the first <meta ... Content-Type ...> tag, when it carries charset=...
#      (e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8">)
#   2. otherwise, the first <meta> tag with a charset attribute
#      (HTML5 short form, e.g. <meta charset="utf-8">)
#
# body - the page HTML as a String (nil is tolerated).
#
# Returns the charset name upper-cased, or nil when nothing is declared.
# The body passed in is not modified.
def get_charset(body)
  return nil if body.nil?

  # Regexp matching can raise ArgumentError on a String whose bytes are not
  # valid for its tagged encoding, so search a scrubbed copy in that case.
  html = body.valid_encoding? ? body : body.scrub

  charset_attr = /charset=['"]?([a-zA-Z0-9_-]+)/i

  # Legacy form: only the first Content-Type meta tag is consulted.
  content_type_tag = html[/<meta[^>]+Content-Type[^>]+>/i]
  declared = content_type_tag && content_type_tag[charset_attr, 1]

  # HTML5 short form, used only when the legacy form yielded nothing.
  declared ||= html[/<meta\s[^>]*?\bcharset\s*=\s*['"]?([a-zA-Z0-9_-]+)/i, 1]

  declared && declared.upcase
end
# Helper for #passive: true when +body+, interpreted as +charset+, is
# well-formed and can be transcoded to UTF-8. The check runs on a copy, so
# the encoding tag of +body+ itself is left alone.
def charset_converts_to_utf8?(body, charset)
  return false unless body.respond_to?(:force_encoding)

  probe = body.dup.force_encoding(charset)
  # String#encode does no work when source and target are both UTF-8, so
  # well-formedness has to be checked explicitly.
  return false unless probe.valid_encoding?

  probe.encode('UTF-8')
  true
rescue ArgumentError, EncodingError
  # ArgumentError: unknown encoding name; EncodingError: bytes that cannot be
  # converted, or no converter available. Either way the charset does not fit.
  false
end

# Passive check: reports the character set of the page body (@body).
#
# Candidates are tried in this order: the charset returned by get_charset
# (if any), then ASCII, then UTF-8. The first candidate under which the body
# is well-formed and transcodes to UTF-8 is reported as {:string => name}.
#
# If no candidate fits and the CharDet constant is defined, CharDet's guess is
# verified the same way and reported with :module => "CharDet".
#
# If nothing fits, exactly one {:name => "x", :string => "Failed"} entry is
# returned.
#
# Returns an Array of match hashes.
def passive
  m = []
  body = @body

  # Reference list of charset labels kept from the original source
  # (display name followed by label):
  #   Arabic (Windows) Windows-1256
  #   Baltic (Windows) Windows-1257
  #   Central European (Windows) Windows-1250
  #   Cyrillic (Windows) Windows-1251
  #   Greek (Windows) Windows-1253
  #   Hebrew (Windows) Windows-1255
  #   Thai (Windows) TIS-620
  #   Turkish (Windows) Windows-1254
  #   Vietnamese (Windows) Windows-1258
  #   Western European (Windows) Windows-1252
  #   Arabic (ISO) ISO-8859-6
  #   Baltic (ISO) ISO-8859-4
  #   Central European (ISO) ISO-8859-2
  #   Cyrillic (ISO) ISO-8859-5
  #   Estonian (ISO) ISO-8859-13
  #   Greek (ISO) ISO-8859-7
  #   Hebrew (ISO-Logical) ISO-8859-8-l
  #   Hebrew (ISO-Visual) ISO-8859-8
  #   Latin 9 (ISO) ISO-8859-15
  #   Turkish (ISO) ISO-8859-9
  #   Western European (ISO) ISO-8859-1
  #   Chinese Simplified (GB18030) GB18030
  #   Chinese Simplified (GB2312) GB2312
  #   Chinese Simplified (HZ) HZ
  #   Chinese Traditional (Big5) Big5
  #   Japanese (Shift-JIS) Shift_JIS
  #   Japanese (EUC) EUC-JP
  #   Korean EUC-KR
  #   Unicode (UTF-8) UTF-8
  #
  # UTF-7 ISO-8859-5 ISO-8859-1 ISO-2022-JP WINDOWS-1250 IBM852 EUC-JP SHIFT_JIS BIG5 UTF-8 ASCII

  # Declared charset first, then the cheap fallbacks. "UTF-8" (with a hyphen)
  # is the spelling Ruby's Encoding lookup accepts.
  candidates = [get_charset(body), 'ASCII', 'UTF-8'].compact.uniq

  fitting = candidates.find { |name| charset_converts_to_utf8?(body, name) }
  if fitting
    m << {:string => fitting}
    return m
  end

  # Slow path, attempted once and only after every cheap candidate failed.
  if defined?(CharDet)
    begin
      guess = CharDet.detect(body)['encoding']
      guess = guess.upcase if guess
    rescue StandardError
      # CharDet is third-party and best-effort: any failure means "no guess".
      guess = nil
    end

    if guess && charset_converts_to_utf8?(body, guess)
      m << {:string => guess, :module => "CharDet"}
      return m
    end
  end

  m << {:name => "x", :string => "Failed"}
  m
end
end
|