File: charset.rb

package info (click to toggle)
whatweb 0.4.9-2
  • links: PTS
  • area: main
  • in suites: buster
  • size: 21,188 kB
  • sloc: ruby: 33,652; sh: 614; makefile: 42
file content (114 lines) | stat: -rw-r--r-- 2,979 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
##
# This file is part of WhatWeb and may be subject to
# redistribution and commercial restrictions. Please see the WhatWeb
# web site for more information on licensing and terms of use.
# http://www.morningstarsecurity.com/research/whatweb
##

## Version 0.2
# added example sites

Plugin.define "Charset" do
# Plugin metadata: author, plugin version and the human-readable description.
# NOTE(review): the description says conversion is tested "using Iconv", but no
# Iconv call is visible in this file - confirm and refresh the wording.
author "Andrew Horton"
version "0.2"
description "Detects the character set of a page, this is required for MongoDB output. Only checks the meta content tag, not the HTTP header. It tries the specified charset, then ascii, then utf-8 then uses CharDet from the rchardet gem which is CPU intensive / slow. Tests conversion to UTF-8 using Iconv. Returns Failed if unsuccessful."


# requires rchardet
# https://github.com/jmhodges/rchardet
# http://www.meeho.net/blog/2010/03/ruby-how-to-detect-the-encoding-of-a-string/

# Checks that the CharDet constant (supplied by the rchardet gem, see the
# notes above) is loaded, and reports an error through error() when it is not.
# Returns nil when CharDet is available, otherwise whatever error() returns.
def startup
	return if defined?(CharDet)

	error("ERROR: You need the rchardet gem to use the CharDet plugin to detect anything but ASCII or UTF-8.")
end

# Extracts the character set a page declares in its <meta> tags.
#
# Two declaration forms are recognised, and the first <meta> tag in the
# document that carries either one wins:
#   <meta http-equiv="Content-Type" content="text/html; charset=NAME">
#   <meta charset="NAME">
#
# body - the page HTML as a String (may be nil).
#
# Returns the declared name upper-cased (e.g. "ISO-8859-1"), or nil when the
# body is nil or no declaration is found. The body itself is never modified.
def get_charset(body)
	return nil if body.nil?

	# Regexp matching raises on a string whose bytes are invalid for its tagged
	# encoding, which is the very situation this plugin exists to diagnose.
	# Matching on a binary-tagged copy avoids that and leaves the caller's
	# string alone.
	html = body.dup.force_encoding(Encoding::BINARY)

	html.scan(/<meta[^>]+>/i) do |tag|
		# HTML5 short form: charset is an attribute of the meta tag itself.
		declared = tag[/<meta\s+charset\s*=\s*['"]?([a-zA-Z0-9_-]+)/i, 1]

		# Legacy form: charset lives inside the Content-Type meta tag.
		if declared.nil? && tag =~ /Content-Type/i
			declared = tag[/charset=['"]?([a-zA-Z0-9_-]+)/i, 1]
		end

		# The capture is ASCII-only, so re-tagging it as UTF-8 is always valid.
		return declared.upcase.force_encoding(Encoding::UTF_8) if declared
	end

	nil
end

# Passive check: works out which character set the page body (@body) is in.
#
# Candidates are tried in this order: the charset declared in the page's meta
# tag (via get_charset), then ASCII, then UTF_8. The first one under which the
# body is valid and can be transcoded to UTF-8 is reported. If none fits and
# the CharDet constant is loaded, its guess is verified the same way and
# reported with :module => "CharDet". If nothing fits, one "Failed" match is
# added.
#
# Returns an Array of match hashes (empty when @body is nil).
#
# NOTE(review): this method used to call body.force_encoding('UTF-8'), which
# never raises on bad bytes (so the first candidate was always reported) and
# re-tagged @body in place. @body is now left untouched - confirm nothing
# downstream relied on that re-tagging.
def passive
	m = []
	body = @body
	return m if body.nil?

=begin
		Charset labels commonly declared by pages (reference only):

		Arabic (Windows)	Windows-1256
		Baltic (Windows)	Windows-1257
		Central European (Windows)	Windows-1250
		Cyrillic (Windows)	Windows-1251
		Greek (Windows)	Windows-1253
		Hebrew (Windows)	Windows-1255
		Thai (Windows)	TIS-620
		Turkish (Windows)	Windows-1254
		Vietnamese (Windows)	Windows-1258
		Western European (Windows)	Windows-1252

		Arabic (ISO)	ISO-8859-6
		Baltic (ISO)	ISO-8859-4
		Central European (ISO)	ISO-8859-2
		Cyrillic (ISO)	ISO-8859-5
		Estonian (ISO)	ISO-8859-13
		Greek (ISO)	ISO-8859-7
		Hebrew (ISO-Logical)	ISO-8859-8-l
		Hebrew (ISO-Visual)	ISO-8859-8
		Latin 9 (ISO)	ISO-8859-15
		Turkish (ISO)	ISO-8859-9
		Western European (ISO)	ISO-8859-1

		Chinese Simplified (GB18030)	GB18030
		Chinese Simplified (GB2312)	GB2312
		Chinese Simplified (HZ)	HZ
		Chinese Traditional (Big5)	Big5
		Japanese (Shift-JIS)	Shift_JIS
		Japanese (EUC)	EUC-JP
		Korean	EUC-KR
		Unicode (UTF-8)	UTF-8
=end
		#UTF-7 ISO-8859-5 ISO-8859-1 ISO-2022-JP WINDOWS-1250 IBM852 EUC-JP SHIFT_JIS BIG5 UTF-8 ASCII

	# Declared charset first (when there is one), then the two cheap fallbacks.
	candidates = [get_charset(body), 'ASCII', 'UTF_8'].compact
	confirmed = candidates.find { |name| utf8_convertible?(body, name) }

	if confirmed
		m << {:string=> confirmed}
	elsif defined?(CharDet)
		begin
			guess = CharDet.detect(body)
			encoding = guess['encoding'].upcase
			if utf8_convertible?(body, encoding)
				m << {:string=> encoding, :module=> "CharDet"}
			end
		rescue StandardError
			# Detection is best-effort: a CharDet error or a guess without an
			# encoding simply leaves the result as "Failed" below.
		end
	end

	m << {:name=>"x",:string=> "Failed"} if m.empty?
	m
end

# Reports whether body is valid in the encoding called charset_name and can be
# transcoded from it to UTF-8. Works on a copy, so body is never modified.
# Unknown encoding names and any encoding error count as "no".
def utf8_convertible?(body, charset_name)
	label = charset_name.to_s

	# Labels such as "UTF_8" may not be registered under that spelling, so the
	# hyphenated form is tried as well.
	encoding = nil
	[label, label.tr('_', '-')].uniq.each do |name|
		begin
			encoding = Encoding.find(name)
		rescue ArgumentError
			encoding = nil
		end
		break if encoding
	end
	return false if encoding.nil?

	sample = body.dup.force_encoding(encoding)
	# encode is a no-op when source and target encodings match, so validity
	# has to be checked explicitly before transcoding.
	return false unless sample.valid_encoding?

	sample.encode(Encoding::UTF_8)
	true
rescue EncodingError
	false
end

end