File: universaldetector.rb

package info (click to toggle)
ruby-rchardet 1.3-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd, wheezy
  • size: 664 kB
  • ctags: 246
  • sloc: ruby: 5,839; makefile: 3
file content (166 lines) | stat: -rw-r--r-- 5,205 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jeff Hodges - port to Ruby
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# 
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

module CharDet
  # A prober's confidence must exceed this value for UniversalDetector#close
  # to report its guess.
  MINIMUM_THRESHOLD = 0.20

  # Input states tracked by UniversalDetector while data is being fed.
  EPureAscii = 0 # no high-bit byte and no escape sequence seen so far
  EEscAscii = 1  # an ESC byte or the sequence "~{" was seen in 7-bit data
  EHighbyte = 2  # at least one byte in the range 0x80-0xFF was seen

  # Incremental character-set detector.
  #
  # Typical use: call #feed with successive chunks of data, then #close,
  # then read #result, which is a Hash with the keys 'encoding' (String or
  # nil) and 'confidence' (Float).
  #
  # The prober classes (EscCharSetProber, MBCSGroupProber, SBCSGroupProber,
  # Latin1Prober) and the EFoundIt constant are not defined in this file;
  # they are resolved at run time when the corresponding input state is hit.
  class UniversalDetector
    attr_accessor :result

    # Byte-order marks checked on the first chunk, in priority order.  The
    # four-byte signatures come before the two-byte ones because FF FE is a
    # prefix of FF FE 00 00.
    BOM_SIGNATURES = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],                  # EF BB BF
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],               # FF FE 00 00
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],               # 00 00 FE FF
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"], # unusual octet order
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"], # unusual octet order
      [[0xFF, 0xFE],             "UTF-16LE"],               # FF FE
      [[0xFE, 0xFF],             "UTF-16BE"]                # FE FF
    ].freeze

    # Builds the detector with no probers instantiated and resets its state.
    def initialize
      # The /n flag makes the pattern byte-oriented.  Without it Ruby 1.9+
      # rejects the \x80-\xFF range in a UTF-8 source file, and the pattern
      # could not be applied to arbitrary input bytes.
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /(\033|\~\{)/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-stream state, and resets every prober
    # that has been created so far, so the detector can be reused.
    def reset
      @result = {'encoding' => nil, 'confidence' => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = EPureAscii
      @_mLastChar = ''
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of data to the detector.
    #
    # aBuf:: String holding the next chunk.  Empty chunks are ignored.
    #
    # Does nothing once a result has been settled.  The return value is
    # always nil; read #result (after #close) for the outcome.
    def feed(aBuf)
      return if @done
      # An empty chunk carries no information.  It must not count as
      # "data received" nor clobber the remembered last byte.
      return if aBuf.empty?

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        bom_encoding = bom_encoding_of(aBuf)
        @result = {'encoding' => bom_encoding, 'confidence' => 1.0} if bom_encoding
      end

      @_mGotData = true
      if @result['encoding'] && (@result['confidence'] > 0.0)
        @done = true
        return
      end

      # The state machine below inspects raw bytes, independent of whatever
      # encoding tag the caller's string carries.
      bytes = binary_copy(aBuf)
      if @_mInputState == EPureAscii
        if @_highBitDetector =~ bytes
          @_mInputState = EHighbyte
        elsif @_escDetector =~ (@_mLastChar + bytes)
          # The previous chunk's last byte is prepended so that a "~{"
          # split across two chunks is still recognised.
          @_mInputState = EEscAscii
        end
      end
      @_mLastChar = bytes[-1..-1]

      case @_mInputState
      when EEscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        if @_mEscCharSetProber.feed(aBuf) == EFoundIt
          @result = {'encoding' => @_mEscCharSetProber.get_charset_name,
                     'confidence' => @_mEscCharSetProber.get_confidence}
          @done = true
        end
      when EHighbyte
        if @_mCharSetProbers.nil? || @_mCharSetProbers.empty?
          @_mCharSetProbers = [MBCSGroupProber.new, SBCSGroupProber.new, Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          next unless prober.feed(aBuf) == EFoundIt
          @result = {'encoding' => prober.get_charset_name,
                     'confidence' => prober.get_confidence}
          @done = true
          break
        end
      end
      nil
    end

    # Signals the end of input and settles #result.
    #
    # Returns the result Hash when this call itself determines the charset
    # (pure ASCII input, or a high-byte prober above MINIMUM_THRESHOLD).
    # Returns nil when detection had already finished, when no data was
    # ever fed, or when no prober was confident enough; in those cases
    # #result is left as it was.
    def close
      return if @done
      unless @_mGotData
        $stderr << "no data received!\n" if $debug
        return
      end
      @done = true

      if @_mInputState == EPureAscii
        @result = {'encoding' => 'ascii', 'confidence' => 1.0}
        return @result
      end

      if @_mInputState == EHighbyte
        # Query each prober's confidence once and keep the first prober
        # holding the highest value.
        scored = @_mCharSetProbers.map { |prober| [prober.get_confidence, prober] }
        top_confidence, top_prober = scored.max { |a, b| a[0] <=> b[0] }
        if top_prober && top_confidence > MINIMUM_THRESHOLD
          @result = {'encoding' => top_prober.get_charset_name,
                     'confidence' => top_confidence}
          return @result
        end
      end

      if $debug
        $stderr << "no probers hit minimum threshhold\n"
        # Only the first prober's sub-probers are dumped.  The list is
        # empty when no high-bit byte was ever seen, so guard against nil.
        group = @_mCharSetProbers.first
        if group
          group._mProbers.each do |prober|
            next unless prober
            $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
          end
        end
      end
      nil
    end

    private

    # Returns the encoding name for the BOM that aBuf starts with, or nil
    # when it starts with none.  Compares raw bytes, so the string's
    # encoding tag does not matter.
    def bom_encoding_of(aBuf)
      head = aBuf.unpack('C4')
      BOM_SIGNATURES.each do |signature, encoding|
        return encoding if head[0, signature.length] == signature
      end
      nil
    end

    # Returns a byte-oriented copy of aBuf for regexp matching.  The
    # caller's string is never modified.  On Ruby versions whose strings
    # have no encoding (no #force_encoding) the string is returned as is.
    def binary_copy(aBuf)
      return aBuf unless aBuf.respond_to?(:force_encoding)
      aBuf.dup.force_encoding('BINARY')
    end
  end
end