File: universaldetector.rb

package info (click to toggle)
ruby-rchardet 1.3-3
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd, wheezy
size: 664 kB
ctags: 246
sloc: ruby: 5,839; makefile: 3
file content (166 lines) | stat: -rw-r--r-- 5,205 bytes
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jeff Hodges - port to Ruby
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# 
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

module CharDet
  # A prober's confidence must exceed this value for close() to report it.
  MINIMUM_THRESHOLD = 0.20

  # Input states tracked by UniversalDetector#feed.
  EPureAscii = 0 # no high-bit byte and no escape sequence seen so far
  EEscAscii = 1  # an ESC byte or "~{" sequence was seen in otherwise 7-bit data
  EHighbyte = 2  # at least one byte in the range 0x80-0xFF was seen

  # Incremental character-set detector.
  #
  # Typical use: create an instance, call #feed with one or more chunks of
  # raw data, call #close, then read #result. #result is a Hash with the
  # keys 'encoding' (String or nil) and 'confidence' (Float).
  class UniversalDetector
    # Byte-order marks and the encoding each one identifies. Order matters:
    # the four-byte marks are listed before the two-byte marks that are
    # their prefixes. Every mark is stored as a binary string so it can be
    # matched against the binary copy of the input made in #feed.
    BOM_ENCODINGS = [
      ["\xEF\xBB\xBF", 'UTF-8'],                          # EF BB BF
      ["\xFF\xFE\x00\x00", 'UTF-32LE'],                   # FF FE 00 00
      ["\x00\x00\xFE\xFF", 'UTF-32BE'],                   # 00 00 FE FF
      ["\xFE\xFF\x00\x00", 'X-ISO-10646-UCS-4-3412'],     # unusual octet order
      ["\x00\x00\xFF\xFE", 'X-ISO-10646-UCS-4-2143'],     # unusual octet order
      ["\xFF\xFE", 'UTF-16LE'],                           # FF FE
      ["\xFE\xFF", 'UTF-16BE']                            # FE FF
    ].map { |bom, name|
      bom = bom.dup
      # force_encoding only exists on Ruby >= 1.9; on 1.8 strings are bytes.
      bom.force_encoding('BINARY') if bom.respond_to?(:force_encoding)
      [bom.freeze, name]
    }.freeze

    attr_accessor :result

    def initialize
      # The /n flag makes this a byte-oriented pattern; without it Ruby >= 1.9
      # rejects the literal ("invalid multibyte escape").
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /(\033|\~\{)/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Returns the detector to its initial state so it can be reused.
    # Probers that were already created are kept and reset as well.
    def reset
      @result = {'encoding' => nil, 'confidence' => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = EPureAscii
      @_mLastChar = ''
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of raw data to the detector.
    #
    # Does nothing once a result has been decided, or when aBuf is nil or
    # empty. The chunk is examined as bytes: a binary-encoded copy is made,
    # so the caller's string is never modified.
    #
    # @param aBuf [String, nil] the next chunk of input
    # @return [nil]
    def feed(aBuf)
      return if @done
      # An empty chunk carries no information. Skipping it also keeps
      # @_mLastChar from becoming nil (""[-1..-1] is nil).
      return if aBuf.nil? || aBuf.empty?

      aBuf = aBuf.dup.force_encoding('BINARY') if aBuf.respond_to?(:force_encoding)

      # If the data starts with a BOM, we know it is UTF.
      unless @_mGotData
        bom_encoding = detect_bom(aBuf)
        @result = {'encoding' => bom_encoding, 'confidence' => 1.0} if bom_encoding
      end

      @_mGotData = true
      if @result['encoding'] && @result['confidence'] > 0.0
        @done = true
        return
      end

      if @_mInputState == EPureAscii
        if @_highBitDetector =~ aBuf
          @_mInputState = EHighbyte
        elsif @_escDetector =~ (@_mLastChar + aBuf)
          # The previous chunk's last byte is prepended so a "~{" sequence
          # split across two chunks is still noticed.
          @_mInputState = EEscAscii
        end
      end

      @_mLastChar = aBuf[-1..-1]

      case @_mInputState
      when EEscAscii then feed_escape_prober(aBuf)
      when EHighbyte then feed_charset_probers(aBuf)
      end
      nil
    end

    # Finishes detection and settles the final result.
    #
    # * If no data was ever fed, the result is left untouched.
    # * Pure 7-bit input is reported as 'ascii' with confidence 1.0.
    # * For high-byte input, the prober with the greatest confidence wins,
    #   provided that confidence exceeds MINIMUM_THRESHOLD.
    # * Otherwise the result is left as it was, and when the $debug global
    #   is set, diagnostics are written to $stderr.
    #
    # @return [Hash] the same Hash that #result returns
    def close
      return @result if @done

      unless @_mGotData
        $stderr << "no data received!\n" if $debug
        return @result
      end
      @done = true

      case @_mInputState
      when EPureAscii
        return @result = {'encoding' => 'ascii', 'confidence' => 1.0}
      when EHighbyte
        best = @_mCharSetProbers.max_by { |prober| prober.get_confidence }
        if best && best.get_confidence > MINIMUM_THRESHOLD
          return @result = {'encoding' => best.get_charset_name,
                            'confidence' => best.get_confidence}
        end
      end

      dump_prober_confidences if $debug
      @result
    end

    private

    # Returns the encoding name for the BOM that buf starts with, or nil.
    def detect_bom(buf)
      entry = BOM_ENCODINGS.find { |bom, _name| buf.start_with?(bom) }
      entry && entry.last
    end

    # Feeds buf to the escape-sequence prober, creating it on first use, and
    # records its answer when it reports EFoundIt.
    def feed_escape_prober(buf)
      @_mEscCharSetProber ||= EscCharSetProber.new
      return unless @_mEscCharSetProber.feed(buf) == EFoundIt

      @result = {'encoding' => @_mEscCharSetProber.get_charset_name,
                 'confidence' => @_mEscCharSetProber.get_confidence}
      @done = true
    end

    # Feeds buf to each high-byte prober in order, creating them on first
    # use, and stops at the first one that reports EFoundIt.
    def feed_charset_probers(buf)
      if @_mCharSetProbers.nil? || @_mCharSetProbers.empty?
        @_mCharSetProbers = [MBCSGroupProber.new, SBCSGroupProber.new, Latin1Prober.new]
      end

      winner = @_mCharSetProbers.find { |prober| prober.feed(buf) == EFoundIt }
      return unless winner

      @result = {'encoding' => winner.get_charset_name,
                 'confidence' => winner.get_confidence}
      @done = true
    end

    # Writes the confidence of each sub-prober of the first prober to $stderr.
    # Tolerates the case where no probers were created (e.g. the input never
    # left the escape-sequence state).
    def dump_prober_confidences
      $stderr << "no probers hit minimum threshhold\n"
      group = @_mCharSetProbers.first
      return unless group && group.respond_to?(:_mProbers)

      group._mProbers.each do |prober|
        next unless prober
        $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
      end
    end
  end
end