File: texttokenizer.rb

package info (click to toggle)
gonzui 1.2-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 2,824 kB
  • ctags: 1,448
  • sloc: ruby: 9,570; sh: 5,684; ansic: 1,334; lex: 1,140; makefile: 466; perl: 205; ml: 131
file content (50 lines) | stat: -rwxr-xr-x 1,149 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#! /usr/bin/env ruby
$LOAD_PATH.unshift("..")
require 'test/unit'
require 'gonzui'
require 'test-util'
include Gonzui

class TextTokenizerTest < Test::Unit::TestCase
  include TestUtil

  # Drive TextTokenizer.each_word over +text+ and gather every yielded
  # [word, position] pair, asserting along the way that each word is a
  # String and each position an Integer.
  def _test_collect(text)
    collected = []
    TextTokenizer.each_word(text) do |word, pos|
      assert(word.is_a?(String))
      assert(pos.is_a?(Integer))
      collected << [word, pos]
    end
    collected
  end

  # Join +words+ with +delim+, tokenize the result, and verify that each
  # word is recovered in order at the offset where it was placed.
  def _test_with_words(words, delim)
    joined = words.join(delim)
    collected = _test_collect(joined)

    expected_pos = 0
    words.each_with_index do |word, i|
      got_word, got_pos = collected[i]
      assert_equal(word, got_word)
      assert_equal(expected_pos, got_pos)
      expected_pos += word.length + delim.length
    end
  end

  # Tokenize +text+ and verify exactly +n+ tokens come back.
  def _test_text(text, n)
    assert_equal(n, _test_collect(text).length)
  end

  def test_each
    words = %w[foo bar baz]
    # Various separators: whitespace, punctuation, and mixed runs.
    [" ", "\n", ", ", ",\n", "!#%&\n\n\n@@"].each do |delim|
      _test_with_words(words, delim)
    end

    _test_text("foo_bar", 1) # underscore joins a single token
    _test_text("あ", 1)  # single multi-byte character
    _test_text("あい", 2) # two multi-byte characters
  end
end