File: texttokenizer.rb

package info (click to toggle)
gonzui 1.2-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 2,824 kB
  • ctags: 1,448
  • sloc: ruby: 9,570; sh: 5,684; ansic: 1,334; lex: 1,140; makefile: 466; perl: 205; ml: 131
file content (50 lines) | stat: -rwxr-xr-x 1,149 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#! /usr/bin/env ruby
$LOAD_PATH.unshift("..")
require 'test/unit'
require 'gonzui'
require 'test-util'
include Gonzui

class TextTokenizerTest < Test::Unit::TestCase
  include TestUtil

  # Drive TextTokenizer.each_word over +text+ and gather every yielded
  # [word, position] pair, asserting along the way that each word is a
  # String and each position an Integer.
  def _test_collect(text)
    collected = []
    TextTokenizer.each_word(text) do |word, pos|
      assert(word.is_a?(String))
      assert(pos.is_a?(Integer))
      collected << [word, pos]
    end
    collected
  end

  # Join +words+ with +delim+, tokenize the result, and verify that each
  # word is recovered in order at the offset where it was placed.
  def _test_with_words(words, delim)
    joined = words.join(delim)
    collected = _test_collect(joined)

    expected_pos = 0
    words.each_with_index do |word, i|
      got_word, got_pos = collected[i]
      assert_equal(word, got_word)
      assert_equal(expected_pos, got_pos)
      expected_pos += word.length + delim.length
    end
  end

  # Tokenize +text+ and verify exactly +n+ tokens come back.
  def _test_text(text, n)
    assert_equal(n, _test_collect(text).length)
  end

  def test_each
    words = %w[foo bar baz]
    # Various separators: whitespace, punctuation, and mixed runs.
    [" ", "\n", ", ", ",\n", "!#%&\n\n\n@@"].each do |delim|
      _test_with_words(words, delim)
    end

    _test_text("foo_bar", 1) # underscore joins a single token
    _test_text("あ", 1)  # single multi-byte character
    _test_text("あい", 2) # two multi-byte characters
  end
end