File: 158-standard_tokenizer.t

package info (click to toggle)
liblucy-perl 0.3.3-4
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 9,328 kB
  • ctags: 8,492
  • sloc: ansic: 80,468; perl: 7,080; yacc: 681; java: 174; lex: 96; makefile: 20
file content (53 lines) | stat: -rw-r--r-- 1,928 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;

use Test::More tests => 6;
use Lucy::Test;

# Two independently constructed tokenizers; the equality check must compare
# distinct instances, since an object compared with itself proves nothing.
my $tokenizer = Lucy::Analysis::StandardTokenizer->new;
my $other     = Lucy::Analysis::StandardTokenizer->new;
ok( $tokenizer->equals($other), "Equals" );

# split() is dereferenced as an array ref below; the first element is
# expected to be the whole input, apostrophes included.
my $text = $tokenizer->split("o'malley's")->[0];
is( $text, "o'malley's", "multiple apostrophes" );

# Run an Inversion built from raw text through the tokenizer and collect
# the text and offsets of every token it yields.
my $inversion = Lucy::Analysis::Inversion->new( text => "a b c" );
$inversion = $tokenizer->transform($inversion);

my ( @token_texts, @start_offsets, @end_offsets );
while ( my $token = $inversion->next ) {
    push @token_texts,   $token->get_text;
    push @start_offsets, $token->get_start_offset;
    push @end_offsets,   $token->get_end_offset;
}
is_deeply( \@token_texts,   [qw( a b c )], "correct texts" );
is_deeply( \@start_offsets, [ 0, 2, 4, ],  "correct start offsets" );
is_deeply( \@end_offsets,   [ 1, 3, 5, ],  "correct end offsets" );

# Feed the tokenizer its own output: the Inversion now holds several tokens
# rather than one block of text.  reset() is called first, presumably to
# rewind the iterator that the loop above ran to exhaustion.
$inversion->reset;
$inversion   = $tokenizer->transform($inversion);
@token_texts = ();
while ( my $token = $inversion->next ) {
    push @token_texts, $token->get_text;
}
is_deeply(
    \@token_texts,
    [ 'a', 'b', 'c' ],
    "no freakout when fed multiple tokens"
);