File: BenchmarkingIndexer.pm

package info (click to toggle)
liblucy-perl 0.3.3-4
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 9,328 kB
ctags: 8,492
sloc: ansic: 80,468; perl: 7,080; yacc: 681; java: 174; lex: 96; makefile: 20
file content (226 lines) | stat: -rw-r--r-- 6,872 bytes
parent folder | download | duplicates (2)
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;

package BenchmarkingIndexer;

use Carp;
use Config;
use File::Spec::Functions qw( catfile catdir );
use POSIX qw( uname );

# Constructor.  May be invoked on a class name or on an existing object
# (in which case the new object is blessed into that object's class).
# Any key/value pairs supplied by the caller are layered over the defaults
# below, so they may override defaults or add extra keys.
sub new {
    my ( $invocant, @overrides ) = @_;

    my %fields = (
        docs              => undef,
        increment         => undef,
        store             => undef,
        engine            => undef,
        version           => undef,
        index_dir         => undef,
        corpus_dir        => 'extracted_corpus',
        article_filepaths => undef,
        @overrides,
    );

    return bless \%fields, ref($invocant) || $invocant;
}

# Abstract hooks: this base class provides no implementation, so calling
# either one directly dies with a stack trace.  Subclasses override them.
sub init_indexer {
    confess("abstract method");
}

sub build_index {
    confess("abstract method");
}

# Gather the article file list and fill in any settings the caller left
# undefined: "docs" defaults to the number of article files found, and
# "increment" defaults to one more than "docs".
sub delayed_init {
    my ($self) = @_;

    my $filepaths = $self->build_file_list;
    $self->{article_filepaths} = $filepaths;

    my $num_files = scalar @$filepaths;
    $self->{docs} = $num_files unless defined $self->{docs};
    $self->{increment} = $self->{docs} + 1 unless defined $self->{increment};
}

# Return a lexically sorted list of all article files from all subdirs.
#
# Scans $self->{corpus_dir} for entries whose names contain "articles" and
# collects every file named "article<digits>.txt" found directly inside each
# of them.  The sorted list of paths is stored in $self->{article_filepaths}
# and a reference to that same array is returned.
#
# Dies with a stack trace if the corpus directory or any matching
# subdirectory cannot be opened.
sub build_file_list {
    my $self       = shift;
    my $corpus_dir = $self->{corpus_dir};

    # Lexical directory handles avoid package-global barewords and are
    # closed explicitly as soon as each listing has been read.
    opendir( my $corpus_dh, $corpus_dir )
        or confess "Can't opendir '$corpus_dir': $!";
    my @article_dir_names = grep {/articles/} readdir $corpus_dh;
    closedir $corpus_dh;

    my @article_filepaths;
    for my $article_dir_name (@article_dir_names) {
        my $article_dir = catdir( $corpus_dir, $article_dir_name );
        opendir( my $article_dh, $article_dir )
            or confess "Can't opendir '$article_dir': $!";
        push @article_filepaths, map { catfile( $article_dir, $_ ) }
            grep {m/^article\d+\.txt$/} readdir $article_dh;
        closedir $article_dh;
    }

    # Plain string sort on the full path, as promised above.
    @article_filepaths = sort @article_filepaths;
    $self->{article_filepaths} = \@article_filepaths;
    return $self->{article_filepaths};
}

# Print out stats for one run.
#
# Takes named arguments "rep", "secs" and "count" and prints them as one
# formatted line on the currently selected output handle.
sub print_interim_report {
    my $self  = shift;
    my %stats = @_;
    my ( $rep, $secs, $count ) = @stats{ 'rep', 'secs', 'count' };
    printf "%-3d  Secs: %.3f  Docs: %-4d\n", $rep, $secs, $count;
}

# Start the output: print a rule of 60 dashes followed by a newline.
sub start_report {
    my $rule = '-' x 60;
    print "$rule\n";
}

# Print out aggregate stats.
#
# $times is a reference to an array of elapsed seconds, one entry per run.
# Prints the engine name and version, the Perl version, whether Perl was
# built with thread support, selected uname fields, the plain mean, and a
# truncated mean that ignores the fastest and slowest quarter of the runs.
#
# Dies with a stack trace when $times holds no timings.
sub print_final_report {
    my ( $self, $times ) = @_;
    confess "No timings to report" unless $times and @$times;

    # Timings are numbers, so they must be compared with <=>.  The default
    # sort compares strings and would place 10.5 ahead of 9.0, which makes
    # the truncation below throw away the wrong runs.
    my @sorted_times = sort { $a <=> $b } @$times;

    # Discard fastest 25% and slowest 25% of runs.
    my $num_to_chop = int( @sorted_times / 4 );
    my @kept_times
        = @sorted_times[ $num_to_chop .. $#sorted_times - $num_to_chop ];
    my $num_kept      = @kept_times;
    my $num_discarded = @sorted_times - $num_kept;

    # Produce mean and truncated mean.
    my $mean = 0;
    $mean += $_ for @sorted_times;
    $mean /= @sorted_times;

    my $trunc_mean = 0;
    $trunc_mean += $_ for @kept_times;
    $trunc_mean /= $num_kept;

    $mean       = sprintf( "%.3f", $mean );
    $trunc_mean = sprintf( "%.3f", $trunc_mean );

    # Get some info about the system.
    my $thread_support = $Config{usethreads} ? "yes" : "no";
    my @uname_info = (uname)[ 0, 2, 4 ];

    print <<END_REPORT;
------------------------------------------------------------
$self->{engine} $self->{version} 
Perl $Config{version}
Thread support: $thread_support
@uname_info
Mean: $mean secs 
Truncated mean ($num_kept kept, $num_discarded discarded): $trunc_mean secs
------------------------------------------------------------
END_REPORT
}

package BenchSchema::WhiteSpaceTokenizer;
use base qw( Lucy::Analysis::RegexTokenizer );

# Construct a RegexTokenizer whose pattern is '\S+' (runs of
# non-whitespace characters).  Arguments after the class are ignored.
sub new {
    my $class = shift;
    return $class->SUPER::new( pattern => '\S+' );
}

package BenchSchema;
use base qw( Lucy::Plan::Schema );
use Lucy::Analysis::RegexTokenizer;

# Build a schema that starts out with a single full-text "title" field
# analyzed by BenchSchema::WhiteSpaceTokenizer.
sub new {
    my $class  = shift;
    my $schema = $class->SUPER::new;

    my $tokenizer  = BenchSchema::WhiteSpaceTokenizer->new;
    my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
    $schema->spec_field( name => 'title', type => $title_type );

    return $schema;
}

package BenchmarkingIndexer::Lucy;
use base qw( BenchmarkingIndexer );

use Time::HiRes qw( gettimeofday );

# Constructor.  Accepts the same key/value arguments as the parent class,
# loads Lucy on demand, and attaches a BenchSchema with an added "body"
# field to the object.
#
# Note: index_dir, engine and version are always assigned here, replacing
# any values the caller passed for those keys.
sub new {
    my ( $class, @args ) = @_;
    my $self = $class->SUPER::new(@args);

    require Lucy;
    require Lucy::Index::Indexer;

    # Provide runtime flexibility: the "store" setting decides whether the
    # body field is stored and highlightable.
    my $schema = BenchSchema->new;
    $self->{schema} = $schema;
    my $store_flag = $self->{store} ? 1 : 0;
    $schema->spec_field(
        name => 'body',
        type => Lucy::Plan::FullTextType->new(
            analyzer      => BenchSchema::WhiteSpaceTokenizer->new,
            highlightable => $store_flag,
            stored        => $store_flag,
        ),
    );

    @{$self}{qw( index_dir engine version )}
        = ( 'lucy_index', 'Lucy', $Lucy::VERSION );

    return $self;
}

# Create a Lucy::Index::Indexer for this object's schema and index
# directory.  $count is the number of documents added so far; the
# "truncate" flag is passed as 1 only when $count is zero.
sub init_indexer {
    my $self  = shift;
    my $count = shift;

    my @indexer_args = (
        schema   => $self->{schema},
        index    => $self->{index_dir},
        truncate => ( $count == 0 ? 1 : 0 ),
        create   => 1,
    );
    return Lucy::Index::Indexer->new(@indexer_args);
}

# Build an index containing $self->{docs} documents and time the run.
#
# Article files are read in list order; when the requested number of
# documents exceeds the number of files, the list is walked again from the
# start.  After every $self->{increment} documents the current indexer is
# committed and replaced with a fresh one from init_indexer.
#
# Returns a two-element list: ( number of docs added, elapsed seconds ).
# Dies if documents were requested but there are no article files, or if
# an article file cannot be opened.
sub build_index {
    my $self = shift;
    $self->delayed_init;
    my ( $max, $increment, $article_filepaths )
        = @{$self}{qw( docs increment article_filepaths )};

    # Without this guard an empty file list would spin forever below: the
    # inner loop would never run, so $count could never reach $max.
    die "No article files to index"
        if $max > 0 and !@$article_filepaths;

    # Start timer.
    my $start = gettimeofday();

    my $indexer = $self->init_indexer(0);

    my $count = 0;
    DOC: while ( $count < $max ) {
        for my $article_filepath (@$article_filepaths) {
            # The title is the first line, the body is the rest.
            open( my $article_fh, '<', $article_filepath )
                or die "Can't open file '$article_filepath': $!";

            my %doc;
            $doc{title} = <$article_fh>;
            $doc{body} = do { local $/; <$article_fh> };
            close $article_fh;

            $indexer->add_doc( \%doc );

            # Bail if we've reached spec'd number of docs.
            $count++;
            last DOC if $count >= $max;

            # Interim commit: finish this segment and start a new indexer.
            if ( $count % $increment == 0 ) {
                $indexer->commit;
                undef $indexer;
                $indexer = $self->init_indexer($count);
            }
        }
    }

    # Finish index.
    $indexer->optimize;
    $indexer->commit;

    # Return elapsed seconds.
    my $end  = gettimeofday();
    my $secs = $end - $start;
    return ( $count, $secs );
}

1;