# Regression tests for Lingua::EN::Tagger noun and noun-phrase extraction.
# Before `make install' is performed this script should be runnable with
# `make test'. After `make install' it should work as `perl test.pl'
#########################
# change 'tests => 1' to 'tests => last_test_to_print';
use Test::More tests => 10;
use Lingua::EN::Tagger;

# The `use` above dies at compile time if the module cannot be loaded, so
# reaching this point already means it compiled. Probing for the constructor
# makes the assertion a real check rather than the truthiness of a constant.
ok( Lingua::EN::Tagger->can('new'), 'module compiled' );

#########################
# Insert your test code below, the Test module is use()ed here so read
# its man page ( perldoc Test ) for help writing this test script.

######################################
# Start by creating the parser object
# (without the stemmer)
######################################
my $parser = Lingua::EN::Tagger->new(
    stem                => 0,
    weight_noun_phrases => 1,
    longest_noun_phrase => 15,
);
ok( $parser, 'creating parser object' );

# Tag the sample passage once; the extraction tests below all reuse it.
my $tagged = $parser->add_tags( penn() );

# get_words() is given the raw passage and is scored against the
# noun-phrase benchmark.
my %words = $parser->get_words( penn() );
ok( scalar( keys %words ), 'get_words() method' );
my $accuracy = compute_accuracy( \%words, np_benchmark() );
is( $accuracy, '100', "accuracy of get_words() extraction ($accuracy%)" );

##############################################
# Test the extraction of maximal noun phrases
##############################################
my %max_noun_phrases = $parser->get_max_noun_phrases($tagged);
ok( scalar( keys %max_noun_phrases ), 'extract MNPs' );
$accuracy = compute_accuracy( \%max_noun_phrases, mnp_benchmark() );
is( $accuracy, '100', "accuracy of mnp extraction ($accuracy%)" );

##############################################
# Test the extraction of all noun phrases
##############################################
my %noun_phrases = $parser->get_noun_phrases($tagged);
ok( scalar( keys %noun_phrases ), 'extract noun phrases' );
$accuracy = compute_accuracy( \%noun_phrases, np_benchmark() );
is( $accuracy, '100', "accuracy of np extraction ($accuracy%)" );

##############################################
# Test the extraction of all nouns
##############################################
my %nouns = $parser->get_nouns($tagged);
ok( scalar( keys %nouns ), 'extract nouns' );
$accuracy = compute_accuracy( \%nouns, noun_benchmark() );
# Labelled as noun extraction so a failure here is not mistaken for the
# noun-phrase test above.
is( $accuracy, '100', "accuracy of noun extraction ($accuracy%)" );
# Score how closely an extracted "term => count" hash matches a benchmark.
#
# Three kinds of check are tallied:
#   * every extracted term must be present in the benchmark;
#   * for terms present in both, the counts must be numerically equal;
#   * every benchmark term must be present in the extraction.
#
# Parameters:
#   $hash_ref  - hash ref of extracted terms and their counts
#   $benchmark - hash ref of expected terms and their counts
#
# Returns the percentage of checks that passed, truncated to an integer and
# formatted as a string ("100" is a perfect match). When both hashes are
# empty no checks are made and no errors exist, so "100" is returned instead
# of dividing by zero.
sub compute_accuracy {
    my ( $hash_ref, $benchmark ) = @_;

    # Treat a missing argument as an empty hash.
    my $extracted = $hash_ref  || {};
    my $expected  = $benchmark || {};
    my ( $errors, $checks ) = ( 0, 0 );

    for my $term ( keys %{$extracted} ) {

        # Check 1: the extracted term must be known to the benchmark.
        $checks++;
        if ( !defined $expected->{$term} ) {

            # warn "$term not in benchmark\n";
            $errors++;
            next;
        }

        # Check 2: the counts must agree (only meaningful once the term is
        # known to exist on both sides).
        $checks++;
        if ( $extracted->{$term} != $expected->{$term} ) {

            # warn "$term: $extracted->{$term} != $expected->{$term} (benchmark)\n";
            $errors++;
        }
    }

    # Check 3: nothing the benchmark expects may be missing from the
    # extraction.
    for my $term ( keys %{$expected} ) {
        $checks++;
        if ( !defined $extracted->{$term} ) {

            # warn "$term not defined in extraction\n";
            $errors++;
        }
    }

    # Nothing to compare means nothing went wrong; avoid dividing by zero.
    return '100' if $checks == 0;

    return sprintf( "%d", 100 * ( 1 - $errors / $checks ) );
}
# Expected maximal noun phrases for the penn() passage.
#
# Returns a reference to a freshly built hash mapping each expected phrase
# (lower-cased) to 1. The hash is lexical, so calling this sub does not
# overwrite any package variable.
sub mnp_benchmark {
    my @phrases = (
        'lisa raines',
        'lawyer',
        'director of government relations for the industrial biotechnical association',
        'judge',
        'patent law',
        'concerns of research-based industries',
        'judge newman',
        'former patent lawyer',
        'dissent',
        'court',
        'motion for a rehearing of the case by the full court',
        'panel',
        'judicial legislation',
        'important high-technological industry',
        'regard',
        'consequences for research',
        'innovation',
        'public interest',
        'ms. raines',
        'judgement',
        'concern that the absence of patent lawyers on the court',
    );

    # Every maximal noun phrase carries the same benchmark value of 1.
    my %benchmark = map { $_ => 1 } @phrases;
    return \%benchmark;
}
# Expected single nouns for the penn() passage.
#
# Returns a reference to a freshly built hash mapping each expected
# lower-cased noun to its benchmark count. The hash is lexical, so calling
# this sub does not overwrite any package variable.
sub noun_benchmark {
    my %benchmark = (
        'lisa'         => 1,
        'raines'       => 2,
        'lawyer'       => 2,
        'director'     => 1,
        'relations'    => 1,
        'government'   => 1,
        'association'  => 1,
        'judge'        => 2,
        'patent'       => 3,
        'law'          => 1,
        'concerns'     => 1,
        'industries'   => 1,
        'newman'       => 1,
        'dissent'      => 1,
        'court'        => 3,
        'motion'       => 1,
        'rehearing'    => 1,
        'case'         => 1,
        'panel'        => 1,
        'legislation'  => 1,
        'industry'     => 1,
        'regard'       => 1,
        'consequences' => 1,
        'research'     => 1,
        'innovation'   => 1,
        'interest'     => 1,
        'ms.'          => 1,
        'judgement'    => 1,
        'concern'      => 1,
        'industrial'   => 1,
        'biotechnical' => 1,
        'absence'      => 1,
        'lawyers'      => 1,
    );
    return \%benchmark;
}
# Expected results for full noun-phrase extraction on the penn() passage.
#
# Returns a reference to a freshly built hash with two groups of entries:
# the single nouns with their benchmark counts, followed by the multi-word
# phrases with their benchmark values. The hash is lexical, so calling this
# sub does not overwrite any package variable.
sub np_benchmark {
    my %benchmark = (

        # Single nouns.
        'lisa'         => 1,
        'raines'       => 2,
        'lawyer'       => 2,
        'director'     => 1,
        'relations'    => 1,
        'government'   => 1,
        'association'  => 1,
        'judge'        => 2,
        'patent'       => 3,
        'law'          => 1,
        'concerns'     => 1,
        'industries'   => 1,
        'newman'       => 1,
        'dissent'      => 1,
        'court'        => 3,
        'motion'       => 1,
        'rehearing'    => 1,
        'case'         => 1,
        'panel'        => 1,
        'legislation'  => 1,
        'industry'     => 1,
        'regard'       => 1,
        'consequences' => 1,
        'research'     => 1,
        'innovation'   => 1,
        'interest'     => 1,
        'ms.'          => 1,
        'judgement'    => 1,
        'concern'      => 1,
        'industrial'   => 1,
        'biotechnical' => 1,
        'absence'      => 1,
        'lawyers'      => 1,

        # Maximal noun phrases.
        'lisa raines' => 2,
        'director of government relations for the industrial biotechnical association' => 9,
        'patent law'                                              => 2,
        'concerns of research-based industries'                   => 4,
        'judge newman'                                            => 2,
        'former patent lawyer'                                    => 3,
        'motion for a rehearing of the case by the full court'    => 11,
        'judicial legislation'                                    => 2,
        'important high-technological industry'                   => 3,
        'consequences for research'                               => 3,
        'public interest'                                         => 2,
        'ms. raines'                                              => 2,
        'concern that the absence of patent lawyers on the court' => 10,

        # Shorter phrases nested inside the maximal ones.
        'government relations'                => 2,
        'industrial biotechnical association' => 3,
        'biotechnical association'            => 2,
        'research-based industries'           => 2,
        'patent lawyer'                       => 2,
        'full court'                          => 2,
        'high-technological industry'         => 2,
        'patent lawyers'                      => 2,
    );
    return \%benchmark;
}
# Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
###############################################
# Words that mostly don't occur in the lexicon
###############################################
sub jibberish {
    # The sample sentence spans two lines; the line break is part of the
    # returned text.
    my @lines = (
        'Nils occludes the 5 corybantic sciolists from fressing upon the',
        'northeast-oriented perambulations of the yabbering doyenne',
    );
    return join "\n", @lines;
}
##########################################################
# Hyphenated words that mostly don't occur in the lexicon
##########################################################
sub hyphen {
    # Per the original note: brother-in-law is not in the lexicon, while
    # sister-in-law is. The line break is part of the returned text.
    my @lines = (
        'The brother-in-law. The sister-in-law. A strategy of tit-for-tat among',
        'middle-eastern states.',
    );
    return join "\n", @lines;
}
####################################################
# Test the tagger against an actual tagged corpus
####################################################
sub penn {
    # Sample passage used as input for the extraction tests. The returned
    # text ends with a single newline. The heredoc terminator must stay in
    # column 0.
    my $passage = <<'END_OF_PASSAGE';
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
END_OF_PASSAGE
    return $passage;
}
# End of test script.