1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
|
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string> // for std::string
#include "include_gunit.h" // for TEST
#include "log.h" // for LOG
// ccmain
#include "paragraphs.h"
#include "paragraphs_internal.h"
// ccstruct
#include "ocrpara.h"
namespace tesseract {
// Functions for making monospace ASCII trial text for the paragraph detector.
// Short aliases for the paragraph justification values, used when spelling
// out the expected paragraph models in the test tables of this file.
const ParagraphJustification kLeft{JUSTIFICATION_LEFT};
const ParagraphJustification kCenter{JUSTIFICATION_CENTER};
const ParagraphJustification kRight{JUSTIFICATION_RIGHT};
const ParagraphJustification kUnknown{JUSTIFICATION_UNKNOWN};
// How one row of ASCII trial text relates to the paragraph structure that the
// test expects. The enumerators keep their default values 0, 1 and 2.
enum TextModelInputType {
  // Continuation line of a paragraph (default).
  PCONT,
  // First line of a paragraph.
  PSTART,
  // Not a paragraph line.
  PNONE,
};
// One row of test input: a line of ASCII text together with the annotations
// that the test expects for that row. The tables in this file build these
// with positional brace initialisers, so the member order is part of the
// interface and must not change.
struct TextAndModel {
// Text of the row.
const char *ascii;
// Whether the row continues a paragraph, starts one, or is not a paragraph line.
TextModelInputType model_type;
// fields corresponding to PARA (see ccstruct/ocrpara.h)
// Expected model. The evaluator only looks at it on rows whose model_type is
// not PCONT, and skips the comparison when its justification is kUnknown.
ParagraphModel model;
// Expected value of the PARA flag of the same name; shown as " crown" in
// the evaluator's debug output.
bool is_very_first_or_continuation;
// Expected value of the PARA flag of the same name; shown as " li" in the
// evaluator's debug output.
bool is_list_item;
};
// Imagine that the given text is typewriter ASCII with each character ten
// pixels wide and twenty pixels high and return an appropriate row_info.
void AsciiToRowInfo(const char *text, int row_number, RowInfo *info) {
const int kCharWidth = 10;
const int kLineSpace = 30;
info->text = text;
info->has_leaders = strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
info->has_drop_cap = false;
info->pix_ldistance = info->pix_rdistance = 0;
info->average_interword_space = kCharWidth;
info->pix_xheight = kCharWidth;
info->lword_text = info->rword_text = "";
info->ltr = true;
std::vector<std::string> words = split(text, ' ');
info->num_words = words.size();
if (info->num_words < 1) {
return;
}
info->lword_text = words[0].c_str();
info->rword_text = words[words.size() - 1].c_str();
int lspace = 0;
while (lspace < info->text.size() && text[lspace] == ' ') {
lspace++;
}
int rspace = 0;
while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') {
rspace++;
}
int top = -kLineSpace * row_number;
int bottom = top - kLineSpace;
int row_right = kCharWidth * info->text.size();
int lword_width = kCharWidth * info->lword_text.size();
int rword_width = kCharWidth * info->rword_text.size();
info->pix_ldistance = lspace * kCharWidth;
info->pix_rdistance = rspace * kCharWidth;
info->lword_box = TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,
row_right - info->pix_rdistance, top);
LeftWordAttributes(nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
&info->lword_likely_starts_idea, &info->lword_likely_ends_idea);
RightWordAttributes(nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
&info->rword_likely_starts_idea, &info->rword_likely_ends_idea);
}
void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo> *output) {
output->clear();
RowInfo info;
for (int i = 0; i < n; i++) {
AsciiToRowInfo(row_infos[i].ascii, i, &info);
output->push_back(info);
}
}
// Given n rows of reference ground truth, evaluate whether the n rows
// of PARA * pointers yield the same paragraph breakpoints.
void EvaluateParagraphDetection(const TextAndModel *correct, int n,
const std::vector<PARA *> &detector_output) {
int incorrect_breaks = 0;
int missed_breaks = 0;
int poorly_matched_models = 0;
int bad_crowns = 0;
int bad_list_items = 0;
ASSERT_EQ(detector_output.size(), n);
for (int i = 1; i < n; i++) {
bool has_break = correct[i].model_type != PCONT;
bool detected_break = (detector_output[i - 1] != detector_output[i]);
if (has_break && !detected_break) {
missed_breaks++;
}
if (detected_break && !has_break) {
incorrect_breaks++;
}
if (has_break) {
if (correct[i].model_type == PNONE) {
if (detector_output[i]->model != nullptr) {
poorly_matched_models++;
}
} else {
if (correct[i].model.justification() != kUnknown &&
(detector_output[i]->model == nullptr ||
!correct[i].model.Comparable(*detector_output[i]->model))) {
poorly_matched_models++;
}
}
if (correct[i].is_very_first_or_continuation ^
detector_output[i]->is_very_first_or_continuation) {
bad_crowns++;
}
if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {
bad_list_items++;
}
}
}
EXPECT_EQ(incorrect_breaks, 0);
EXPECT_EQ(missed_breaks, 0);
EXPECT_EQ(poorly_matched_models, 0);
EXPECT_EQ(bad_list_items, 0);
EXPECT_EQ(bad_crowns, 0);
if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) {
std::vector<std::string> dbg_lines;
dbg_lines.emplace_back("# ==========================");
dbg_lines.emplace_back("# Correct paragraph breaks:");
dbg_lines.emplace_back("# ==========================");
for (int i = 0; i < n; i++) {
if (correct[i].model_type != PCONT) {
std::string s = std::string(correct[i].ascii) + " # " +
correct[i].model.ToString() +
(correct[i].is_very_first_or_continuation ? " crown" : "") +
(correct[i].is_list_item ? " li" : "");
dbg_lines.push_back(s);
} else {
dbg_lines.emplace_back(correct[i].ascii);
}
}
dbg_lines.emplace_back("");
dbg_lines.emplace_back("# ==========================");
dbg_lines.emplace_back("# Paragraph detector output:");
dbg_lines.emplace_back("# ==========================");
for (int i = 0; i < n; i++) {
std::string annotation;
if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
if (detector_output[i] && detector_output[i]->model) {
annotation +=
" # " + detector_output[i]->model->ToString() +
(detector_output[i]->is_very_first_or_continuation ? " crown" : "") +
(detector_output[i]->is_list_item ? " li" : "");
} else {
annotation = " # Unmodeled paragraph.";
}
}
std::string s = correct[i].ascii + annotation;
dbg_lines.push_back(s);
}
std::string s;
for (auto &dbg_line : dbg_lines) {
s += dbg_line + "\n";
}
LOG(INFO) << "Discrepancy!\n" << s;
}
}
void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
std::vector<RowInfo> row_infos;
std::vector<PARA *> row_owners;
PARA_LIST paragraphs;
std::vector<ParagraphModel *> models;
MakeAsciiRowInfos(correct, num_rows, &row_infos);
int debug_level(3);
tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, ¶graphs, &models);
EvaluateParagraphDetection(correct, num_rows, row_owners);
for (auto *model : models) {
delete model;
}
}
// Checks AsciiLikelyListItem on words that must be recognised as list item
// markers and on ordinary words that must not be.
TEST(ParagraphsTest, ListItemsIdentified) {
  // Roman numerals, letters and digits, with and without punctuation.
  const char *const kListMarkers[] = {
      "iii", "A.", "B.", "C.", "1.", "2.", "3.", "1", "2", "3", "[[1]]", "A-1.", "A-2", "(A)(i)",
  };
  for (const char *word : kListMarkers) {
    EXPECT_TRUE(tesseract::AsciiLikelyListItem(word));
  }

  // Plain words, including ones that end in a period.
  const char *const kPlainWords[] = {"The", "first", "house", "Oregonian.", "on."};
  for (const char *word : kPlainWords) {
    EXPECT_FALSE(tesseract::AsciiLikelyListItem(word));
  }
}
// Shorthand that keeps the rows of the test tables below on one line.
using PModel = ParagraphModel;
// Fixture: two paragraphs of three and six rows. Both opening rows are
// annotated PSTART with the same expected left-justified model, and no row is
// marked as a crown or a list item.
const TextAndModel kTwoSimpleParagraphs[] = {
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"This paragraph starts at the top", PCONT, PModel(), false, false},
{"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"which indicates that the first ", PCONT, PModel(), false, false},
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
{"from a previous page, as it is ", PCONT, PModel(), false, false},
{"indented just like this second ", PCONT, PModel(), false, false},
{"paragraph. ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kTwoSimpleParagraphs.
TEST(ParagraphsTest, TestSimpleParagraphDetection) {
  const int num_rows = countof(kTwoSimpleParagraphs);
  TestParagraphDetection(kTwoSimpleParagraphs, num_rows);
}
// Fixture: a two-row opening paragraph whose first row is annotated as a crown
// (is_very_first_or_continuation = true), followed by a six-row paragraph
// with the same expected model that is not a crown.
const TextAndModel kFewCluesWithCrown[] = {
{"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
{"of the page and takes two lines.", PCONT, PModel(), false, false},
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"which indicates that the first ", PCONT, PModel(), false, false},
{"paragraph is a continuation from", PCONT, PModel(), false, false},
{"a previous page, as it is ", PCONT, PModel(), false, false},
{"indented just like this second ", PCONT, PModel(), false, false},
{"paragraph. ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kFewCluesWithCrown.
TEST(ParagraphsTest, TestFewCluesWithCrown) {
  const int num_rows = countof(kFewCluesWithCrown);
  TestParagraphDetection(kFewCluesWithCrown, num_rows);
}
// Fixture: three paragraphs sharing one expected model. The first paragraph
// (five rows) is annotated as a crown; the two that follow are not.
const TextAndModel kCrownedParagraph[] = {
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
{"often not indented as the rest ", PCONT, PModel(), false, false},
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
{"less it should be counted as the", PCONT, PModel(), false, false},
{"same type of paragraph. ", PCONT, PModel(), false, false},
{" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"graphs are both indented two ", PCONT, PModel(), false, false},
{"spaces. ", PCONT, PModel(), false, false},
{" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kCrownedParagraph.
TEST(ParagraphsTest, TestCrownParagraphDetection) {
  const int num_rows = countof(kCrownedParagraph);
  TestParagraphDetection(kCrownedParagraph, num_rows);
}
// Fixture: two paragraphs of four and eight rows whose opening rows are both
// annotated PSTART with the expected model PModel(kLeft, 0, 0, 0, 0). Neither
// paragraph is annotated as a crown.
const TextAndModel kFlushLeftParagraphs[] = {
{"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
{"flush left paragraphs (those", PCONT, PModel(), false, false},
{"with no body indent) are not", PCONT, PModel(), false, false},
{"actually crowns. ", PCONT, PModel(), false, false},
{"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
{"also flush left aligned. Usual-", PCONT, PModel(), false, false},
{"ly, these paragraphs are set", PCONT, PModel(), false, false},
{"apart vertically by some white-", PCONT, PModel(), false, false},
{"space, but you can also detect", PCONT, PModel(), false, false},
{"them by observing the big empty", PCONT, PModel(), false, false},
{"space at the ends of the para-", PCONT, PModel(), false, false},
{"graphs. ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kFlushLeftParagraphs. The test belongs
// to the ParagraphsTest suite like every other test in this file, so that a
// ParagraphsTest.* filter selects it as well.
TEST(ParagraphsTest, TestRealFlushLeftParagraphs) {
  TestParagraphDetection(kFlushLeftParagraphs, countof(kFlushLeftParagraphs));
}
// Fixture: fourteen rows forming a single paragraph. Only the first row is
// annotated PSTART, and it is marked as a crown; every other row is PCONT.
const TextAndModel kSingleFullPageContinuation[] = {
{"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
{"continuation. It flows from", PCONT, PModel(), false, false},
{"line to line, using the full", PCONT, PModel(), false, false},
{"column width with no clear", PCONT, PModel(), false, false},
{"paragraph break, because it", PCONT, PModel(), false, false},
{"actually doesn't have one. It", PCONT, PModel(), false, false},
{"is the middle of one monster", PCONT, PModel(), false, false},
{"paragraph continued from the", PCONT, PModel(), false, false},
{"previous page and continuing", PCONT, PModel(), false, false},
{"onto the next page. There-", PCONT, PModel(), false, false},
{"fore, it ends up getting", PCONT, PModel(), false, false},
{"marked as a crown and then", PCONT, PModel(), false, false},
{"getting re-marked as any ex-", PCONT, PModel(), false, false},
{"isting model. Not great, but", PCONT, PModel(), false, false},
};
// Runs the detector over kSingleFullPageContinuation with one paragraph model
// already present in `models` before detection starts, then checks the result
// against the fixture's annotations.
TEST(ParagraphsTest, TestSingleFullPageContinuation) {
  const TextAndModel *correct = kSingleFullPageContinuation;
  int num_rows = countof(kSingleFullPageContinuation);
  std::vector<RowInfo> row_infos;
  std::vector<PARA *> row_owners;
  PARA_LIST paragraphs;
  std::vector<ParagraphModel *> models;
  // Pre-existing model handed to the detector.
  models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));
  MakeAsciiRowInfos(correct, num_rows, &row_infos);
  tesseract::DetectParagraphs(3, &row_infos, &row_owners, &paragraphs, &models);
  EvaluateParagraphDetection(correct, num_rows, row_owners);
  // Frees the seeded model and any others left in `models` after detection.
  for (auto *model : models) {
    delete model;
  }
}
// Fixture: two four-row paragraphs whose opening rows are annotated PSTART
// with the expected right-justified model PModel(kRight, 0, 0, 0, 0).
const TextAndModel kRightAligned[] = {
{"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
{" uncommon in Left-to-Right", PCONT, PModel(), false, false},
{" languages, but they do", PCONT, PModel(), false, false},
{" exist.", PCONT, PModel(), false, false},
{" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
{" horribly tiny paragraphs in", PCONT, PModel(), false, false},
{" tables on which we have no", PCONT, PModel(), false, false},
{" chance anyways.", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kRightAligned.
TEST(ParagraphsTest, TestRightAlignedParagraph) {
  const int num_rows = countof(kRightAligned);
  TestParagraphDetection(kRightAligned, num_rows);
}
// Fixture: a four-row paragraph, three consecutive single-row paragraphs of
// quoted dialogue (each annotated PSTART), and a closing eight-row paragraph.
// All five paragraphs share the same expected model.
const TextAndModel kTinyParagraphs[] = {
{" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"obvious paragraph text, you might", PCONT, PModel(), false, false},
{"find short exchanges of dialogue ", PCONT, PModel(), false, false},
{"between characters. ", PCONT, PModel(), false, false},
{" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"mark a new paragraph whenever one", PCONT, PModel(), false, false},
{"of the statistics (left, right or", PCONT, PModel(), false, false},
{"center) changes from one text-", PCONT, PModel(), false, false},
{"line to the next. Such an", PCONT, PModel(), false, false},
{"approach would misclassify the", PCONT, PModel(), false, false},
{"tiny paragraphs above as a single", PCONT, PModel(), false, false},
{"paragraph. ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kTinyParagraphs.
TEST(ParagraphsTest, TestTinyParagraphs) {
  const int num_rows = countof(kTinyParagraphs);
  TestParagraphDetection(kTinyParagraphs, num_rows);
}
// Fixture: a page mixing several layouts. In order: a five-row centered title
// block, one blank row annotated PNONE, two ordinary paragraphs, a block quote
// annotated as a crown, another ordinary paragraph, five numbered list items
// (is_list_item = true, some spanning several rows), and a final ordinary
// paragraph.
const TextAndModel kComplexPage1[] = {
{" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
{" Centered Title ", PCONT, PModel(), false, false},
{" Paragraph Detection ", PCONT, PModel(), false, false},
{" OCR TEAM ", PCONT, PModel(), false, false},
{" 10 November 2010 ", PCONT, PModel(), false, false},
{" ", PNONE, PModel(), false, false},
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"This paragraph starts at the top", PCONT, PModel(), false, false},
{"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"which indicates that the first ", PCONT, PModel(), false, false},
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
{"from a previous page, as it is ", PCONT, PModel(), false, false},
{"indented just like this second ", PCONT, PModel(), false, false},
{"paragraph. ", PCONT, PModel(), false, false},
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
{" looks like the prior text ", PCONT, PModel(), false, false},
{" but it is indented more ", PCONT, PModel(), false, false},
{" and is fully justified. ", PCONT, PModel(), false, false},
{" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"centered text, block quotes, ", PCONT, PModel(), false, false},
{"normal paragraphs, and lists ", PCONT, PModel(), false, false},
{"like what follows? ", PCONT, PModel(), false, false},
{"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" looking for lines where the ", PCONT, PModel(), false, false},
{" first word of the next line ", PCONT, PModel(), false, false},
{" would fit on the previous ", PCONT, PModel(), false, false},
{" line. ", PCONT, PModel(), false, false},
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" Python and try it out. ", PCONT, PModel(), false, false},
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" mistakes. ", PCONT, PModel(), false, false},
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"you can try to identify source ", PCONT, PModel(), false, false},
{"code. Ouch! ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kComplexPage1.
TEST(ParagraphsTest, TestComplexPage1) {
  const int num_rows = countof(kComplexPage1);
  TestParagraphDetection(kComplexPage1, num_rows);
}
// The same as above, but wider.
// The rows follow the same sequence as kComplexPage1 (title block, blank row,
// paragraphs, block quote, numbered list, closing paragraph) with the text
// re-wrapped. One annotation differs: the "1. Make a plan." row is PCONT with
// no list-item flag here, whereas kComplexPage1 annotates it PSTART with
// is_list_item = true. The "BUG!!" marker on that row presumably records a
// known detector shortcoming that the expectation was adjusted to match --
// TODO confirm.
const TextAndModel kComplexPage2[] = {
{" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
{" Centered Title ", PCONT, PModel(), false, false},
{" Paragraph Detection ", PCONT, PModel(), false, false},
{" OCR TEAM ", PCONT, PModel(), false, false},
{" 10 November 2010 ", PCONT, PModel(), false, false},
{" ", PNONE, PModel(), false, false},
{" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"This paragraph starts at the top of", PCONT, PModel(), false, false},
{"the page and takes 3 lines. ", PCONT, PModel(), false, false},
{" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"which indicates that the first ", PCONT, PModel(), false, false},
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
{"from a previous page, as it is in- ", PCONT, PModel(), false, false},
{"dented just like this second para- ", PCONT, PModel(), false, false},
{"graph. ", PCONT, PModel(), false, false},
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
{" looks like the prior text ", PCONT, PModel(), false, false},
{" but it is indented more ", PCONT, PModel(), false, false},
{" and is fully justified. ", PCONT, PModel(), false, false},
{" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
{"graphs, and lists like what follow?", PCONT, PModel(), false, false},
{"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
{"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" looking for lines where the ", PCONT, PModel(), false, false},
{" first word of the next line ", PCONT, PModel(), false, false},
{" would fit on the previous line. ", PCONT, PModel(), false, false},
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" Python and try it out. ", PCONT, PModel(), false, false},
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" mistakes. ", PCONT, PModel(), false, false},
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
{" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"you can try to identify source ", PCONT, PModel(), false, false},
{"code. Ouch! ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kComplexPage2.
TEST(ParagraphsTest, TestComplexPage2) {
  const int num_rows = countof(kComplexPage2);
  TestParagraphDetection(kComplexPage2, num_rows);
}
// Fixture: a five-row crown paragraph, a two-row second paragraph with the
// same expected model, and a final " 1235 " row annotated PNONE. The final
// row is the last element so that a test can leave it out by passing a row
// count one smaller than the array size.
const TextAndModel kSubtleCrown[] = {
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
{"often not indented as the rest ", PCONT, PModel(), false, false},
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
{"less it should be counted as the", PCONT, PModel(), false, false},
{"same type of paragraph. ", PCONT, PModel(), false, false},
{" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
{"should suffice. ", PCONT, PModel(), false, false},
{" 1235 ", PNONE, PModel(), false, false},
};
// Runs the detector over kSubtleCrown without its last row, i.e. without the
// trailing PNONE line.
TEST(ParagraphsTest, TestSubtleCrown) {
  const int num_rows = countof(kSubtleCrown) - 1;
  TestParagraphDetection(kSubtleCrown, num_rows);
}
// Runs the detector over all of kSubtleCrown, including the trailing PNONE
// line that does not belong to any paragraph.
TEST(ParagraphsTest, TestStrayLineInBlock) {
  const int num_rows = countof(kSubtleCrown);
  TestParagraphDetection(kSubtleCrown, num_rows);
}
// Fixture: four paragraphs of running text about pension and health benefit
// plans. Each opening row is annotated PSTART with the expected model
// PModel(kLeft, 0, 50, 0, 0); no row is marked as a crown or a list item.
const TextAndModel kUnlvRep3AO[] = {
{" Defined contribution plans cover employees in Australia, New", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false,
false},
{"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(),
false, false},
{"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false,
false},
{"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false,
false},
{"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false,
false},
{"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false,
false},
{"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false,
false},
{" In addition to providing pension benefits, the Company pro- ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false,
false},
{"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false,
false},
{"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false,
false},
{"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false,
false},
{"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false,
false},
{"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false,
false},
{"and life insurance benefits in the year incurred. ", PCONT, PModel(), false,
false},
{" The U.S. plan covering the parent company is the largest plan.", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false,
false},
{"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false,
false},
{"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false,
false},
{"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(),
false, false},
{"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false,
false},
{"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false,
false},
{"credited service. The Company has the ability to change these ", PCONT, PModel(), false,
false},
{"benefits at any time. ", PCONT, PModel(), false,
false},
{" Effective October 1993, the Company amended its health ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false,
false},
{"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false,
false},
{"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false,
false},
{"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false,
false},
{"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false,
false},
{"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false,
false},
{"for 1994 by approximately $83. ", PCONT, PModel(), false,
false},
};
// Runs the detector over every row of kUnlvRep3AO.
TEST(ParagraphsTest, TestUnlvInsurance) {
  const int num_rows = countof(kUnlvRep3AO);
  TestParagraphDetection(kUnlvRep3AO, num_rows);
}
// The basic outcome we want for something with a bunch of leader dots is that
// we group each logical entry as a separate item. Without knowledge of
// leaders, we would most likely mark the text below as a simple right aligned
// paragraph or two.
// This example comes from Volume 9886293, Page 5
//
// Every row is annotated PSTART, so each row is expected to be its own
// paragraph. The expected justification is kUnknown on every row, which makes
// the evaluator skip the model comparison and check only the breaks and the
// crown/list-item flags.
const TextAndModel kTableOfContents[] = {
{"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
{" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
};
// Runs the detector over every row of kTableOfContents.
TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
  const int num_rows = countof(kTableOfContents);
  TestParagraphDetection(kTableOfContents, num_rows);
}
// Fixture: two prose paragraphs, then a blank row annotated PNONE followed by
// a source code listing and another blank row that are all annotated PCONT
// (so no break is expected anywhere inside the listing), then a closing prose
// paragraph annotated PSTART.
const TextAndModel kTextWithSourceCode[] = {
{" A typical page of a programming book may contain", PSTART, PModel(kLeft, 0, 20, 0, 0),
false, false},
{"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
{"being described in prose. Such examples should be", PCONT, PModel(), false, false},
{"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
{"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false},
{"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
{"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
{"when the text is re-flowed. ", PCONT, PModel(), false, false},
{" Let's show this by describing the function fact-", PSTART, PModel(kLeft, 0, 20, 0, 0),
false, false},
{"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
{"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
{"that the typical C implementation will only work ", PCONT, PModel(), false, false},
{"for values less than about 12: ", PCONT, PModel(), false, false},
{" ", PNONE, PModel(), false, false},
{" # Naive implementation in C ", PCONT, PModel(), false, false},
{" int factorial(int n) { ", PCONT, PModel(), false, false},
{" if (n < 2) ", PCONT, PModel(), false, false},
{" return 1; ", PCONT, PModel(), false, false},
{" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
{" } ", PCONT, PModel(), false, false},
{" ", PCONT, PModel(), false, false},
{" The C programming language does not have built- ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false, false},
{"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
{"naive implementation simply returns random values ", PCONT, PModel(), false, false},
{"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
};
// Runs the detector over every row of kTextWithSourceCode.
TEST(ParagraphsTest, NotDistractedBySourceCode) {
  const int num_rows = countof(kTextWithSourceCode);
  TestParagraphDetection(kTextWithSourceCode, num_rows);
}
// Fixture: a page of narrative prose followed by a run of short dialogue
// paragraphs. Each row holds, in order: the text of one line, the expected
// label for that line (PSTART or PCONT here), a PModel, and two boolean flags
// that are false for every row in this table (their meaning is defined with
// TextAndModel, outside this chunk).
//
// Rows labelled PSTART carry PModel(kLeft, 0, 50, 0, 0); rows labelled PCONT
// carry a default-constructed PModel().
// NOTE(review): the third PModel argument (50) is presumably the first-line
// indent of the paragraph model -- confirm against the PModel definition.
//
// Some rows spell their text as two adjacent string literals; the compiler
// concatenates them into a single string, so the split is layout only.
const TextAndModel kOldManAndSea[] = {
// The first row is labelled PSTART even though its text begins mid-sentence.
{"royal palm which are called guano and in it there was a bed, a", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(),
false, false},
{"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(),
false, false},
{"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(),
false, false},
{"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(),
false, false},
{"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(),
false, false},
{"wife on the wall but he had taken it down because it made him too", PCONT, PModel(),
false, false},
{"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(),
false, false},
{"shirt. ", PCONT, PModel(),
false, false},
// Dialogue: each of the following one-line utterances is its own paragraph.
{" \"What do you have to eat?\" the boy asked. ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"A pot of yellow rice with fish. Do you want some?\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"No. I will eat at home. Do you want me to make the fire?\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"No. I will make it later on. Or I may eat the rice cold.\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"May I take the cast net?\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"Of course.\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" There was no cast net and the boy remembered when they had", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"sold it. But they went through this fiction every day. There was no", PCONT, PModel(),
false, false},
{"pot of yellow rice and fish and the boy knew this too. "
" ",
PCONT, PModel(), false, false},
{" \"Eighty-five is a lucky number,\" the old man said. \"How", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
// A word hyphenated at the end of a line ("thou-") continues on a PCONT row.
{"would you like to see me bring one in that dressed out over a "
"thou-",
PCONT, PModel(), false, false},
{"sand pounds? "
" ",
PCONT, PModel(), false, false},
{" \"I'll get the cast net and go for sardines. Will you sit in the "
"sun",
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
{"in the doorway?\" "
" ",
PCONT, PModel(), false, false},
{" \"Yes. I have yesterday's paper and I will read the baseball.\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" The boy did not know whether yesterday's paper was a fiction", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"too. But the old man brought it out from under the bed. ", PCONT, PModel(),
false, false},
{" \"Pedrico gave it to me at the bodega,\" he explained. "
" ",
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"I'll be back when I have the sardines. I'll keep yours and mine", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"together on ice and we can share them in the morning. When I", PCONT, PModel(),
false, false},
{"come back you can tell me about the baseball.\" ", PCONT, PModel(),
false, false},
{" \"The Yankees cannot lose.\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"But I fear the Indians of Cleveland.\" ", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{" \"Have faith in the Yankees my son. Think of the great Di-", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"Maggio.\" ", PCONT, PModel(),
false, false},
{" \"I fear both the Tigers of Detroit and the Indians of Cleve-", PSTART,
PModel(kLeft, 0, 50, 0, 0), false, false},
{"land.\" ", PCONT, PModel(),
false, false}};
// Runs TestParagraphDetection over the kOldManAndSea fixture, a page of prose
// followed by many one-line dialogue paragraphs. Judging by the test name, the
// concern is that those short lines are not mistaken for block quotes; the
// actual check lives in TestParagraphDetection, outside this block.
TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
  const auto num_fixture_rows = countof(kOldManAndSea);
  TestParagraphDetection(kOldManAndSea, num_fixture_rows);
}
// Fixture: one column of a book index. Each row holds, in order: the text of
// one line, the expected label for that line (PSTART or PCONT), a PModel, and
// two boolean flags that are false for every row in this table.
//
// Every index entry is expected to start its own paragraph (PSTART) with
// PModel(kLeft, 0, 0, 30, 0). Rows that hold only overflow page numbers
// (" 138 ", " 145 ", " 85 ") are labelled PCONT with a default PModel().
// NOTE(review): the fourth PModel argument (30) is presumably the body indent,
// i.e. a hanging-indent layout -- confirm against the PModel definition.
//
// Some entries contain non-ASCII punctuation (an em dash in "On—shore", a
// typographic apostrophe in "O’Regan" and "Girls’"). These characters are
// part of the fixture text and must be kept exactly as written.
const TextAndModel kNewZealandIndex[] = {
{"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{" 138 ", PCONT, PModel(), false, false},
{"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{" 145 ", PCONT, PModel(), false, false},
{"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{" 85 ", PCONT, PModel(), false, false},
{"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
{"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}};
// Runs TestParagraphDetection over the kNewZealandIndex fixture, a column of
// book-index entries in which wrapped page numbers sit on their own rows. The
// check itself lives in TestParagraphDetection, outside this block.
TEST(ParagraphsTest, IndexPageTest) {
  const auto num_fixture_rows = countof(kNewZealandIndex);
  TestParagraphDetection(kNewZealandIndex, num_fixture_rows);
}
// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.
} // namespace tesseract
|