File: dawg_test.cc

package info (click to toggle)
tesseract 5.5.0-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 43,508 kB
  • sloc: cpp: 154,570; makefile: 1,519; java: 1,143; ansic: 852; sh: 763; python: 51
file content (110 lines) | stat: -rw-r--r-- 4,193 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include_gunit.h"

#include "ratngs.h"
#include "trie.h"
#include "unicharset.h"

#include <sys/stat.h>
#include <cstdlib> // for system
#include <fstream> // for ifstream
#include <set>
#include <string>
#include <vector>

// Names of the command-line tools that the dawg round-trip test runs.
// They are only defined here when SW_TESTING is not defined.
// NOTE(review): presumably SW_TESTING builds supply these macros elsewhere —
// confirm against the build configuration.
#ifndef SW_TESTING
#  define wordlist2dawg_prog "wordlist2dawg"
#  define dawg2wordlist_prog "dawg2wordlist"
#endif

namespace tesseract {

// Test some basic functionality dealing with Dawgs (compressed dictionaries,
// aka Directed Acyclic Word Graphs).
class DawgTest : public testing::Test {
protected:
  // Runs before each test: installs the environment's default locale as the
  // global locale and calls file::MakeTmpdir().
  // NOTE(review): presumably MakeTmpdir creates the directory named by
  // FLAGS_test_tmpdir — confirm in the test support header.
  void SetUp() override {
    std::locale::global(std::locale(""));
    file::MakeTmpdir();
  }

  // Reads `filename` line by line and inserts every line, with trailing
  // carriage returns removed, into `*words`.
  //
  // A file that cannot be opened is reported as a non-fatal test failure
  // (instead of being silently ignored) and leaves `*words` untouched.
  void LoadWordlist(const std::string &filename, std::set<std::string> *words) const {
    std::ifstream file(filename);
    // Non-fatal so that the caller's remaining checks still run and report.
    EXPECT_EQ(file.is_open(), true) << "Cannot open wordlist file: " << filename;
    if (!file.is_open()) {
      return;
    }
    std::string line;
    while (std::getline(file, line)) {
      // getline already drops the '\n'; only the '\r' of CRLF line endings
      // can be left at the end of the line.
      while (!line.empty() && line.back() == '\r') {
        line.pop_back();
      }
      words->insert(line);
    }
    // The stream is closed by its destructor.
  }

  // Returns `name` joined onto TESSBIN_DIR.
  std::string TessBinaryPath(const std::string &name) const {
    return file::JoinPath(TESSBIN_DIR, name);
  }

  // Returns `name` joined onto FLAGS_test_tmpdir.
  std::string OutputNameToPath(const std::string &name) const {
    return file::JoinPath(FLAGS_test_tmpdir, name);
  }

  // Runs `program` (resolved through TessBinaryPath) with three arguments via
  // system() and returns whatever system() returns.
  //
  // The command line is built by joining the pieces with single spaces and
  // nothing is quoted, so paths containing whitespace are split into several
  // arguments by the command processor.
  int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,
                 const std::string &arg3) const {
    std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
    return system(cmdline.c_str());
  }

  // Test that we are able to convert a wordlist file (one "word" per line) to
  // a dawg (a compressed format) and then extract the original wordlist back
  // out using the tools "wordlist2dawg" and "dawg2wordlist."
  //
  // Both input files are looked up under TESTING_DIR; the generated dawg and
  // the extracted wordlist are written to paths from OutputNameToPath.
  void TestDawgRoundTrip(const std::string &unicharset_filename,
                         const std::string &wordlist_filename) const {
    std::set<std::string> orig_words, roundtrip_words;
    std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
    std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
    std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
    std::string output_wordlist = OutputNameToPath(wordlist_filename);
    LoadWordlist(orig_wordlist, &orig_words);
    EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
    EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);
    LoadWordlist(output_wordlist, &roundtrip_words);
    // Sets are compared, so line order and duplicate lines are ignored.
    EXPECT_EQ(orig_words, roundtrip_words);
  }
};

// Runs the fixture's TestDawgRoundTrip helper on the English unicharset and
// wordlist test files.
TEST_F(DawgTest, TestDawgConversion) {
  const std::string unicharset_name = "eng.unicharset";
  const std::string wordlist_name = "eng.wordlist.clean.freq";
  TestDawgRoundTrip(unicharset_name, wordlist_name);
}

// Checks prefix and whole-word lookups on a Trie that holds the single word
// " '" (a space followed by an apostrophe).
TEST_F(DawgTest, TestMatching) {
  UNICHARSET unicharset;
  const std::string unicharset_path = file::JoinPath(TESTING_DIR, "eng.unicharset");
  // Every check below depends on the unicharset, so report a load failure
  // explicitly and stop instead of producing misleading follow-up failures.
  const bool unicharset_loaded = unicharset.load_from_file(unicharset_path.c_str());
  EXPECT_TRUE(unicharset_loaded) << "Failed to load unicharset: " << unicharset_path;
  if (!unicharset_loaded) {
    return;
  }

  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0);
  WERD_CHOICE space_apos(" '", unicharset);
  // The lookups below are only meaningful if the word was actually inserted.
  EXPECT_TRUE(trie.add_word_to_dawg(space_apos));

  WERD_CHOICE space(" ", unicharset);

  // " " is a proper prefix of the stored word: accepted when a partial match
  // is allowed ...
  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
  // ... but rejected when a complete word is required.
  EXPECT_FALSE(trie.word_in_dawg(space));
  EXPECT_FALSE(trie.prefix_in_dawg(space, true));

  // The full word is found whether or not a complete match is required.
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
  EXPECT_TRUE(trie.word_in_dawg(space_apos));
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
}

} // namespace tesseract