File: check_count_words.cpp

package info (click to toggle)
bibledit-gtk 4.6-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd, wheezy
  • size: 31,668 kB
  • ctags: 11,053
  • sloc: xml: 289,607; sql: 160,978; cpp: 86,450; sh: 3,316; makefile: 609; ansic: 398; perl: 143; python: 36
file content (171 lines) | stat: -rw-r--r-- 5,577 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
** Copyright (©) 2003-2012 Teus Benschop.
**  
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 3 of the License, or
** (at your option) any later version.
**  
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**  
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
**  
*/

#include "check_count_words.h"
#include "projectutils.h"
#include "settings.h"
#include "stylesheetutils.h"
#include "utilities.h"
#include "usfmtools.h"
#include "books.h"
#include "checks.h"

CheckCountWords::CheckCountWords(const ustring & project, const vector < unsigned int >&books, const ustring & extrachars, bool sortword, bool sortcount, unsigned int excludecount, bool gui)
/*
It counts the words in the project.
project: project to check.
books: books to check; if empty it checks them all.
extrachars: characters that are part of a word.
sortword: sort on word.
sortcount: sort on count. If sortword is set too, this sort is applied last.
excludecount: exclude words with a count of n and up; 0 disables the exclusion.
gui: show graphical progressbar.

Results are left in the members words / counts (parallel vectors), in
total_unique_count / total_count (before exclusion) and in
filtered_unique_count / filtered_count (after exclusion).
If the user cancels through the progress window, cancelled is set to true,
the constructor returns early, words / counts are not filled by this
constructor and all four totals are zero.
*/
{
  // Init variables. The totals are zeroed here, and not only where they are
  // computed, so that they hold defined values on the early-return cancel path.
  cancelled = false;
  total_unique_count = 0;
  total_count = 0;
  filtered_unique_count = 0;
  filtered_count = 0;
  // Get a list of the books to check. If no books were given, take them all.
  vector < unsigned int >mybooks(books.begin(), books.end());
  if (mybooks.empty())
    mybooks = project_get_books(project);
  // GUI. The window is released in the destructor.
  progresswindow = NULL;
  if (gui) {
    progresswindow = new ProgressWindow("Counting words", true);
    progresswindow->set_iterate(0, 1, mybooks.size());
  }
  // Deal with extra word-forming characters.
  // Each extra character gets its own placeholder: "BIBLEDITREPLACEMENT"
  // followed by as many "X" as the character's index. The characters are
  // swapped for the placeholders before parsing and swapped back afterwards.
  // NOTE(review): presumably get_non_alphanumericals() drops letters and
  // digits, so that a placeholder can never contain an extra character - confirm.
  ustring extrachars_cleaned = get_non_alphanumericals(extrachars);
  for (unsigned int i = 0; i < extrachars_cleaned.length(); i++) {
    extra_character.push_back(extrachars_cleaned.substr(i, 1));
    ustring s = "BIBLEDITREPLACEMENT";
    for (unsigned int i2 = 0; i2 < i; i2++)
      s.append("X");
    temporal_replacement.push_back(s);
  }
  // Check each book.
  for (unsigned int bk = 0; bk < mybooks.size(); bk++) {
    if (gui) {
      progresswindow->iterate();
      if (progresswindow->cancel) {
        // User abort: stop right away. Nothing has been stored in words / counts yet.
        cancelled = true;
        return;
      }
    }
    // Get text of the book and go through each line.
    vector < ustring > lines = project_retrieve_book(project, mybooks[bk]);
    for (unsigned int ln = 0; ln < lines.size(); ln++) {
      // Handle extra characters: put the placeholders in.
      for (unsigned int i = 0; i < extra_character.size(); i++) {
        replace_text(lines[ln], extra_character[i], temporal_replacement[i]);
      }
      // Categorize the line for cleaner results.
      CategorizeLine categorizeline(lines[ln]);
      // Count the words from all categories.
      ParseWords pwid(categorizeline.id);
      count(pwid.words);
      ParseWords pwintro(categorizeline.intro);
      count(pwintro.words);
      ParseWords pwhead(categorizeline.head);
      count(pwhead.words);
      ParseWords pwchap(categorizeline.chap);
      count(pwchap.words);
      ParseWords pwstudy(categorizeline.study);
      count(pwstudy.words);
      ParseWords pwnote(categorizeline.note);
      count(pwnote.words);
      ParseWords pwref(categorizeline.ref);
      count(pwref.words);
      ParseWords pwverse(categorizeline.verse);
      count(pwverse.words);
    }
  }
  // Store results: one entry in counts for every entry in words.
  if (gui) {
    progresswindow->set_fraction(0.2);
    progresswindow->set_text("Processing results");
  }
  words.assign(wordset.begin(), wordset.end());
  for (unsigned int i = 0; i < words.size(); i++) {
    counts.push_back(counter[words[i]]);
  }
  // Change temporal replacements back to the original characters.
  // This runs from the last placeholder down to the first one because every
  // placeholder is a prefix of the next one; handling the longest first keeps
  // a shorter placeholder from matching inside a longer one.
  // NOTE(review): a placeholder directly followed by a literal "X" in the text
  // looks the same as a later placeholder - confirm whether that matters.
  if (gui)
    progresswindow->set_fraction(0.4);
  for (unsigned int i = 0; i < words.size(); i++) {
    for (unsigned int i2 = extra_character.size(); i2-- > 0;) {
      replace_text(words[i], temporal_replacement[i2], extra_character[i2]);
    }
  }
  // Store total counts, that is, the counts before any exclusion.
  if (gui)
    progresswindow->set_fraction(0.6);
  total_unique_count = words.size();
  for (unsigned int i = 0; i < counts.size(); i++) {
    total_count += counts[i];
  }
  // Exclude counts above a certain limit, if given.
  if (gui)
    progresswindow->set_fraction(0.8);
  if (excludecount > 0) {
    // Keep only the words that occur fewer than excludecount times.
    vector < ustring > mywords;
    vector < unsigned int >mycounts;
    for (unsigned int i = 0; i < words.size(); i++) {
      if (counts[i] < excludecount) {
        mywords.push_back(words[i]);
        mycounts.push_back(counts[i]);
      }
    }
    words = mywords;
    counts = mycounts;
  }
  // Store filtered counts, that is, the counts after the exclusion.
  filtered_unique_count = words.size();
  for (unsigned int i = 0; i < counts.size(); i++) {
    filtered_count += counts[i];
  }
  // Sorting, if requested.
  // NOTE(review): presumably quick_sort() orders its first argument and
  // reorders the second one along with it - confirm against utilities.
  if (gui)
    progresswindow->set_fraction(1);
  if (sortword) {
    quick_sort(words, counts, 0, words.size());
  }
  if (sortcount) {
    quick_sort(counts, words, 0, counts.size());
  }
}

CheckCountWords::~CheckCountWords()
{
  // Release the progress window, if one was created. The pointer is NULL when
  // the object was constructed without a gui; deleting NULL does nothing.
  delete progresswindow;
}

void CheckCountWords::count(vector < ustring > &words)
/*
Adds the given words to the running tally.
words: the words to add; each one is recorded in wordset and its entry
in counter is increased by one.
*/
{
  for (vector < ustring >::const_iterator it = words.begin(); it != words.end(); ++it) {
    const ustring & word = *it;
    // Increase the number of occurrences seen for this word.
    ++counter[word];
    // Record the word in the collection of words encountered.
    wordset.insert(word);
  }
}