File: etcbc4.cpp

package info (click to toggle)
bibledit-cloud 5.1.036-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 250,636 kB
  • sloc: xml: 915,934; ansic: 261,349; cpp: 92,628; javascript: 32,542; sh: 4,915; makefile: 586; php: 69
file content (227 lines) | stat: -rw-r--r-- 8,714 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
 Copyright (©) 2003-2025 Teus Benschop.
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */


#include <sources/etcbc4.h>
#include <database/logs.h>
#include <database/etcbc4.h>
#include <filter/string.h>
#include <filter/url.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#pragma GCC diagnostic ignored "-Wsuggest-override"
#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
#ifndef HAVE_PUGIXML
#include <pugixml/pugixml.hpp>
#endif
#ifdef HAVE_PUGIXML
#include <pugixml.hpp>
#endif
#pragma GCC diagnostic pop


// Downloads the raw Hebrew morphology data from the online ETCBC4 database
// and stores the response for each verse through database::etcbc4::store.
// The chapters and verses of a book are not known beforehand:
// The function probes verse after verse and chapter after chapter
// until the website answers that the requested passage does not exist.
// A verse for which database::etcbc4::raw already returns data is skipped,
// so such a verse is not requested from the website again.
void sources_etcbc4_download ()
{
  Database_Logs::log ("Start to download the raw Hebrew morphology data from the ETCBC4 database");
  database::etcbc4::create ();
  
  // The book names for downloading data.
  // The position of a name in this list, counted from one,
  // is the book number under which the data gets stored.
  const std::vector <std::string> books = {
    "Genesis",
    "Exodus",
    "Leviticus",
    "Numeri",
    "Deuteronomium",
    "Josua",
    "Judices",
    "Ruth",
    "Samuel_I",
    "Samuel_II",
    "Reges_I",
    "Reges_II",
    "Chronica_I",
    "Chronica_II",
    "Esra",
    "Nehemia",
    "Esther",
    "Iob",
    "Psalmi",
    "Proverbia",
    "Ecclesiastes",
    "Canticum",
    "Jesaia",
    "Jeremia",
    "Threni",
    "Ezechiel",
    "Daniel",
    "Hosea",
    "Joel",
    "Amos",
    "Obadia",
    "Jona",
    "Micha",
    "Nahum",
    "Habakuk",
    "Zephania",
    "Haggai",
    "Sacharia",
    "Maleachi"
  };

  int book = 0;
  for (const std::string& bookname : books) {

    // Book numbers start at one.
    book++;

    // Probe at most 150 chapters per book, and at most 199 verses per chapter.
    // The chapter loop stops right away once the book is known to be complete.
    bool book_done = false;
    for (int chapter = 1; !book_done && chapter <= 150; chapter++) {

      for (int verse = 1; verse < 200; verse++) {

        // Skip a verse that has been stored already.
        const std::string data = database::etcbc4::raw (book, chapter, verse);
        if (!data.empty ()) continue;
        
        std::string url = "https://shebanq.ancient-data.org/hebrew/verse?version=4b&book=" + bookname + "&chapter=" + std::to_string (chapter) + "&verse=" + std::to_string (verse);

        // On a download error: Log it, and carry on with the next verse.
        // NOTE(review): This path does not wait before the next request - confirm this is intended.
        std::string error;
        std::string response = filter_url_http_get (url, error, false);
        if (!error.empty ()) {
          Database_Logs::log (error);
          continue;
        }

        // The website reports that the passage does not exist.
        // If this is the first verse, the whole chapter is absent, so the book is complete.
        // Else only the end of the current chapter has been reached.
        if (response.find ("does not exist") != std::string::npos) {
          if (verse == 1) book_done = true;
          break;
        }

        Database_Logs::log (bookname + " " + std::to_string (chapter) + "." + std::to_string (verse));
        database::etcbc4::store (book, chapter, verse, response);
        // Wait a second: Be polite: Do not overload the website.
        std::this_thread::sleep_for (std::chrono::seconds (1));
      }
    }
  }

  Database_Logs::log ("Finished downloading from the ETCBC4 database");
}


// Cleans a value taken from the raw ETCBC4 data.
// It removes every "/", "]", "[" and "=" from the item,
// passes the result through filter::strings::trim,
// and returns the outcome.
std::string sources_etcbc4_clean (std::string item)
{
  // Each marker is a different single character,
  // so stripping one marker never affects another one.
  for (const char* marker : {"/", "]", "[", "="}) {
    item = filter::strings::replace (marker, "", item);
  }
  return filter::strings::trim (item);
}


// Parses the raw html data as downloaded from the ETCBC4 database,
// and stores the grammatical information found for each <table> element.
// The parser is supposed to be run only by the developers.
void sources_etcbc4_parse ()
{
  // The relevant grammatical information to be extracted
  // for one word or word fragment.
  struct Morphology {
    std::string word;
    std::string vocalized_lexeme;
    std::string consonantal_lexeme;
    std::string gloss;
    std::string pos;
    std::string subpos;
    std::string gender;
    std::string number;
    std::string person;
    std::string state;
    std::string tense;
    std::string stem;
    std::string phrase_function;
    std::string phrase_type;
    std::string phrase_relation;
    std::string phrase_a_relation;
    std::string clause_text_type;
    std::string clause_type;
    std::string clause_relation;
  };

  Database_Logs::log ("Parsing data from the ETCBC4 database");
  database::etcbc4::create ();

  const std::vector <int> book_ids = database::etcbc4::books ();
  for (int book : book_ids) {
    const std::vector <int> chapter_ids = database::etcbc4::chapters (book);
    for (int chapter : chapter_ids) {
      Database_Logs::log ("Parsing book " + std::to_string (book) + " chapter " + std::to_string (chapter));
      const std::vector <int> verse_ids = database::etcbc4::verses (book, chapter);
      for (int verse : verse_ids) {

        // Fetch the raw data for the verse: Nothing to parse if there is none.
        std::string html = database::etcbc4::raw (book, chapter, verse);
        if (html.empty ()) continue;

        // Drop the non-breaking space entities before handing the data to the XML parser.
        html = filter::strings::replace (filter::strings::unicode_non_breaking_space_entity (), "", html);
        pugi::xml_document document;
        document.load_string (html.c_str());

        // Every top-level <table> element describes one word or word fragment.
        for (pugi::xml_node table : document.children()) {

          Morphology info;

          // Walk the <tr> rows, their <td> cells, and the <span> elements within each cell.
          // Each <span> element carries one grammatical tag.
          for (pugi::xml_node row : table.children ()) {
            for (pugi::xml_node cell : row.children ()) {
              for (pugi::xml_node span : cell.children ()) {

                // The cleaned text contained in this <span>.
                const std::string value = sources_etcbc4_clean (span.first_child ().text ().get ());

                // The class of the <span> indicates which grammatical tag the text belongs to.
                const std::string css_class = span.attribute ("class").value ();
                const auto tagged = [&css_class] (const char* fragment) {
                  return css_class.find (fragment) != std::string::npos;
                };

                // The checks are independent of each other:
                // A class matching several of them fills several fields.
                if (css_class == "ht") info.word = value;
                if (tagged ("hl_hlv")) info.vocalized_lexeme = value;
                if (tagged ("hl_hlc")) info.consonantal_lexeme = value;
                if (css_class == "gl") info.gloss = value;
                if (tagged ("_pos")) info.pos = value;
                if (tagged ("_subpos")) info.subpos = value;
                if (tagged ("_gender")) info.gender = value;
                if (tagged ("_gnumber")) info.number = value;
                if (tagged ("_person")) info.person = value;
                if (tagged ("_state")) info.state = value;
                if (tagged ("_tense")) info.tense = value;
                if (tagged ("_stem")) info.stem = value;
                if (tagged ("ph_fun")) info.phrase_function = value;
                if (tagged ("ph_typ")) info.phrase_type = value;
                if (tagged ("ph_rela")) info.phrase_relation = value;
                if (tagged ("ph_arela")) info.phrase_a_relation = value;
                if (tagged ("cl_txt")) info.clause_text_type = value;
                if (tagged ("cl_typ")) info.clause_type = value;
                if (tagged ("cl_rela")) info.clause_relation = value;
              }
            }
          }

          // This <table> element is complete: Store what was gathered from it.
          database::etcbc4::store (book, chapter, verse,
                                   info.word, info.vocalized_lexeme, info.consonantal_lexeme,
                                   info.gloss, info.pos, info.subpos,
                                   info.gender, info.number, info.person,
                                   info.state, info.tense, info.stem,
                                   info.phrase_function, info.phrase_type, info.phrase_relation,
                                   info.phrase_a_relation, info.clause_text_type,
                                   info.clause_type, info.clause_relation);
        }
      }
    }
  }
  
  Database_Logs::log ("Finished parsing data from the ETCBC4 database");
}