File: abbott-smith.cpp

package info (click to toggle)
bibledit-cloud 5.1.036-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 250,636 kB
  • sloc: xml: 915,934; ansic: 261,349; cpp: 92,628; javascript: 32,542; sh: 4,915; makefile: 586; php: 69
file content (136 lines) | stat: -rw-r--r-- 5,910 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
 Copyright (©) 2003-2025 Teus Benschop.
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */


#include <sources/abbott-smith.h>
#include <database/abbottsmith.h>
#include <filter/string.h>
#include <filter/passage.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wsuggest-override"
#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
#ifndef HAVE_PUGIXML
#include <pugixml/pugixml.hpp>
#endif
#ifdef HAVE_PUGIXML
#include <pugixml.hpp>
#endif
#ifdef HAVE_ICU
#include <unicode/ustdio.h>
#include <unicode/normlzr.h>
#include <unicode/utypes.h>
#include <unicode/unistr.h>
#include <unicode/translit.h>
#endif
#pragma GCC diagnostic pop


// Running total of the <entry> elements encountered while parsing the XML source.
// Incremented by sources_abbott_smith_parse_entry_element and reported by sources_abbott_smith_parse.
int entry_element_count = 0;


// Inspects one XML node and, if it is an <entry> element, stores its data in the
// Abbott-Smith database.
// Parameter node: the XML node to inspect. A node with any other name is ignored.
// Side effect: the file-level entry_element_count is incremented for every <entry> node,
// including entries that are subsequently skipped because they carry no lemma and no Strong's number.
void sources_abbott_smith_parse_entry_element (pugi::xml_node & node)
{
  const std::string entry = "entry";
  if (node.name() != entry)
    return;
  entry_element_count++;
  
  // Example <entry> element and its content:
  
  //<entry lemma="α" strong="G1">
  //  <form><orth>Α, α, ἄλφα</orth> (q.v.), <foreign xml:lang="grc">τό</foreign>, indecl., </form>
  //  <sense><gloss>alpha</gloss>, the first letter of the Greek alphabet. As a numeral, <foreign xml:lang="grc">αʹ</foreign> = 1, <foreign xml:lang="grc">α͵</foreign> = 1000. As a prefix, it appears to have at least two and perhaps three distinct senses:
  //    <sense n="1."><foreign xml:lang="grc">ἀ-</foreign> (before a vowel, <foreign xml:lang="grc">ἀν-</foreign>) <gloss>negative</gloss>, as in <foreign xml:lang="grc">ἄ-γνωστος</foreign>, <foreign xml:lang="grc">ἄ-δικος</foreign>.</sense>
  //    <sense n="2."><foreign xml:lang="grc">ἀ-</foreign>, <foreign xml:lang="grc">ἁ-</foreign><gloss>copulative</gloss>, indicating community and fellowship, as in <foreign xml:lang="grc">ἁ-πλοῦς</foreign>, <foreign xml:lang="grc">ἀ-κολουθέω</foreign>, <foreign xml:lang="grc">ἀ-δελφός</foreign>.</sense>
  //    <sense n="3.">An <gloss>intensive</gloss> force (LS, s. <foreign xml:lang="grc">α</foreign>), as in <foreign xml:lang="grc">ἀ-τενίζω</foreign> is sometimes assumed (but v. Boisacq, s.v.).</sense>
  //  </sense>
  //</entry>

  // Get the lemma and the Strong's number from the element's attributes.
  std::string lemma = filter::strings::trim (node.attribute ("lemma").value ());
#ifdef HAVE_ICU
  lemma = filter::strings::icu_string_normalize (lemma, true, true);
#endif
  std::string strong = filter::strings::trim (node.attribute ("strong").value ());

  // Only if both the lemma and the Strong's number are missing is there nothing to store.
  // An entry that has just one of the two is still stored.
  // This check runs before the children are serialized so skipped entries cost no extra work.
  if (lemma.empty () && strong.empty()) return;

  // Get the raw XML of the entry's contents: all child nodes, printed without extra formatting.
  std::stringstream ss;
  for (pugi::xml_node child : node.children()) child.print(ss, "", pugi::format_raw);
  std::string contents = ss.str ();

  // There's a few cases in the XML file that the lemma looks like this:
  // ἀκριβῶς|G199
  // It means that a Strong's number is added to the lemma.
  // Such a Strong's number should be parsed too, and put at its proper place in the database.
  // And the lemma should be stripped of its attached Strong's number.
  std::vector <std::string> strongs = filter::strings::explode (lemma, '|');
  if (strongs.size() >= 2) {
    // The first fragment is the real lemma, the remaining fragments are the extra Strong's numbers.
    lemma = strongs[0];
    strongs.erase (strongs.begin());
  } else {
    // No separator found: there are no extra Strong's numbers to store.
    strongs.clear();
  }

  // Store the original lemma, the casefolded lemma, and the Strong's number,
  // together with the entry's raw XML, into the database.
  const std::string lemma_case_folded = filter::strings::unicode_string_casefold (lemma);
  database::abboth_smith::store (lemma, lemma_case_folded, strong, contents);
  
  // If there's more Strong's numbers in the entry, store those too, but without any lemma.
  // Bind by const reference so that the strings are not copied on every iteration.
  for (const auto& strong2 : strongs) {
    database::abboth_smith::store (std::string(), std::string(), strong2, contents);
  }
}


void sources_abbott_smith_parse ()
{
  std::cout << "Starting" << std::endl;
  database::abboth_smith::create ();
    
  std::string file = "sources/abbott-smith/abbott-smith.tei_lemma.xml";
  
  pugi::xml_document document;
  document.load_file (file.c_str());
  pugi::xml_node TEI_node = document.first_child ();
  // Do a deep parsing.
  // The depth as used below was found out empirically in March 2021.
  // The number of <entry> elements was 6153 when counted in a text editor.
  // And this same number of elements was found when parsing as deep as is done below.
  for (pugi::xml_node node1 : TEI_node.children()) {
    sources_abbott_smith_parse_entry_element (node1);
    for (pugi::xml_node node2 : node1.children()) {
      sources_abbott_smith_parse_entry_element (node2);
      for (pugi::xml_node node3 : node2.children()) {
        sources_abbott_smith_parse_entry_element (node3);
        for (pugi::xml_node node4 : node3.children()) {
          sources_abbott_smith_parse_entry_element (node4);
          for (pugi::xml_node node5 : node4.children()) {
            sources_abbott_smith_parse_entry_element (node5);
          }
        }
      }
    }
  }
  std::cout << entry_element_count << " entry elements parsed" << std::endl;
  database::abboth_smith::optimize ();
  std::cout << "Completed" << std::endl;
}