File: dsl_details.hh

package info (click to toggle)
goldendict-webengine 23.02.05-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 19,148 kB
  • sloc: cpp: 58,537; javascript: 9,942; ansic: 9,242; xml: 41; makefile: 15; sh: 9
file content (218 lines) | stat: -rw-r--r-- 6,501 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#ifndef __DSL_DETAILS_HH_INCLUDED__
#define __DSL_DETAILS_HH_INCLUDED__

#include <string>
#include <list>
#include <vector>
#include <zlib.h>
#include "dictionary.hh"
#include "iconv.hh"
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include <QtCore5Compat/QTextCodec>
#else
#include <QTextCodec>
#endif
#include <QByteArray>
#include "utf8.hh"

// Implementation details for Dsl, not part of its interface
namespace Dsl {
namespace Details {

using std::string;
using gd::wstring;
using gd::wchar;
using std::list;
using std::vector;
using Utf8::Encoding;
using Utf8::LineFeed;



/// One entry associating a numeric language id with a language code.
/// NOTE(review): presumably the ids are the numeric language identifiers used
/// by DSL dictionaries (cf. findCodeForDslId) -- confirm against the table in
/// the implementation file.
struct DSLLangCode
{
  int code_id; // Numeric language id this entry describes
  char code[ 3 ]; // ISO 639-1 code; ISO 639-1 codes are two letters, so the
                  // third byte presumably holds a terminating NUL -- confirm
};

/// Returns the language code associated with the numeric language id @p id.
/// NOTE(review): presumably this is a lookup over DSLLangCode entries yielding
/// the ISO 639-1 code; what is returned for an unknown id is not visible in
/// this header -- confirm in the definition.
string findCodeForDslId( int id );

/// Tests whether @p str begins with an '@' sign.
/// NOTE(review): whether leading whitespace is skipped before the check is
/// decided by the implementation, which is not visible here -- confirm.
bool isAtSignFirst( wstring const & str );

/// Parses the DSL language, representing it in its structural DOM form.
/// The parse runs in the constructor; the resulting tree is available through
/// the 'root' member afterwards.
struct ArticleDom
{
  /// A single DOM node. A node is either a tag or a leaf piece of text; its
  /// child nodes are the elements of the list it inherits from.
  struct Node: public list< Node >
  {
    bool isTag; // true if it is a tag with subnodes, false if it's a leaf text
                // data.
    // Those are only used if isTag is true
    wstring tagName;
    wstring tagAttrs;
    wstring text; // This is only used if isTag is false

    // Empty tag types used solely to select between the two constructors.
    class Text {};
    class Tag {};

    /// Creates a tag node with the given tag name and attribute string.
    Node( Tag, wstring const & name, wstring const & attrs ): isTag( true ),
      tagName( name ), tagAttrs( attrs )
    {}

    /// Creates a leaf text node holding a copy of text_.
    Node( Text, wstring const & text_ ): isTag( false ), text( text_ )
    {}

    /// Concatenates all children text nodes recursively to form all text
    /// the node contains stripped of any markup.
    /// NOTE(review): stripTrsTag presumably also drops the contents of
    /// translation-exclusion tags when true -- confirm in the definition.
    wstring renderAsText( bool stripTrsTag = false ) const;
  };

  /// Does the parse at construction. Refer to the 'root' member variable
  /// afterwards.
  /// The first argument is the text to parse; dictName and headword_ are kept
  /// for diagnostic purposes only (see dictionaryName / headword below).
  explicit ArticleDom( wstring const &, string const & dictName = string(),
              wstring const & headword_ = wstring() );

  /// Root of DOM's tree
  Node root;

private:

  /// Handles an opening tag with the given name and attributes; 'stack' is the
  /// chain of currently open nodes.
  void openTag( wstring const & name, wstring const & attr, list< Node * > & stack );

  /// Handles a closing tag with the given name. 'warn' defaults to true.
  /// NOTE(review): presumably 'warn' controls whether a mismatched close is
  /// reported -- confirm in the definition.
  void closeTag( wstring const & name, list< Node * > & stack,
                 bool warn = true );

  bool atSignFirstInLine();

  // Parser cursor state. The members below are given default initializers so
  // that none of them is ever left with an indeterminate value, regardless of
  // which of them a constructor's initializer list happens to set.
  wchar const * stringPos = nullptr;
  wchar const * lineStartPos = nullptr;

  // Private exception type used internally by the parser.
  // NOTE(review): presumably thrown by nextChar() at end of text -- confirm.
  class eot: std::exception {};

  wchar ch = 0;
  bool escaped = false;
  unsigned transcriptionCount = 0; // >0 = inside a [t] tag
  unsigned mediaCount = 0; // >0 = inside a [s] tag

  void nextChar() ;

  /// Information for diagnostic purposes
  string dictionaryName;
  wstring headword;
};

/// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects
/// the encoding, and reads all headers by itself.
///
/// Instances are not copyable: the object holds an open gzFile handle and a
/// raw pointer that accompanies its own read buffer, neither of which can be
/// meaningfully duplicated member-wise.
class DslScanner
{
  gzFile f;
  Encoding encoding;
  QTextCodec* codec;
  wstring dictionaryName;
  wstring langFrom, langTo;
  wstring soundDictionary;
  char readBuffer[ 65536 ];
  char * readBufferPtr;
  LineFeed lineFeed;
  size_t readBufferLeft;
  unsigned linesRead;

public:

  DEF_EX( Ex, "Dsl scanner exception", Dictionary::Ex )
  DEF_EX_STR( exCantOpen, "Can't open .dsl file", Ex )
  DEF_EX( exCantReadDslFile, "Can't read .dsl file", Ex )
  DEF_EX_STR( exMalformedDslFile, "The .dsl file is malformed:", Ex )
  DEF_EX( exUnknownCodePage, "The .dsl file specified an unknown code page", Ex )
  DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really

  /// Opens the file named fileName for scanning.
  explicit DslScanner( string const & fileName ) ;
  ~DslScanner() noexcept;

  // A member-wise copy would share the gzFile handle between two objects and
  // leave the copy's readBufferPtr (presumably a cursor into readBuffer)
  // referring to the other object's buffer, so copying is disabled outright.
  DslScanner( DslScanner const & ) = delete;
  DslScanner & operator=( DslScanner const & ) = delete;

  /// Returns the detected encoding of this file.
  Encoding getEncoding() const
  { return encoding; }

  /// Returns the dictionary's name, as was read from file's headers.
  wstring const & getDictionaryName() const
  { return dictionaryName; }

  /// Returns the dictionary's source language, as was read from file's headers.
  wstring const & getLangFrom() const
  { return langFrom; }

  /// Returns the dictionary's target language, as was read from file's headers.
  wstring const & getLangTo() const
  { return langTo; }

  /// Returns the preferred external dictionary with sounds, as was read from file's headers.
  wstring const & getSoundDictionaryName() const
  { return soundDictionary; }

  /// Reads next line from the file. Returns true if reading succeeded --
  /// the string gets stored in the one passed, along with its physical
  /// file offset in the file (the uncompressed one if the file is compressed).
  /// If end of file is reached, false is returned.
  /// Reading begins from the first line after the headers (ones which start
  /// with #).
  bool readNextLine( wstring &, size_t & offset, bool only_head_word = false ) ;

  /// Similar to readNextLine, but strips all DSL comments {{...}}.
  bool readNextLineWithoutComments( wstring &, size_t & offset, bool only_headword = false ) ;

  /// Returns the number of lines read so far from the file.
  unsigned getLinesRead() const
  { return linesRead; }

  /// Converts the given number of characters to the number of bytes they
  /// would occupy in the file, knowing its encoding. It's possible to know
  /// that because no multibyte encodings are supported in .dsls.
  inline size_t distanceToBytes( size_t ) const;
};

/// This function either removes parts of string enclosed in braces, or leaves
/// them intact. The braces themselves are removed always, though.
/// @param str   The string to process; it is modified in place.
/// @param strip Selects between the two behaviours above (presumably true
///              means the braced parts are removed -- confirm in the
///              definition).
void processUnsortedParts( wstring & str, bool strip );

/// Expands optional parts of a headword (ones marked with parentheses),
/// producing all possible combinations where they are present or absent.
/// @param str    The headword to expand; passed by non-const reference, so it
///               may be modified by the call.
/// @param result List that receives the produced combinations.
/// @param x              Defaults to 0. NOTE(review): presumably the position
///                       in str at which scanning starts -- confirm.
/// @param inside_recurse Defaults to false. NOTE(review): presumably set only
///                       by the function's own recursive calls; external
///                       callers should leave the default -- confirm.
void expandOptionalParts( wstring & str, list< wstring > * result,
                          size_t x = 0, bool inside_recurse = false );

/// Expands all unescaped tildes, inserting tildeReplacement text instead of
/// them.
/// @param str              The string to process; it is modified in place.
/// @param tildeReplacement The text substituted for each unescaped tilde.
void expandTildes( wstring & str, wstring const & tildeReplacement );

/// Unescapes any escaped chars. Be sure to handle all their special meanings
/// before unescaping them, since the distinction between escaped and
/// unescaped characters is no longer available afterwards.
/// @param str The string to process; it is modified in place.
void unescapeDsl( wstring & str );

/// Normalizes the headword. Currently turns any sequences of consecutive spaces
/// into a single space.
/// The headword is passed by non-const reference and is modified in place.
void normalizeHeadword( wstring & );

/// Strips DSL {{...}} comments from the given string, which is modified in
/// place.
/// NOTE(review): the second (unnamed) bool reference is both readable and
/// writable by the function; presumably it carries "currently inside a
/// comment" state from one call to the next so that comments spanning several
/// lines are handled -- confirm in the definition.
void stripComments( wstring &, bool & );

/// Returns how many bytes a run of x characters takes up in the file, given
/// the file's encoding: two bytes per character for UTF-16 (either byte
/// order), one byte per character for every other encoding.
inline size_t DslScanner::distanceToBytes( size_t x ) const
{
  bool const twoBytesPerChar =
    ( encoding == Utf8::Utf16LE ) || ( encoding == Utf8::Utf16BE );

  return twoBytesPerChar ? x * 2 : x;
}

/// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
/// getLangTo()) to its proper language id.
/// @param name Language name as it appears in the dictionary's header.
/// @return The language id as a quint32. NOTE(review): the value returned for
///         an unrecognised name is not visible in this header -- confirm in
///         the definition.
quint32 dslLanguageToId( wstring const & name );

}
}

#endif