File: parser.rb

package info (click to toggle)
ruby-simple-po-parser 1.1.5-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 224 kB
  • sloc: ruby: 646; makefile: 4
file content (376 lines) | stat: -rw-r--r-- 13,281 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# encoding: utf-8

module SimplePoParser
  # Fast parser directly using Ruby's powerful StringScanner (strscan)
  #
  # Important notes about StringScanner#scan:
  # * scan returns nil if there is no match. Using the regex * (zero or more) quantifier
  #  lets scan return an empty string where there is "no match", as the empty string qualifies
  #  as a match of the regex (zero times). We make use of this "trick".
  # * the start of line anchor ^ is unnecessary, as scan only matches at the current scan position.
  # * Ruby's regex is by default in single-line mode (. does not match a newline), therefore a scan
  #  such as /.*/ will only match until the next newline is hit (unless multi-line mode is explicitly enabled)
  class Parser
    require_relative 'error'
    require 'strscan'

    # Parses one PO message and returns the collected fields.
    #
    # The message is stripped of surrounding whitespace before scanning. A ParserError raised
    # while scanning is re-raised as a new ParserError whose message additionally contains the
    # partial result, and whose backtrace is reduced to frames matching lib/simple_po_parser.
    #
    # @param message [String] a single PO message
    # @return [Hash] parsed PO message information
    # @raise [ParserError] if the message could not be parsed
    def parse(message)
      @result = {}
      @scanner = StringScanner.new(message.strip)
      lines
      @result
    rescue ParserError => pe
      own_frames = pe.backtrace.select { |frame| frame =~ /lib\/simple_po_parser/ }
      report = "SimplePoParser::ParserError" + pe.message
      report += "\nParseing result before error: '#{@result}'"
      report += "\nSimplePoParser filtered backtrace: SimplePoParser::ParserError"
      raise ParserError, report, own_frames.join("\n\tfrom ")
    end

    private

    #########################################
    ###            branching              ###
    #########################################

    # Entry point for an arbitrary line of a PO message: either a comment or message content.
    #
    # A line starting with '#' is handed to #comment. Anything else is handed to #msgctxt,
    # because content is expected in msgctxt -> msgid -> msgid_plural -> msgstr order.
    #
    # @raise [ParserError] wrapping any PoSyntaxError raised further down; raising a
    #   ParserError here breaks out of the chain of PoSyntaxError rescues
    def lines
      @scanner.scan(/#/) ? comment : msgctxt
    rescue PoSyntaxError => pe
      raise ParserError, "Syntax error in lines\n" + pe.message, pe.backtrace
    end

    # Parses one comment line; called right after the leading '#' has been consumed.
    #
    # The character following '#' selects the comment type:
    # * ' '      translator comment
    # * '.'      extracted comment
    # * ':'      reference
    # * ','      flag
    # * '|'      previous msgctxt / msgid / msgid_plural
    # * newline  empty translator comment
    # * '~'      obsolete line; all remaining lines must then be obsolete as well
    #
    # After a non-obsolete comment, parsing continues with #lines.
    #
    # @raise [PoSyntaxError] on an unknown comment type, or when an obsolete line follows
    #   already collected previous comments
    def comment
      begin
        # Remember where the '#' sits. Rewinding by a fixed number of bytes after getch is wrong
        # when getch consumed a multibyte character or nothing at all (end of input).
        marker_pos = [@scanner.pos - 1, 0].max
        case @scanner.getch
        when ' '
          skip_whitespace
          add_result(:translator_comment, comment_text)
          lines
        when '.'
          skip_whitespace
          add_result(:extracted_comment, comment_text)
          lines
        when ':'
          skip_whitespace
          add_result(:reference, comment_text)
          lines
        when ','
          skip_whitespace
          add_result(:flag, comment_text)
          lines
        when '|'
          skip_whitespace
          previous_comments
          lines
        when "\n"
          add_result(:translator_comment, "") # empty comment line
          lines
        when '~'
          if @result[:previous_msgctxt] || @result[:previous_msgid] || @result[:previous_msgid_plural]
            raise PoSyntaxError, "Previous comment entries need to be marked obsolete too in obsolete message entries. But already got: #{@result}"
          end
          skip_whitespace
          add_result(:obsolete, comment_text)
          obsoletes
        else
          # show the offending line from its '#' onwards
          @scanner.pos = marker_pos
          raise PoSyntaxError, "Unknown comment type #{@scanner.peek(10).inspect}"
        end
      rescue PoSyntaxError => pe
        raise PoSyntaxError, "Syntax error in comment\n" + pe.message, pe.backtrace
      end
    end

    # Consumes an optional msgctxt entry and then continues with the mandatory msgid.
    #
    # When the text on the msgctxt line itself is empty, following quoted lines are collected
    # as a multiline value.
    #
    # @raise [PoSyntaxError] if the msgctxt entry or anything parsed after it is malformed
    def msgctxt
      if @scanner.scan(/msgctxt/)
        skip_whitespace
        context = message_line
        add_result(:msgctxt, context)
        message_multiline(:msgctxt) if context.empty?
      end
      msgid
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in msgctxt\n" + pe.message, pe.backtrace
    end

    # Consumes the mandatory msgid entry, then an optional msgid_plural, and finally hands over
    # to #msgstr_plural (when a msgid_plural was present) or #msgstr (otherwise).
    #
    # When the text on the msgid line itself is empty, following quoted lines are collected
    # as a multiline value.
    #
    # @raise [PoSyntaxError] if the line does not start with msgid or anything parsed from
    #   here on is malformed
    def msgid
      unless @scanner.scan(/msgid/)
        raise PoSyntaxError, "Message without msgid is not allowed.The Line started unexpectedly with #{@scanner.peek(10).inspect}."
      end

      skip_whitespace
      id_text = message_line
      add_result(:msgid, id_text)
      message_multiline(:msgid) if id_text.empty?
      msgid_plural ? msgstr_plural : msgstr
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in msgid\n" + pe.message, pe.backtrace
    end

    # Consumes an optional msgid_plural entry.
    #
    # When the text on the msgid_plural line itself is empty, following quoted lines are
    # collected as a multiline value.
    #
    # @return [Boolean] true if a msgid_plural entry was present, false otherwise
    # @raise [PoSyntaxError] if the msgid_plural entry is malformed
    def msgid_plural
      begin
        if @scanner.scan(/msgid_plural/)
          skip_whitespace
          text = message_line
          add_result(:msgid_plural, text)
          message_multiline(:msgid_plural) if text.empty?
          true
        else
          false
        end
      rescue PoSyntaxError => pe
        # label the error with this method's own name so the error trail points to the right entry
        raise PoSyntaxError, "Syntax error in msgid_plural\n" + pe.message, pe.backtrace
      end
    end

    # Consumes the msgstr entry of a singular message; it is mandatory and must be the last
    # entry of the message.
    #
    # When the text on the msgstr line itself is empty, following quoted lines are collected
    # as a multiline value.
    #
    # @raise [PoSyntaxError] if msgstr is missing, malformed, or followed by further content
    def msgstr
      unless @scanner.scan(/msgstr/)
        raise PoSyntaxError, "Singular message without msgstr is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
      end

      skip_whitespace
      translation = message_line
      add_result(:msgstr, translation)
      message_multiline(:msgstr) if translation.empty?
      skip_whitespace
      return if @scanner.eos?

      raise PoSyntaxError, "Unexpected content after expected message end #{@scanner.peek(10).inspect}"
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in msgstr\n" + pe.message, pe.backtrace
    end

    # Consumes the msgstr[N] entries of a plural message (a message that has a msgid_plural).
    #
    # Entries have the form msgstr[N], where N counts up from zero and stands for the plural
    # form as specified in the header's "Plural-Forms" entry. Most languages, like English,
    # only have two forms (singular and plural), but there are languages with more.
    # Indices may have any number of digits, so msgstr[10] and above are accepted too.
    # The parsed text is stored under the literal key string, e.g. "msgstr[0]".
    #
    # @param num [Integer] the index the next msgstr[N] entry is required to have
    # @raise [PoSyntaxError] if msgstr[0] is missing, indices are out of order, an entry is
    #   malformed, or other content follows the last entry
    def msgstr_plural(num = 0)
      begin
        msgstr_key = @scanner.scan(/msgstr\[\d+\]/) # matches 'msgstr[0]', 'msgstr[1]', ..., 'msgstr[10]', ...
        if msgstr_key
          # msgstr plurals must come in 0-based index in order
          msgstr_num = msgstr_key[/\d+/].to_i
          raise PoSyntaxError, "Bad 'msgstr[index]' index." if msgstr_num != num
          skip_whitespace
          text = message_line
          add_result(msgstr_key, text)
          message_multiline(msgstr_key) if text.empty?
          msgstr_plural(num+1)
        elsif num == 0 # and msgstr_key was false
          raise PoSyntaxError, "Plural message without msgstr[0] is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
        else
          raise PoSyntaxError, "End of message was expected, but line started unexpectedly with #{@scanner.peek(10).inspect}" unless @scanner.eos?
        end
      rescue PoSyntaxError => pe
        raise PoSyntaxError, "Syntax error in msgstr_plural\n" + pe.message, pe.backtrace
      end
    end

    # Parses the content of a previous comment ("#|" line), which carries the former
    # msgctxt, msgid or msgid_plural of a message. Called after "#|" and following
    # blanks were consumed.
    #
    # The value is stored under :previous_msgctxt, :previous_msgid or :previous_msgid_plural.
    # When the text on the line itself is empty, following "#|" quoted lines are collected
    # as a multiline value.
    #
    # @raise [PoSyntaxError] if the line does not continue with msgctxt, msgid or
    #   msgid_plural, or its text is malformed
    def previous_comments
      unless @scanner.scan(/msg/)
        raise PoSyntaxError, "Previous comments must start with '#| msg'. #{@scanner.peek(10).inspect} unknown."
      end

      # "msg" is consumed; the remainder of the keyword decides the result key
      key =
        if @scanner.scan(/id/)
          @scanner.scan(/_plural/) ? :previous_msgid_plural : :previous_msgid
        elsif @scanner.scan(/ctxt/)
          :previous_msgctxt
        else
          raise PoSyntaxError, "Previous comment type #{("msg" + @scanner.peek(10)).inspect} unknown."
        end

      skip_whitespace
      previous_text = message_line
      add_result(key, previous_text)
      previous_multiline(key) if previous_text.empty?
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in previous_comments\n" + pe.message, pe.backtrace
    end

    # Collects the continuation lines of a multiline previous comment under the given key.
    #
    # A continuation line is "#|", optional blanks and then a double quote. The quote is part
    # of the match so that no other kind of line is mistaken for a continuation. Recurses
    # until the next line is not a continuation line.
    #
    # @param key [Symbol] result key the continuation lines are added to
    # @raise [PoSyntaxError] if a continuation line is malformed
    def previous_multiline(key)
      return unless @scanner.scan(/#\|\p{Blank}*"/)

      @scanner.pos -= 1 # step back onto the quote so that message_line can consume it
      add_result(key, message_line)
      previous_multiline(key)
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in previous_multiline\n" + pe.message, pe.backtrace
    end

    # Collects the continuation lines of a multiline message entry under the given key.
    #
    # A multiline entry has empty text on its first line; every following line that starts
    # with a double quote (after optional blanks) belongs to it. Recurses until the next
    # line does not start with a double quote.
    #
    # @param key [Symbol, String] result key the continuation lines are added to
    # @raise [PoSyntaxError] if a continuation line is malformed
    def message_multiline(key)
      skip_whitespace
      return unless @scanner.check(/"/)

      add_result(key, message_line)
      message_multiline(key)
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in message_multiline with key '#{key}'\n" + pe.message, pe.backtrace
    end

    # Consumes one quoted message text up to and including the end of its line and returns
    # the text between the double quotes.
    #
    # After the closing quote only blanks may follow until the end of the line. If the content
    # does not start with a double quote, the scanner is left where it was on entry.
    #
    # @return [String] the text between the double quotes
    # @raise [PoSyntaxError] if the text is not opened or closed by a double quote, or if
    #   something other than blanks follows the closing quote
    def message_line
      begin
        # Remember the entry position: on failure getch may have consumed a multibyte
        # character, or nothing at all at end of input, so stepping back one byte is unsafe.
        start_pos = @scanner.pos
        if @scanner.getch == '"'
          text = message_text
          unless @scanner.getch == '"'
            err_msg = "The message text '#{text}' must be finished with the double quote character '\"'."
            raise PoSyntaxError, err_msg
          end
          skip_whitespace
          unless end_of_line
            err_msg = "There should be only whitespace until the end of line"
            err_msg += " after the double quote character of a message text."
            raise PoSyntaxError.new(err_msg)
          end
          text
        else
          @scanner.pos = start_pos
          err_msg = "A message text needs to start with the double quote character '\"',"
          err_msg += " but this was found: #{@scanner.peek(10).inspect}"
          raise PoSyntaxError, err_msg
        end
      rescue PoSyntaxError => pe
        raise PoSyntaxError, "Syntax error in message_line\n" + pe.message, pe.backtrace
      end
    end

    # Consumes all remaining lines of an obsolete message; each of them has to start
    # with "#~". The text of every line is added to the :obsolete result.
    #
    # @raise [PoSyntaxError] if a line that is not marked obsolete follows
    def obsoletes
      unless @scanner.scan(/#~/)
        return if @scanner.eos?

        raise PoSyntaxError, "All lines must be obsolete after the first obsolete line, but got #{@scanner.peek(10).inspect}."
      end

      skip_whitespace
      add_result(:obsolete, comment_text)
      obsoletes
    end

    #########################################
    ###             scanning              ###
    #########################################

    # Consumes the rest of the current line and returns it without trailing whitespace.
    # The scanner is advanced past the line break, if there is one.
    #
    # @return [String] the comment text
    # @raise [PoSyntaxError] if the scanner is neither at the beginning of a line nor at
    #   the end of input afterwards
    def comment_text
      begin
        text = @scanner.scan(/.*/) # everything until newline
        text.rstrip! # benchmarked faster to rstrip the string in place
        raise PoSyntaxError, "Comment text should advance to next line or stop at eos" unless end_of_line
        text
      rescue PoSyntaxError => pe
        # the label names this method so the error trail can be matched against the source
        raise PoSyntaxError, "Syntax error in comment_text\n" + pe.message, pe.backtrace
      end
    end

    # Consumes the text of a message line up to, but not including, the next unescaped
    # double quote. Escape sequences are returned as written (not unescaped).
    #
    # @return [String] the raw message text, possibly empty
    def message_text
      # either an escaped backslash / escaped quote, or any character that is not a quote
      until_unescaped_quote = /(\\(\\|")|[^"])*/
      @scanner.scan_until(until_unescaped_quote)
    end

    # Moves the scanner past any blanks at the current position.
    # Only blank characters (\p{Blank}) are skipped; line breaks are not.
    #
    # @return [Integer, nil] number of bytes skipped, or nil if there was nothing to skip
    def skip_whitespace
      blanks = /\p{Blank}+/
      @scanner.skip(blanks)
    end

    # Consumes a line break at the current position, if there is one, and reports whether
    # the scanner then stands at the end of input or at the beginning of a line.
    #
    # @return [Boolean] true if the scanner is at end of input or beginning of a line
    def end_of_line
      @scanner.skip(/\n/)
      return true if @scanner.eos?

      @scanner.bol?
    end

    # Stores text under key in the result hash.
    #
    # The first value for a key is stored as is. A second value turns the entry into an
    # Array of both values, and every further value is appended to that Array.
    #
    # @param key [Symbol, String] result key
    # @param text [String] value to store
    def add_result(key, text)
      existing = @result[key]
      return @result[key] = text unless existing
      return existing.push(text) if existing.is_a?(Array)

      @result[key] = [existing, text]
    end
  end
end