File: saxy_html.rb

package info (click to toggle)
ruby-ox 2.14.23-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,504 kB
  • sloc: xml: 39,683; ansic: 9,626; ruby: 6,441; sh: 47; makefile: 2
file content (113 lines) | stat: -rwxr-xr-x 3,361 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env ruby

# This example demonstrates the use of the Ox.sax_html parser and the
# Ox::Builder. The parser is used to parse an HTML file and add a
# `class="ppp"` attribute to the start of each '<p>' element.
#
# The approach taken is to build while parsing. An HTML parse is started and a
# builder call is made on each parser callback. If the element is a 'p' then
# the class attribute is added. All others remain the same.

# Use the current repo if run from the examples directory.
ox_dir = File.dirname(File.dirname(File.expand_path(__FILE__)))
$LOAD_PATH << File.join(ox_dir, 'ext')
$LOAD_PATH << File.join(ox_dir, 'lib')

require 'ox'

# First create a handler for the SAX callbacks. The class instances include a
# builder that builds as parsing takes place.
class Saxy < Ox::Sax
  # HTML void elements. They have no content and no closing tag, so they are
  # written with Ox::Builder#void_element and are never popped.
  VOID_ELEMENTS = %i[area base br col embed hr img input link meta param source track wbr].freeze

  def initialize
    super
    # The builder is created with an indentation of 2 but that can be changed
    # to the desired indentation.
    @builder = Ox::Builder.new(indent: 2)
    # @element_name and @attrs hold an element start that has been seen but
    # not yet written to the builder (deferred writing of the element start).
    @element_name = nil
    @attrs = {}
  end

  # Returns whatever the builder has produced so far, as returned by
  # Ox::Builder#to_s.
  def to_s
    @builder.to_s
  end

  # The builder creates element starts with attributes but the parser uses a
  # separate call for attributes and element starts. To deal with the
  # difference keep track of the start name and attributes as they are
  # added. When another callback other than attributes is called write any
  # pending element start. Does nothing when no element start is pending.
  def push_element
    return if @element_name.nil?

    # Add the class attribute if the element is a <p> element. This replaces
    # any class attribute the parser reported for the element.
    @attrs[:class] = 'ppp' if @element_name == :p

    # Check VOID_ELEMENTS to determine how the element start should be
    # written. HTML void elements take no content, so they are written in one
    # call and nothing is left open on the builder for them.
    if VOID_ELEMENTS.include?(@element_name)
      @builder.void_element(@element_name, @attrs)
    else
      @builder.element(@element_name, @attrs)
    end
    # Reset the pending element start. A new Hash is assigned rather than
    # clearing the old one because the old one was just handed to the builder.
    @element_name = nil
    @attrs = {}
  end

  # SAX callback for an element start. Writes any previously pending start and
  # then defers this one until its attributes have been collected.
  def start_element(name)
    push_element
    @element_name = name
  end

  # SAX callback for an attribute. Attributes are only collected while an
  # element start is pending; any others are ignored so they cannot leak onto
  # the next element written.
  # NOTE(review): presumably the parser also reports attributes of
  # `<?xml ...?>` instructions through this callback - confirm against the
  # Ox::Sax documentation.
  def attr(name, value)
    return if @element_name.nil?

    @attrs[name] = value
  end

  # SAX callback for a DOCTYPE. Flushes any pending element start first so the
  # output order matches the input order.
  def doctype(value)
    push_element
    @builder.doctype(value)
  end

  # SAX callback for a comment. Flushes any pending element start first.
  def comment(value)
    push_element
    @builder.comment(value)
  end

  # SAX callback for text. Flushes any pending element start first so the
  # text lands inside that element.
  def text(value)
    push_element
    @builder.text(value)
  end

  # SAX callback for an element end. Flushes any pending element start and
  # then closes the builder's current element. Void elements were written
  # with void_element, so they are not popped.
  def end_element(name)
    push_element
    @builder.pop unless VOID_ELEMENTS.include?(name)
  end

  # Just in case there is a parse error this will display the error along with
  # the line and column reported by the parser for the input being parsed.
  def error(message, line, column)
    puts "*-*-* error at #{line}:#{column}: #{message}"
  end
end

# Read the whole HTML input into a String. Ox.sax_html also handles IO
# objects.
html_path = 'saxy.html'
xml = File.read(html_path)

# Create the handler instance, then run the HTML SAX parse with it. The
# handler builds its output as the parser invokes its callbacks.
handler = Saxy.new
Ox.sax_html(handler, xml)

# For debugging uncomment these lines.
# puts "******************** original *************************\n#{xml}"
# puts "******************** modified ***********************\n#{handler.to_s}"

# For benchmarks these lines should be repeated to parse and to generate a
# modified XML string.
# handler = Saxy.new()
# Ox.sax_html(handler, xml)
# handler.to_s