File: xml_utils.py

package info (click to toggle)
chromium 120.0.6099.224-1~deb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,112,112 kB
  • sloc: cpp: 32,907,025; ansic: 8,148,123; javascript: 3,679,536; python: 2,031,248; asm: 959,718; java: 804,675; xml: 617,256; sh: 111,417; objc: 100,835; perl: 88,443; cs: 53,032; makefile: 29,579; fortran: 24,137; php: 21,162; tcl: 21,147; sql: 20,809; ruby: 17,735; pascal: 12,864; yacc: 8,045; lisp: 3,388; lex: 1,323; ada: 727; awk: 329; jsp: 267; csh: 117; exp: 43; sed: 37
file content (161 lines) | stat: -rw-r--r-- 4,639 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Copyright 2023 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Helpers to parse content of xml files."""
import html
from xml.dom import minidom

from typing import Iterator

_ELEMENT_NODE = minidom.Node.ELEMENT_NODE


def GetTagSubTree(tree: minidom.Element, tag: str,
                  depth: int) -> minidom.Element:
  """Narrows the tree down to the single element carrying the given tag.

  The search is delegated to IterElementsWithTag. Only an unambiguous result
  (exactly one matching element) replaces the tree; with zero matches or
  several matches the tree passed in is handed back unchanged.

  Args:
    tree: XML dom tree to search.
    tag: Tag name of the element that should become the new root.
    depth: How deep in the tree the search is allowed to go.

  Returns:
    The sole matching element, or the original tree when the match is missing
    or not unique.
  """
  matches = list(IterElementsWithTag(tree, tag, depth))
  return matches[0] if len(matches) == 1 else tree


def NormalizeString(text: str) -> str:
  r"""Collapses whitespace runs and decodes HTML character references.

  Leading and trailing whitespace is dropped and every inner run of
  whitespace becomes one space. Afterwards HTML escapes such as &quot; or
  &gt; are turned back into the characters they stand for.

  Args:
    text: The string to normalize, e.g. '\n\n a \n b&gt;c  '.

  Returns:
    The normalized string, e.g. 'a b>c'.
  """
  # str.split() without arguments splits on any whitespace run and discards
  # empty leading/trailing pieces, so re-joining yields single spaces only.
  words = text.split()
  collapsed = ' '.join(words)

  # Unescaping happens last, so characters produced by a reference are not
  # subject to the whitespace collapsing above.
  return html.unescape(collapsed)


def NormalizeAllAttributeValues(node: minidom.Element) -> minidom.Element:
  """Normalizes every attribute value in the tree rooted at the given node.

  The tree is modified in place: each attribute value of each element node is
  replaced by the result of NormalizeString. Non-element nodes carry no
  attributes to rewrite, but their children are still visited.

  Args:
    node: The minidom node whose subtree should be normalized.

  Returns:
    The same node that was passed in, after normalization.
  """
  if node.nodeType == _ELEMENT_NODE:
    attribute_map = node.attributes
    for attribute_name in attribute_map.keys():
      attribute = attribute_map[attribute_name]
      attribute.value = NormalizeString(attribute.value)

  # Descend into every child regardless of its node type.
  for child_node in node.childNodes:
    NormalizeAllAttributeValues(child_node)

  return node


def GetTextFromChildNodes(node: minidom.Element) -> str:
  """Builds one string out of the serialized children of the given node.

  Comment children are skipped. Within each remaining child, blank-line
  separated paragraphs are kept apart while the lines inside a paragraph are
  folded into a single space-separated line, which keeps long text readable
  on dashboards.

  Args:
    node: The DOM Element whose children's text is extracted and processed.

  Returns:
    The concatenated, whitespace-normalized text of the node's children with
    surrounding whitespace stripped.
  """
  separator = '\n\n'
  chunks = []

  for child in node.childNodes:
    if child.nodeType == minidom.Node.COMMENT_NODE:
      continue

    serialized = child.toxml()
    if not serialized:
      continue

    # Worked example. Given this XML:
    # <tag>
    #   Some
    #   words.
    #
    #   <!--Child comment node.-->
    #
    #   Words.
    # </tag>
    # the overall result is 'Some words.\n\nWords.'.
    #
    # The text node before the comment splits into
    # ['\n  Some\n  words.', '  '] and the one after it into
    # ['', '  Words.\n'].
    pieces = serialized.split(separator)

    # Empty pieces are dropped before normalizing; whitespace-only pieces are
    # kept and normalize to ''. The first text node therefore becomes
    # ['Some words.', ''] (which preserves the paragraph break when joined)
    # and the second becomes ['Words.'].
    normalized = [NormalizeString(piece) for piece in pieces if piece]
    chunks.append(separator.join(normalized))

  combined = ''.join(chunks)
  return combined.strip()


def IterElementsWithTag(root: minidom.Element,
                        tag: str,
                        depth: int = -1) -> Iterator[minidom.Element]:
  """Iterates over DOM tree and yields elements matching tag name.

  It's meant to be a replacement for `getElementsByTagName` (which does a
  fully recursive search) that does not look inside matching elements
  (nested tags are not supported in histograms files).

  Note: This generator stops going deeper in the tree as soon as it finds
  a level where at least one direct child has the given tag; only the
  matching children of that level are yielded.

  Depth semantics:
    * depth == 0: if root itself is an element with the given tag, only root
      is yielded. If root does not match, its children are searched and,
      because the depth is decremented to -1, the descent is unbounded.
    * depth == N > 0: at most N levels below root are searched (1 means
      direct children only).
    * depth < 0 (default -1): the counter never reaches zero, so the search
      depth is unbounded.

  Args:
    root: XML dom tree.
    tag: Element's tag name.
    depth: Defines how deep in the tree function should search for a match.

  Yields:
    xml.dom.minidom.Node: Element matching criteria.

  """
  if depth == 0 and root.nodeType == _ELEMENT_NODE and root.tagName == tag:
    yield root
    return

  had_tag = False

  for child in root.childNodes:
    if child.nodeType == _ELEMENT_NODE and child.tagName == tag:
      had_tag = True
      yield child

  depth -= 1

  # Only descend when this level produced no match and the depth budget is
  # not exhausted (a negative depth never hits zero, i.e. no limit).
  if not had_tag and depth != 0:
    for child in root.childNodes:
      yield from IterElementsWithTag(child, tag, depth)