File: split_xml.py

package info (click to toggle)
chromium 120.0.6099.224-1~deb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,112,112 kB
  • sloc: cpp: 32,907,025; ansic: 8,148,123; javascript: 3,679,536; python: 2,031,248; asm: 959,718; java: 804,675; xml: 617,256; sh: 111,417; objc: 100,835; perl: 88,443; cs: 53,032; makefile: 29,579; fortran: 24,137; php: 21,162; tcl: 21,147; sql: 20,809; ruby: 17,735; pascal: 12,864; yacc: 8,045; lisp: 3,388; lex: 1,323; ada: 727; awk: 329; jsp: 267; csh: 117; exp: 43; sed: 37
file content (301 lines) | stat: -rw-r--r-- 10,612 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Splits a XML file into smaller XMLs in subfolders.

Splits nodes according to the first camelcase part of their name attribute.
Intended to be used to split up the large histograms.xml or enums.xml file.
"""

import os
import re
from xml.dom import minidom

import histogram_configuration_model
import histogram_paths
import merge_xml
import path_util

# Texts that _CreateXMLFile() inserts, in this order, as top-level XML comments
# at the start of every XML file it writes.
# The first one is added verbatim.
FIRST_TOP_LEVEL_COMMENT_TEMPLATE = """
Copyright 2021 The Chromium Authors
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file.
"""
# The second one contains a single '%s' placeholder that _CreateXMLFile() fills
# with a short description of the file's contents (e.g. 'histogram suffixes').
SECOND_TOP_LEVEL_COMMENT_TEMPLATE = """
This file is used to generate a comprehensive list of %s
along with a detailed description for each histogram.

For best practices on writing histogram descriptions, see
https://chromium.googlesource.com/chromium/src.git/+/HEAD/tools/metrics/histograms/README.md

Please send CLs to individuals in the OWNERS file in the same directory as this
xml file. If no OWNERS file exists, then send the CL to
chromium-metrics-reviews@google.com.
"""
# Number of times that splitting of histograms will be carried out, i.e. the
# depth at which _BuildDocumentDict() stops grouping nodes by name part.
TARGET_DEPTH = 1
# Groups holding fewer nodes than this are merged into the 'others' group by
# _AggregateMinorNodes().
AGGREGATE_THRESHOLD = 20
# Maps a complete dot-separated part of a histogram name to the prefix that
# _GetCamelCaseName() returns for it. A part found here bypasses the camelcase
# splitting, so several parts can share one prefix (e.g. 'CustomTab' and
# 'CustomTabs') and multi-word prefixes stay intact.
_PREDEFINED_NAMES_MAPPING = {
    'BackForwardCache': 'BackForwardCache',
    'ChromeOS': 'ChromeOS',
    'CustomTabs': 'CustomTabs',
    'CustomTab': 'CustomTabs',
    'DataReductionProxy': 'DataReductionProxy',
    'DataUse': 'DataUse',
    'MultiDevice': 'MultiDevice',
    'NaCl': 'NaCl',
    'SafeBrowsing': 'SafeBrowsing',
    'SafeBrowsingBinaryUploadRequest': 'SafeBrowsing',
    'SafeBrowsingFCMService': 'SafeBrowsing',
    'NewTabPage': 'NewTabPage',
    'SiteEngagementService': 'SiteEngagementService',
    'SiteIsolation': 'SiteIsolation',
    'Tabs': 'Tab',
    'TextFragmentAnchor': 'TextFragmentAnchor',
    'TextToSpeech': 'TextToSpeech',
    'UpdateEngine': 'UpdateEngine',
    'WebApk': 'WebApk',
    'WebApp': 'WebApp',
    'WebAudio': 'WebAudio',
    'WebAuthentication': 'WebAuthentication',
    'WebCore': 'WebCore',
    'WebFont': 'WebFont',
    'WebHistory': 'WebHistory',
    'WebRTC': 'WebRTC',
    'WebRtcEventLogging': 'WebRTC',
    'WebRtcTextLogging': 'WebRTC',
    'WebUI': 'WebUI',
    'WebUITabStrip': 'WebUI',
}


def _ParseMergedXML():
  """Merges all histograms XMLs and collects the node types to be split.

  Returns:
    A 3-tuple with the results of getElementsByTagName() on the merged
    document for, in order, the 'histogram', 'variants' and
    'histogram_suffixes' tags.
  """
  merged_doc = merge_xml.MergeFiles(histogram_paths.HISTOGRAMS_XMLS)
  tag_names = ('histogram', 'variants', 'histogram_suffixes')
  return tuple(merged_doc.getElementsByTagName(tag) for tag in tag_names)


def _CreateXMLFile(comment, parent_node_string, nodes, output_dir, filename):
  """Creates XML file for given type of XML nodes.

  The written document consists of two top-level comments followed by a
  <histogram-configuration> root element. The root contains a single
  |parent_node_string| element, e.g. <histograms> or <histogram_suffixes_list>,
  which wraps all the |nodes|.

  Args:
    comment: The string substituted into |SECOND_TOP_LEVEL_COMMENT_TEMPLATE|,
        which is then added on top of each split xml.
    parent_node_string: The name of the second-level parent node, e.g.
        <histograms> or <histogram_suffixes_list>.
    nodes: A DOM NodeList object or a list containing <histogram> or
        <histogram_suffixes> that will be inserted under the parent node.
    output_dir: The output directory.
    filename: The output filename.
  """
  doc = minidom.Document()

  doc.appendChild(doc.createComment(FIRST_TOP_LEVEL_COMMENT_TEMPLATE))
  doc.appendChild(
      doc.createComment(SECOND_TOP_LEVEL_COMMENT_TEMPLATE % comment))

  # Create the <histogram-configuration> node for the new histograms.xml file.
  histogram_config_element = doc.createElement('histogram-configuration')
  doc.appendChild(histogram_config_element)
  parent_element = doc.createElement(parent_node_string)
  histogram_config_element.appendChild(parent_element)

  # Under the parent node, append the children nodes. appendChild() detaches a
  # node from its previous parent, so iterate over a snapshot in case |nodes|
  # is a live child list that would shrink during the loop.
  for node in list(nodes):
    parent_element.appendChild(node)

  # Serialize before touching the file system: if pretty-printing fails, an
  # already existing output file is left intact instead of being deleted or
  # truncated.
  pretty_xml_string = histogram_configuration_model.PrettifyTree(doc)

  output_path = os.path.join(output_dir, filename)
  if os.path.exists(output_path):
    os.remove(output_path)

  with open(output_path, 'w') as output_file:
    output_file.write(pretty_xml_string)


def _GetCamelCaseName(node, depth=0):
  """Returns the first camelcase word of one dot-separated part of a name.

  The 'name' attribute of |node| is split on '.', and the part at index
  |depth| is examined:
    * If that whole part is a key of |_PREDEFINED_NAMES_MAPPING|, the mapped
      value is returned.
    * Otherwise the part is cut right before the first uppercase letter that
      follows a lowercase letter or a numeric character. A leading run of
      uppercase letters therefore stays attached to the first word. If there
      is no such uppercase letter, the whole part is returned.

  Args:
    node: The node to get name from.
    depth: The index of the dot-separated name part to look at.
        e.g. For a node of name
        'CustomTabs.DynamicModule.CreatePackageContextTime'
        The returned name for depth 0 is 'CustomTabs' (predefined mapping);
        The returned name for depth 1 is 'Dynamic';
        The returned name for depth 2 is 'Create';
        The returned name for depth 3 is 'others'.
        A part such as 'ABCDeltaEcho' yields 'ABCDelta'.

        Default depth is set to 0 as this function is imported and
        used in other files, where depth used is 0.

  Returns:
    The name as described above, or 'others' if the name has no part at index
    |depth|.
  """
  parts = node.getAttribute('name').split('.')
  if depth >= len(parts):
    return 'others'

  part = parts[depth]
  if part in _PREDEFINED_NAMES_MAPPING:
    return _PREDEFINED_NAMES_MAPPING[part]

  # Position of the uppercase letter that starts the second word; 0 means
  # that no such letter was found.
  cut_at = 0
  # Stays False while only the leading run of uppercase letters has been seen,
  # so that run is not broken up letter by letter.
  seen_lower_or_numeric = False
  for position, char in enumerate(part):
    if char.islower() or char.isnumeric():
      seen_lower_or_numeric = True
    if seen_lower_or_numeric and char.isupper():
      cut_at = position
      break

  return part[:cut_at] if cut_at != 0 else part


def GetDirForNode(node):
  """Returns the directory name that the given |node| should be placed in.

  The name is the depth-0 camelcase name of |node| when that name is listed in
  |histogram_paths.HISTOGRAMS_PREFIX_LIST|, and 'others' otherwise.
  """
  prefix = _GetCamelCaseName(node)
  known_prefixes = histogram_paths.HISTOGRAMS_PREFIX_LIST
  return prefix if prefix in known_prefixes else 'others'


def _CamelCaseToSnakeCase(name):
  """Converts CamelCase |name| to snake_case.

  Two substitution passes insert the underscores: the first one in front of
  every capitalized word (an uppercase letter followed by lowercase letters)
  that is preceded by any character, the second one between a lowercase letter
  or digit and a directly following uppercase letter. The result is lowercased.
  """
  for pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
    name = re.sub(pattern, r'\1_\2', name)
  return name.lower()


def _OutputToFolderAndXML(nodes, output_dir, key):
  """Creates new folder and XML file for separated histograms.

  The folder is named after the snake_case form of |key| and is created inside
  |output_dir| together with any missing parent directories. The nodes are
  written to 'histograms.xml' in that folder.

  Args:
    nodes: A list of histogram/variants nodes of a prefix.
    output_dir: The output directory.
    key: The prefix of the histograms, also the name of the new folder.
  """
  # Convert CamelCase name to snake_case when creating a directory.
  target_dir = os.path.join(output_dir, _CamelCaseToSnakeCase(key))
  # exist_ok avoids the race between checking for the directory and creating
  # it that a separate os.path.exists() test would have.
  os.makedirs(target_dir, exist_ok=True)
  _CreateXMLFile(key + ' histograms', 'histograms', nodes, target_dir,
                 'histograms.xml')


def _WriteDocumentDict(document_dict, output_dir):
  """Recursively writes |document_dict| to xmls in |output_dir|.

  A list value is written out as one folder plus XML file named after its key.
  Any other value is treated as a nested dict and written recursively below
  the sub-directory |output_dir|/<key>, where the key is used as is.

  Args:
    document_dict: A dictionary where the key is the prefix of the histogram and
        value is a list of nodes or another dict.
    output_dir: The output directory of the resulting folders.
  """
  for prefix, subtree in document_dict.items():
    if not isinstance(subtree, list):
      _WriteDocumentDict(subtree, os.path.join(output_dir, prefix))
      continue
    _OutputToFolderAndXML(subtree, output_dir, prefix)


def _AggregateMinorNodes(node_dict, threshold=None):
  """Aggregates groups of nodes below threshold number into 'others'.

  |node_dict| is modified in place: every group with fewer than |threshold|
  nodes is removed and its nodes are appended to the 'others' group. The
  'others' group is re-inserted last, and only if it is not empty.

  Args:
    node_dict: A dictionary where the key is the prefix of the histogram/variant
        and value is a list of histogram/variant nodes.
    threshold: The minimum number of nodes a group needs to stay separate.
        Defaults to |AGGREGATE_THRESHOLD| when None.
  """
  if threshold is None:
    threshold = AGGREGATE_THRESHOLD

  others = node_dict.pop('others', [])

  # Iterate over a snapshot of the keys: deleting entries while iterating over
  # the live dict raises "RuntimeError: dictionary changed size during
  # iteration" in Python 3.
  for key in list(node_dict):
    nodes = node_dict[key]
    # For a prefix, if the number of histograms is fewer than threshold,
    # aggregate into others.
    if len(nodes) < threshold:
      others.extend(nodes)
      del node_dict[key]

  if others:
    node_dict['others'] = others


def _BuildDocumentDict(nodes, depth):
  """Recursively builds a document dict which will be written later.

  The nodes are grouped by the name returned by _GetCamelCaseName() for the
  given |depth|, and small groups are merged into 'others' by
  _AggregateMinorNodes(). Each remaining group is then split further with
  |depth| + 1 until |TARGET_DEPTH| is reached. The 'others' group is never
  split further and always stays a list.

  Args:
    nodes: A list of histogram nodes or variants node.
    depth: The current depth, starting from 0.

  Returns:
    |nodes| itself if |depth| equals |TARGET_DEPTH|. Otherwise a dict that maps
    each group name to either a list of nodes or a nested document dict.
  """
  if depth == TARGET_DEPTH:
    return nodes

  # Group the nodes by their name part at the current depth.
  grouped = {}
  for node in nodes:
    grouped.setdefault(_GetCamelCaseName(node, depth), []).append(node)

  # Aggregate keys with less than |AGGREGATE_THRESHOLD| values to 'others'.
  _AggregateMinorNodes(grouped)

  # Build the result in a separate dict so that the dict being iterated is not
  # modified by the loop itself.
  document_dict = {}
  for key, group in grouped.items():
    if key == 'others':
      document_dict[key] = group
    else:
      document_dict[key] = _BuildDocumentDict(group, depth + 1)

  return document_dict


def SplitIntoMultipleHistogramXMLs(output_base_dir):
  """Splits a large histograms.xml and writes out the split xmls.

  Creates |output_base_dir| if it does not exist. The histogram suffixes are
  written to 'histogram_suffixes_list.xml' directly inside it, and the
  histogram and variants nodes are grouped by name prefix and written to one
  sub-folder per group.

  Args:
    output_base_dir: The output base directory.
  """
  base_dir_missing = not os.path.exists(output_base_dir)
  if base_dir_missing:
    os.mkdir(output_base_dir)

  histograms, variants, suffixes = _ParseMergedXML()

  # The histogram suffixes are not split; they all go into a single XML file.
  _CreateXMLFile('histogram suffixes', 'histogram_suffixes_list', suffixes,
                 output_base_dir, 'histogram_suffixes_list.xml')

  # Histograms and variants are grouped together, starting at depth 0.
  split_tree = _BuildDocumentDict(histograms + variants, 0)
  _WriteDocumentDict(split_tree, output_base_dir)


# When run as a script, split the merged histograms into the directory that
# path_util.GetInputFile() returns for 'tools/metrics/histograms/metadata'.
if __name__ == '__main__':
  SplitIntoMultipleHistogramXMLs(
      path_util.GetInputFile('tools/metrics/histograms/metadata'))