File: string_extract.py

# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities to extract string literals from object files.

LookupElfRodataInfo():
  Runs readelf to extract and return .rodata section spec of an ELF file.

ReadFileChunks():
  Reads raw data from a file, given a list of ranges in the file.

ReadStringLiterals():
  Reads the ELF file to find the string contents of a list of string literals.

ResolveStringPiecesIndirect():
  BulkForkAndCall() target: Given {path: [string addresses]} and
  [raw_string_data for each string_section]:
  - Reads {path: [src_strings]}.
  - For each path, searches for src_strings in at most 1 raw_string_data over
    each string_section. If found, translates to string_range and annotates it
    to the string_section.
  - Returns [{path: [string_ranges]} for each string_section].

ResolveStringPieces():
  BulkForkAndCall() target: Given {path: [strings]} and
  [raw_string_data for each string_section]:
  - For each path, searches for src_strings in at most 1 raw_string_data over
    each string_section. If found, translates to string_range and annotates it
    to the string_section.
  - Returns [{path: [string_ranges]} for each string_section].

GetNameOfStringLiteralBytes():
  Converts string literal bytes to printable form, useful for assigning
  full_name of string literal symbols. If any non-printable character is found
  then returns models.STRING_LITERAL_NAME. Otherwise the returned string is
  quoted, and may be truncated (with "[...]" appended).
"""

import ast
import collections
import itertools
import logging
import os
import string
import subprocess

import ar
import models
import parallel
import path_util


_STRING_LITERAL_LENGTH_CUTOFF = 30

_PRINTABLE_TABLE = [False] * 256
for ch in string.printable:
  _PRINTABLE_TABLE[ord(ch)] = True


def LookupElfRodataInfo(elf_path):
  """Returns (address, offset, size) for the .rodata section."""
  args = [path_util.GetReadElfPath(), '-S', '--wide', elf_path]
  output = subprocess.check_output(args).decode('ascii')
  lines = output.splitlines()
  for line in lines:
    # [Nr] Name           Type        Addr     Off     Size   ES Flg Lk Inf Al
    # [07] .rodata        PROGBITS    025e7000 237c000 5ec4f6 00   A  0   0 256
    if '.rodata ' in line:
      fields = line[line.index(models.SECTION_RODATA):].split()
      return int(fields[2], 16), int(fields[3], 16), int(fields[4], 16)
  raise AssertionError('No .rodata for command: ' + repr(args))
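
# Illustrative example (derived from the sample readelf row in the comment
# above, not from an actual run): for the ".rodata" line shown there,
# LookupElfRodataInfo() would return (0x025e7000, 0x237c000, 0x5ec4f6).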


def ReadFileChunks(path, section_ranges):
  """Returns a list of raw data from |path|, specified by |section_ranges|.

  Args:
    section_ranges: List of (offset, size).
  """
  ret = []
  if not section_ranges:
    return ret
  with open(path, 'rb') as f:
    for offset, size in section_ranges:
      f.seek(offset)
      ret.append(f.read(size))
  return ret
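
# Illustrative usage (a sketch, not an actual run): with the .rodata info from
# the LookupElfRodataInfo() example above,
#   ReadFileChunks(elf_path, [(0x237c000, 0x5ec4f6)])
# would return a one-element list holding the raw bytes of .rodata.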


def _ExtractArchivePath(path):
  # E.g. foo/bar.a(baz.o)
  if path.endswith(')'):
    start_idx = path.index('(')
    return path[:start_idx]
  return None
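
# Illustrative behavior (derived from the code above):
#   _ExtractArchivePath('foo/bar.a(baz.o)')  ->  'foo/bar.a'
#   _ExtractArchivePath('foo/baz.o')         ->  None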


def _LookupStringSectionPositions(target, output_directory):
  """Returns a dict of object_path -> [(offset, size)...] of .rodata sections.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
  """
  is_archive = isinstance(target, str)
  args = [path_util.GetReadElfPath(), '-S', '--wide']
  if is_archive:
    args.append(target)
  else:
    # Assign |path| for the len(target) == 1 case (no "File:" line exists).
    path = target[0]
    args.extend(target)

  output = subprocess.check_output(args, cwd=output_directory).decode('ascii')
  lines = output.splitlines()
  section_positions_by_path = {}
  cur_offsets = []
  for line in lines:
    # File: base/third_party/libevent/libevent.a(buffer.o)
    # [Nr] Name              Type        Addr     Off    Size   ES Flg Lk Inf Al
    # [11] .rodata.str1.1    PROGBITS    00000000 0000b4 000004 01 AMS  0   0  1
    # [11] .rodata.str4.4    PROGBITS    00000000 0000b4 000004 01 AMS  0   0  4
    # [11] .rodata.str8.8    PROGBITS    00000000 0000b4 000004 01 AMS  0   0  8
    # [80] .rodata..L.str    PROGBITS    00000000 000530 000002 00   A  0   0  1
    # The various string sections differ by alignment.
    # The presence of a wchar_t literal (L"asdf") seems to make a str4 section.
    # When multiple sections exist, nm gives us no indication as to which
    # section each string corresponds to.
    if line.startswith('File: '):
      if cur_offsets:
        section_positions_by_path[path] = cur_offsets
        cur_offsets = []
      path = line[6:]
    elif '.rodata.' in line:
      progbits_idx = line.find('PROGBITS ')
      if progbits_idx != -1:
        fields = line[progbits_idx:].split()
        position = (int(fields[2], 16), int(fields[3], 16))
        # The heuristics in _IterStringLiterals rely on str1 coming first.
        if fields[-1] == '1':
          cur_offsets.insert(0, position)
        else:
          cur_offsets.append(position)
  if cur_offsets:
    section_positions_by_path[path] = cur_offsets
  return section_positions_by_path
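
# Illustrative result shape (based on the sample readelf output quoted in the
# comments above, not an actual run): the libevent example would map
#   'base/third_party/libevent/libevent.a(buffer.o)'
# to a list like [(0xb4, 0x4), ...], with 1-byte-aligned sections moved to
# the front of the list.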


def _ReadStringSections(target, output_directory, positions_by_path):
  """Returns a dict of object_path -> [string...] of .rodata chunks.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    positions_by_path: A dict of object_path -> [(offset, size)...]
  """
  is_archive = isinstance(target, str)
  string_sections_by_path = {}
  if is_archive:
    for subpath, chunk in ar.IterArchiveChunks(
        os.path.join(output_directory, target)):
      path = '{}({})'.format(target, subpath)
      positions = positions_by_path.get(path)
      # No positions if file has no string literals.
      if positions:
        string_sections_by_path[path] = (
            [chunk[offset:offset + size] for offset, size in positions])
  else:
    for path in target:
      positions = positions_by_path.get(path)
      # We already log a warning about this in _IterStringLiterals().
      if positions:
        string_sections_by_path[path] = ReadFileChunks(
            os.path.join(output_directory, path), positions)
  return string_sections_by_path
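
# Illustrative result shape (a sketch): continuing the libevent example above,
# each object path maps to the raw bytes of its .rodata.str* sections, e.g.
#   {'...libevent.a(buffer.o)': [b'<str1 bytes>', b'<str4 bytes>', ...]}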


def _IterStringLiterals(path, addresses, obj_sections):
  """Yields all string literals (including \0) for the given object path.

  Args:
    path: Object file path.
    addresses: List of string offsets encoded as hex strings.
    obj_sections: List of contents of .rodata.str sections read from the given
        object file.
  """

  next_offsets = sorted(int(a, 16) for a in addresses)
  if not obj_sections:
    # Happens when there is an address for a symbol which is not actually a
    # string literal, or when string_sections_by_path is missing an entry.
    logging.warning('Object has %d strings but no string sections: %s',
                    len(addresses), path)
    return
  for section_data in obj_sections:
    cur_offsets = next_offsets
    # Always assume first element is 0. I'm not entirely sure why this is
    # necessary, but strings get missed without it.
    next_offsets = [0]
    prev_offset = 0
    # TODO(agrieve): Switch to using nm --print-size in order to capture the
    #     address+size of each string rather than just the address.
    for offset in cur_offsets[1:]:
      if offset >= len(section_data):
        # Remaining offsets are for next section.
        next_offsets.append(offset)
        continue
      # Figure out which offsets apply to this section via the heuristic that
      # every string ends with a null character.
      if offset == prev_offset or section_data[offset - 1] != 0:
        next_offsets.append(offset)
        continue
      yield section_data[prev_offset:offset]
      prev_offset = offset

    if prev_offset < len(section_data):
      yield section_data[prev_offset:]
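
# Worked example (a sketch, not an actual run): with a single section whose
# data is b'foo\x00barbaz\x00' and addresses ['0x0', '0x4'],
# _IterStringLiterals() yields b'foo\x00' and then b'barbaz\x00', since offset
# 4 immediately follows a NUL terminator (the splitting heuristic above).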


def _AnnotateStringData(string_data, path_value_gen):
  """Annotates each |string_data| section data with paths and ranges.

  Args:
    string_data: [raw_string_data for each string_section] from an ELF file.
    path_value_gen: A generator of (path, value) pairs, where |path|
      is the path to an object file and |value| is a string to annotate.

  Returns:
    [{path: [string_ranges]} for each string_section].
  """
  ret = [collections.defaultdict(list) for _ in string_data]

  # Brute-force search the ** merge strings sections in |string_data| for
  # string values from |path_value_gen|. This is by far the slowest part of
  # AnalyzeStringLiterals().
  # TODO(agrieve): Pre-process |string_data| into a dict of literal->address (at
  # least for ASCII strings).
  for path, value in path_value_gen:
    first_match = -1
    first_match_dict = None
    for target_dict, data in zip(ret, string_data):
      # Set offset so that it will be 0 when len(value) is added to it below.
      offset = -len(value)
      while True:
        offset = data.find(value, offset + len(value))
        if offset == -1:
          break
        # Preferring exact matches (those following \0) over substring matches
        # significantly increases accuracy (although it shows that the linker
        # isn't being optimal).
        if offset == 0 or data[offset - 1] == 0:
          break
        if first_match == -1:
          first_match = offset
          first_match_dict = target_dict
      if offset != -1:
        break
    if offset == -1:
      # Exact match not found, so take suffix match if it exists.
      offset = first_match
      target_dict = first_match_dict
    # Missing strings happen when optimizations make them unused.
    if offset != -1:
      # Encode the tuple as a string for easier marshalling.
      target_dict[path].append(str(offset) + ':' + str(len(value)))

  return ret
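
# Worked example of the exact-match preference (a sketch, not an actual run):
# with string_data = [b'xfoo\x00foo\x00'] and a single (path, b'foo\x00') pair,
# the substring match at offset 1 is only remembered as |first_match|, while
# the match at offset 5 (which follows a \x00) wins, so the single returned
# dict ends up as {path: ['5:4']}.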


# This is a target for BulkForkAndCall().
def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data,
                                output_directory):
  string_addresses_by_path = parallel.DecodeDictOfLists(
      encoded_string_addresses_by_path)
  # Assign |target| as archive path, or a list of object paths.
  any_path = next(iter(string_addresses_by_path.keys()))
  target = _ExtractArchivePath(any_path)
  if not target:
    target = list(string_addresses_by_path.keys())

  # Run readelf to find location of .rodata within the .o files.
  section_positions_by_path = _LookupStringSectionPositions(
      target, output_directory)
  # Load the .rodata sections (from object files) as strings.
  string_sections_by_path = _ReadStringSections(
      target, output_directory, section_positions_by_path)

  def GeneratePathAndValues():
    for path, object_addresses in string_addresses_by_path.items():
      for value in _IterStringLiterals(
          path, object_addresses, string_sections_by_path.get(path)):
        yield path, value

  ret = _AnnotateStringData(string_data, GeneratePathAndValues())
  return [parallel.EncodeDictOfLists(x) for x in ret]


# This is a target for BulkForkAndCall().
def ResolveStringPieces(encoded_strings_by_path, string_data):
  # ast.literal_eval() undoes repr() applied to strings.
  strings_by_path = parallel.DecodeDictOfLists(
      encoded_strings_by_path, value_transform=ast.literal_eval)

  def GeneratePathAndValues():
    for path, strings in strings_by_path.items():
      for value in strings:
        yield path, value

  ret = _AnnotateStringData(string_data, GeneratePathAndValues())
  return [parallel.EncodeDictOfLists(x) for x in ret]


def ReadStringLiterals(symbols, elf_path, all_rodata=False):
  """Returns an iterable of (symbol, data) for all string literal symbols.

  Emitted string literal data are null-terminated bytes.

  Args:
    symbols: An iterable of Symbols
    elf_path: Path to the executable containing the symbols.
    all_rodata: Assume every symbol within .rodata that ends with a NUL is a
         string literal.
  """
  address, offset, _ = LookupElfRodataInfo(elf_path)
  adjust = offset - address
  with open(elf_path, 'rb') as f:
    for symbol in symbols:
      if symbol.section != 'r':
        continue
      f.seek(symbol.address + adjust)
      data = f.read(symbol.size_without_padding)
      # As of Oct 2017, there are ~90 symbols named .L.str(.##). These appear
      # in the linker map file explicitly, and there doesn't seem to be a
      # pattern as to which variables lose their kConstant name (the more
      # common case), or which string literals don't get moved to
      # ** merge strings (less common).
      if symbol.IsStringLiteral() or (all_rodata and data and data[-1] == 0):
        yield (symbol, data)
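
# Illustrative offset arithmetic (a sketch, using the sample .rodata numbers
# from the LookupElfRodataInfo() example above): adjust = 0x237c000 -
# 0x025e7000 = -0x26b000, so a symbol at address 0x025e7100 is read from file
# offset 0x237c100.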


def GetNameOfStringLiteralBytes(b):
  """Converts string literal bytes to printable form, may be truncated."""
  b = b.replace(b'\n', b'').replace(b'\t', b'').strip(b'\00')
  is_printable = all(_PRINTABLE_TABLE[c] for c in b)
  if is_printable:
    s = b.decode('ascii')
    if len(s) > _STRING_LITERAL_LENGTH_CUTOFF:
      return '"{}[...]"'.format(s[:_STRING_LITERAL_LENGTH_CUTOFF])
    return '"{}"'.format(s)
  return models.STRING_LITERAL_NAME
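

if __name__ == '__main__':
  # Illustrative self-check only; not part of the original tool. It exercises
  # the two pure helpers that need no ELF file or object files.
  print(_ExtractArchivePath('foo/bar.a(baz.o)'))  # -> foo/bar.a
  print(GetNameOfStringLiteralBytes(b'Hello\x00'))  # -> "Hello"
  print(GetNameOfStringLiteralBytes(b'x' * 40))  # -> truncated with "[...]"
  print(GetNameOfStringLiteralBytes(b'\xff\x00'))  # models.STRING_LITERAL_NAME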