File: linker_map_parser.py

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (861 lines) | stat: -rwxr-xr-x 35,493 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
#!/usr/bin/env python3
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Parser for linker map files.

The format of a linker map file depends on the linker that generates it. This
file uses "coded linker name" to identify formats and variants:

  'gold': The gold linker (usage is being deprecated by Chrome).
  'lld_v0': LLD linker (no LTO), old format.
  'lld-lto_v0': LLD linker with ThinLTO, old format.
  'lld_v1': LLD linker (no LTO), new format.
  'lld-lto_v1': LLD linker with ThinLTO, new format.
"""

import argparse
import code
import collections
import gzip
import itertools
import logging
import os
import re
import readline
import sys

import demangle
import models

# About linker maps:
# * "Discarded input sections" include symbols merged with other symbols
#   (aliases), so the information there is not actually a list of unused things.
# * Linker maps include symbols that do not have names (with object path),
#   whereas "nm" skips over these (they don't account for much though).
# * The parse time for compressed linker maps is dominated by ungzipping.

# Number of leading characters to strip from a symbol's full name once a flag
# has been derived from its name prefix (see _FlagsFromMangledName). Each
# value is the length of the prefix that produced the flag, e.g.
# len('startup.') == 8, len('unlikely.') == 9, len('rel.local.') == 10,
# len('rel.') == len('hot.') == 4.
_STRIP_NAME_PREFIX = {
    models.FLAG_STARTUP: 8,
    models.FLAG_UNLIKELY: 9,
    models.FLAG_REL_LOCAL: 10,
    models.FLAG_REL: 4,
    models.FLAG_HOT: 4,
}


def _OpenMaybeGzAsText(path):
  """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
  if path.endswith('.gz'):
    return gzip.open(path, 'rt')
  return open(path, 'rt')


def _FlagsFromMangledName(name):
  # Currently, lld map files have section = '.text.startup' and put the symbol
  # name in the section break-down ("level 3 symbols").
  if name.startswith('startup.') or name == 'startup':
    return models.FLAG_STARTUP
  if name.startswith('unlikely.'):
    return models.FLAG_UNLIKELY
  if name.startswith('rel.local.'):
    return models.FLAG_REL_LOCAL
  if name.startswith('rel.'):
    return models.FLAG_REL
  if name.startswith('hot.'):
    return models.FLAG_HOT
  return 0


def _NormalizeName(name):
  # Outlined functions have names like OUTLINED_FUNCTION_0, which can
  # appear 1000+ time, and can cause false aliasing. We treat these as
  # special cases by designating them as a placeholder symbols and
  # renaming them to '** outlined function'.
  if name.startswith('OUTLINED_FUNCTION_'):
    return '** outlined function'
  if name.startswith('.L.str'):
    return models.STRING_LITERAL_NAME
  if name.endswith(' (.cfi)'):
    return name[:-7]
  return name


class MapFileParserGold:
  """Parses a linker map file from gold linker."""
  # Map file writer for gold linker:
  # https://github.com/gittup/binutils/blob/HEAD/gold/mapfile.cc

  def __init__(self):
    # Symbols from the "Common symbol" table. These carry no addresses and
    # are folded into .bss sections during _ParseSections().
    self._common_symbols = []
    # Accumulates models.Symbol objects as sections are parsed.
    self._symbols = []
    # Maps section name -> (address, size).
    self._section_ranges = {}
    # Shared line iterator; the helper methods below consume it cooperatively.
    self._lines = None

  def Parse(self, lines):
    """Parses a linker map file.

    Args:
      lines: Iterable of lines, the first of which has been consumed to
      identify file type.

    Returns:
      A tuple of (section_ranges, symbols, extras).
    """
    self._lines = iter(lines)
    logging.debug('Scanning for Header')

    while True:
      # NOTE(review): assumes at least one of the two headers exists; a map
      # file with neither would make |line| None and raise AttributeError
      # below — confirm inputs always contain a 'Memory map' section.
      line = self._SkipToLineWithPrefix('Common symbol', 'Memory map')
      if line.startswith('Common symbol'):
        self._common_symbols = self._ParseCommonSymbols()
        logging.debug('.bss common entries: %d', len(self._common_symbols))
        continue
      if line.startswith('Memory map'):
        self._ParseSections()
      break
    return self._section_ranges, self._symbols, {}

  def _SkipToLineWithPrefix(self, prefix, prefix2=None):
    """Consumes lines until one starts with |prefix| or |prefix2|.

    Returns:
      The matching line, or None if the input is exhausted.
    """
    for l in self._lines:
      if l.startswith(prefix) or (prefix2 and l.startswith(prefix2)):
        return l
    return None

  def _ParsePossiblyWrappedParts(self, line, count):
    """Splits |line| into |count| whitespace-delimited parts.

    gold wraps entries with long names onto a second line; when |line| yields
    fewer than |count| parts, the next line is consumed and the remaining
    parts are taken from it.

    Returns:
      A list of exactly |count| parts, or None if |line| is blank.
    """
    parts = line.split(None, count - 1)
    if not parts:
      return None
    if len(parts) != count:
      line = next(self._lines)
      parts.extend(line.split(None, count - len(parts) - 1))
      assert len(parts) == count, 'parts: ' + ' '.join(parts)
    parts[-1] = parts[-1].rstrip()
    return parts

  def _ParseCommonSymbols(self):
    """Parses the "Common symbol" table into address-less .bss symbols."""
    # Common symbol       size              file
    #
    # ff_cos_131072       0x40000           obj/third_party/<snip>
    # ff_cos_131072_fixed
    #                     0x20000           obj/third_party/<snip>
    ret = []
    next(self._lines)  # Skip past blank line

    name, size_str, path = None, None, None
    for l in self._lines:
      parts = self._ParsePossiblyWrappedParts(l, 3)
      if not parts:
        break
      name, size_str, path = parts
      # |size_str| is hex with a leading '0x', e.g. '0x40000'.
      sym = models.Symbol(
          models.SECTION_BSS,
          int(size_str[2:], 16),
          full_name=name,
          object_path=path)
      ret.append(sym)
    return ret

  def _ParseSections(self):
    """Parses the "Memory map" listing into _symbols and _section_ranges."""
    # .text           0x0028c600  0x22d3468
    #  .text.startup._GLOBAL__sub_I_bbr_sender.cc
    #                 0x0028c600       0x38 obj/net/net/bbr_sender.o
    #  .text._reset   0x00339d00       0xf0 obj/third_party/icu/icuuc/ucnv.o
    #  ** fill        0x0255fb00   0x02
    #  .text._ZN4base8AutoLockD2Ev
    #                 0x00290710        0xe obj/net/net/file_name.o
    #                 0x00290711                base::AutoLock::~AutoLock()
    #                 0x00290711                base::AutoLock::~AutoLock()
    # .text._ZNK5blink15LayoutBlockFlow31mustSeparateMarginAfterForChildERK...
    #                 0xffffffffffffffff       0x46 obj/...
    #                 0x006808e1                blink::LayoutBlockFlow::...
    # .text.OUTLINED_FUNCTION_0
    #                 0x002a2000       0x20 obj/net/net/tag.o
    # .bss
    #  .bss._ZGVZN11GrProcessor11initClassIDI10LightingFPEEvvE8kClassID
    #                 0x02d4b294        0x4 obj/skia/skia/SkLightingShader.o
    #                 0x02d4b294   guard variable for void GrProcessor::ini...
    # .data           0x0028c600  0x22d3468
    #  .data.rel.ro._ZTVN3gvr7android19ScopedJavaGlobalRefIP12_jfloatArrayEE
    #                 0x02d1e668       0x10 ../third_party/.../libfoo.a(bar.o)
    #                 0x02d1e668   vtable for gvr::android::GlobalRef<_jflo...
    #  ** merge strings
    #                 0x0255fb00   0x1f2424
    #  ** merge constants
    #                 0x0255fb00   0x8
    # ** common       0x02db5700   0x13ab48
    syms = self._symbols
    while True:
      line = self._SkipToLineWithPrefix('.')
      if not line:
        break
      section_name = None
      try:
        # Parse section name and size.
        parts = self._ParsePossiblyWrappedParts(line, 3)
        if not parts:
          break
        section_name, section_address_str, section_size_str = parts
        section_address = int(section_address_str[2:], 16)
        section_size = int(section_size_str[2:], 16)
        self._section_ranges[section_name] = (section_address, section_size)
        # Only the sections that contribute to binary size analysis are
        # expanded into per-symbol entries.
        if (section_name in models.BSS_SECTIONS
            or section_name in (models.SECTION_RODATA, models.SECTION_TEXT)
            or section_name.startswith(models.SECTION_DATA)):
          logging.info('Parsing %s', section_name)
          if section_name in models.BSS_SECTIONS:
            # Common symbols have no address.
            syms.extend(self._common_symbols)
          prefix_len = len(section_name) + 1  # + 1 for the trailing .
          symbol_gap_count = 0
          merge_symbol_start_address = section_address
          sym_count_at_start = len(syms)
          line = next(self._lines)
          # Parse section symbols.
          while True:
            if not line or line.isspace():
              break
            if line.startswith(' **'):
              # Linker-synthesized entries, e.g. '** merge strings'.
              zero_index = line.find('0')
              if zero_index == -1:
                # Line wraps.
                name = line.strip()
                line = next(self._lines)
              else:
                # Line does not wrap.
                name = line[:zero_index].strip()
                line = line[zero_index:]
              address_str, size_str = self._ParsePossiblyWrappedParts(line, 2)
              line = next(self._lines)
              # These bytes are already accounted for.
              if name == '** common':
                continue
              address = int(address_str[2:], 16)
              size = int(size_str[2:], 16)
              path = None
              sym = models.Symbol(section_name, size, address=address,
                                  full_name=name, object_path=path)
              syms.append(sym)
              if merge_symbol_start_address > 0:
                merge_symbol_start_address += size
            else:
              # A normal symbol entry.
              subsection_name, address_str, size_str, path = (
                  self._ParsePossiblyWrappedParts(line, 4))
              size = int(size_str[2:], 16)
              assert subsection_name.startswith(section_name), (
                  'subsection name was: ' + subsection_name)
              mangled_name = subsection_name[prefix_len:]
              name = None
              address_str2 = None
              # Consume indented follow-up lines (demangled name, fills).
              while True:
                line = next(self._lines).rstrip()
                if not line or line.startswith(' .'):
                  break
                # clang includes ** fill, but gcc does not.
                if line.startswith(' ** fill'):
                  # Alignment explicitly recorded in map file. Rather than
                  # record padding based on these entries, we calculate it
                  # using addresses. We do this because fill lines are not
                  # present when compiling with gcc (only for clang).
                  continue
                if line.startswith(' **'):
                  break
                if name is None:
                  address_str2, name = self._ParsePossiblyWrappedParts(line, 2)

              if address_str == '0xffffffffffffffff':
                # The section needs special handling (e.g., a merge section)
                # It also generally has a large offset after it, so don't
                # penalize the subsequent symbol for this gap (e.g. a 50kb gap).
                # There seems to be no correlation between where these gaps
                # occur and the symbols they come in-between.
                # TODO(agrieve): Learn more about why this happens.
                if address_str2:
                  address = int(address_str2[2:], 16) - 1
                elif syms and syms[-1].address > 0:
                  # Merge sym with no second line showing real address.
                  address = syms[-1].end_address
                else:
                  logging.warning('First symbol of section had address -1')
                  address = 0

                merge_symbol_start_address = address + size
              else:
                address = int(address_str[2:], 16)
                # Finish off active address gap / merge section.
                if merge_symbol_start_address:
                  merge_size = address - merge_symbol_start_address
                  merge_symbol_start_address = 0
                  if merge_size > 0:
                    # merge_size == 0 for the initial symbol generally.
                    logging.debug('Merge symbol of size %d found at:\n  %r',
                                  merge_size, syms[-1])
                    # Set size=0 so that it will show up as padding.
                    sym = models.Symbol(
                        section_name, 0,
                        address=address,
                        full_name='** symbol gap %d' % symbol_gap_count)
                    symbol_gap_count += 1
                    syms.append(sym)

              #  .text.res_findResource_60
              #                 0x00178de8       0x12a obj/...
              #                 0x00178de9                res_findResource_60
              #  .text._ZN3url6ParsedC2Ev
              #                 0x0021ad62       0x2e obj/url/url/url_parse.o
              #                 0x0021ad63                url::Parsed::Parsed()
              #  .text.unlikely._ZN4base3CPUC2Ev
              #                 0x003f9d3c       0x48 obj/base/base/cpu.o
              #                 0x003f9d3d                base::CPU::CPU()
              # Prefer the mangled name when it looks like a real mangling;
              # otherwise fall back to the demangled follow-up line.
              full_name = name or mangled_name
              if mangled_name and (not name or mangled_name.startswith('_Z') or
                                   '._Z' in mangled_name):
                full_name = mangled_name

              flags = _FlagsFromMangledName(mangled_name)
              if full_name:
                if flags:
                  # Strip the flag prefix (e.g. 'unlikely.') from the name.
                  full_name = full_name[_STRIP_NAME_PREFIX[flags]:]
                else:
                  full_name = _NormalizeName(full_name)

              sym = models.Symbol(section_name, size, address=address,
                                  full_name=full_name, object_path=path,
                                  flags=flags)
              syms.append(sym)
          logging.debug('Symbol count for %s: %d', section_name,
                        len(syms) - sym_count_at_start)
      except:
        # Re-raised after logging context; bare except is deliberate so that
        # even KeyboardInterrupt reports the offending line.
        logging.error('Problem line: %r', line)
        logging.error('In section: %r', section_name)
        raise


class MapFileParserLld:
  """Parses a linker map file from LLD."""
  # Map file writer for LLD linker (for ELF):
  # https://github.com/llvm-mirror/lld/blob/HEAD/ELF/MapFile.cpp
  # v0 lines: "<Address> <Size> <Align> <indent><token>".
  # v1 lines: "<VMA> <LMA> <Size> <Align> <indent><token>".
  _LINE_RE_V0 = re.compile(r'([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
  _LINE_RE_V1 = re.compile(
      r'\s*[0-9a-f]+\s+([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
  _LINE_RE = [_LINE_RE_V0, _LINE_RE_V1]

  def __init__(self, linker_name):
    # |linker_name| is a coded linker name, e.g. 'lld_v1' or 'lld-lto_v0'.
    self._linker_name = linker_name
    self._common_symbols = []
    # Maps section name -> (address, size).
    self._section_ranges = {}

  @staticmethod
  def ParseArmAnnotations(tok):
    """Decides whether a Level 3 token is an annotation.

    Returns:
      A 2-tuple (is_annotation, next_thumb2_mode):
        is_annotation: Whether |tok| is an annotation.
        next_thumb2_mode: New |thumb2_mode| value, or None if keep old value.
    """
    # Annotations for ARM match '$t', '$d.1', but not '$_21::invoke'.
    if tok.startswith('$') and (len(tok) == 2 or
                                (len(tok) >= 3 and tok[2] == '.')):
      if tok.startswith('$t'):
        return True, True  # Is annotation, enter Thumb2 mode.
      if tok.startswith('$a'):
        return True, False  # Is annotation, enter ARM32 mode.
      return True, None  # Is annotation, keep old |thumb2_mode| value.
    return False, None  # Not annotation, keep old |thumb2_mode| value.

  def Tokenize(self, lines):
    """Generator to filter and tokenize linker map lines.

    Yields:
      (line, address, size, level, span, tok) tuples; |span| is non-None only
      for Level 3 lines.
    """
    # Extract e.g., 'lld_v0' -> 0, or 'lld-lto_v1' -> 1.
    map_file_version = int(self._linker_name.split('_v')[1])
    pattern = MapFileParserLld._LINE_RE[map_file_version]

    # A Level 3 symbol can have |size == 0| in some situations (e.g., assembly
    # code symbols). To provide better size estimates in this case, the "span"
    # of a Level 3 symbol is computed as:
    #  (A) The |address| difference compared to the next Level 3 symbol.
    #  (B) If the Level 3 symbol is the last among Level 3 lines nested under a
    #      Level 2 line: The difference between the Level 3 symbol's |address|
    #      and the containing Level 2 line's end address.
    # To handle (A), |lines| is visited using a one-step lookahead, using
    # |sentinel| to handle the last line. To handle (B), |level2_end_address|
    # is computed for each Level 2 line.
    sentinel = '0 0 0 0 THE_END'
    assert pattern.match(sentinel)
    level2_end_address = None
    thumb2_mode = False
    (line, address, size, level, tok) = (None, None, None, None, None)
    for next_line in itertools.chain(lines, (sentinel,)):
      m = pattern.match(next_line)
      if m is None:
        continue
      next_address = int(m.group(1), 16)
      next_size = int(m.group(2), 16)
      # Levels are encoded as 8-space indents in the token column.
      next_level = (len(m.group(4)) // 8) + 1  # Add 1 to agree with comments.
      next_tok = m.group(5)

      if next_level == 3:
        assert level >= 2, 'Cannot jump from Level 1 to Level 3.'
        # Detect annotations. If found, maybe update |thumb2_mode|, then skip.
        (is_annotation, next_thumb2_mode) = (
            MapFileParserLld.ParseArmAnnotations(next_tok))
        if is_annotation:
          if next_thumb2_mode:
            thumb2_mode = next_thumb2_mode
          continue  # Skip annotations.
        if thumb2_mode:
          # Adjust odd address to even. Alignment is not guaranteed for all
          # symbols (e.g., data, or x86), so this is judiciously applied.
          next_address &= ~1
      else:
        thumb2_mode = False  # Resets on leaving Level 3.

      if address is not None:
        span = None
        if level == 3:
          span = next_address if next_level == 3 else level2_end_address
          span -= address
        elif level == 2:
          level2_end_address = address + size
        yield (line, address, size, level, span, tok)

      line = next_line
      address = next_address
      size = next_size
      level = next_level
      tok = next_tok

  def Parse(self, lines):
    """Parses a linker map file.

    Args:
      lines: Iterable of lines, the first of which has been consumed to
      identify file type.

    Returns:
      A tuple of (section_ranges, symbols, extras).
    """
    # Newest format:
    #     VMA      LMA     Size Align Out     In      Symbol
    #     194      194       13     1 .interp
    #     194      194       13     1         <internal>:(.interp)
    #     1a8      1a8     22d8     4 .ARM.exidx
    #     1b0      1b0        8     4         obj/sandbox/syscall.o:(.ARM.exidx)
    #     400      400   123400    64 .text
    #     600      600       14     4         ...:(.text.OUTLINED_FUNCTION_0)
    #     600      600        0     1                 $x.3
    #     600      600       14     1                 OUTLINED_FUNCTION_0
    #  123800   123800    20000   256 .rodata
    #  123800   123800       4      4         ...:o:(.rodata._ZN3fooE.llvm.1234)
    #  123800   123800       4      1                 foo (.llvm.1234)
    #  123804   123804       4      4         ...:o:(.rodata.bar.llvm.1234)
    #  123804   123804       4      1                 bar.llvm.1234
    # Older format:
    # Address          Size             Align Out     In      Symbol
    # 00000000002002a8 000000000000001c     1 .interp
    # 00000000002002a8 000000000000001c     1         <internal>:(.interp)
    # ...
    # 0000000000201000 0000000000000202    16 .text
    # 0000000000201000 000000000000002a     1         /[...]/crt1.o:(.text)
    # 0000000000201000 0000000000000000     0                 _start
    # 000000000020102a 0000000000000000     1         /[...]/crti.o:(.text)
    # 0000000000201030 00000000000000bd    16         /[...]/crtbegin.o:(.text)
    # 0000000000201030 0000000000000000     0             deregister_tm_clones
    # 0000000000201060 0000000000000000     0             register_tm_clones
    # 00000000002010a0 0000000000000000     0             __do_global_dtors_aux
    # 00000000002010c0 0000000000000000     0             frame_dummy
    # 00000000002010ed 0000000000000071     1         a.o:(.text)
    # 00000000002010ed 0000000000000071     0             main
    syms = []
    cur_section = None
    cur_section_is_useful = False
    # Flags derived from the current Level 2 line's section-name prefix.
    # Initialized here because the CFI jump table branch below creates a
    # symbol without recomputing flags; previously that branch could read an
    # undefined (NameError) or stale value from a prior section.
    cur_flags = 0
    promoted_name_count = 0
    # |is_partial| indicates that an eligible Level 3 line should be used to
    # update |syms[-1].full_name| instead of creating a new symbol.
    is_partial = False
    # Assembly code can create consecutive Level 3 lines with |size == 0|.
    # These lines can represent
    #  (1) assembly functions (should form symbol), or
    #  (2) assembly labels (should NOT form symbol).
    # It seems (2) correlates with the presence of a leading Level 3 line with
    # |size > 0|. This gives rise to the following strategy: Each symbol S from
    # a Level 3 line suppresses Level 3 lines with |address| less than
    # |next_usable_address := S.address + S.size|.
    next_usable_address = 0

    # For Thin-LTO, a map from each address to the Thin-LTO cache file. This
    # provides hints downstream to identify object_paths for .L.ref.tmp
    # symbols, but is not useful in the final output. Therefore it's stored
    # separately, instead of being in Symbol.
    thin_map = {}

    tokenizer = self.Tokenize(lines)

    in_partitions = False
    in_jump_table = False
    jump_tables_count = 0
    jump_entries_count = 0

    for (line, address, size, level, span, tok) in tokenizer:
      # Level 1 data match the "Out" column. They specify sections or
      # PROVIDE_HIDDEN lines.
      if level == 1:
        # Ignore sections that belong to feature library partitions. Seeing a
        # partition name is an indicator that we've entered a list of feature
        # partitions. After these, a single .part.end section will follow to
        # reserve memory at runtime. Seeing the .part.end section also marks
        # the end of partition sections in the map file.
        if tok.endswith('_partition'):
          in_partitions = True
        elif tok == '.part.end':
          # Note that we want to retain .part.end section, so it's fine to
          # restart processing on this section, rather than the next one.
          in_partitions = False

        if in_partitions:
          # For now, completely ignore feature partitions.
          cur_section = None
          cur_section_is_useful = False
        else:
          if not tok.startswith('PROVIDE_HIDDEN'):
            self._section_ranges[tok] = (address, size)
          cur_section = tok
          # E.g., Want to convert "(.text._name)" -> "_name" later.
          mangled_start_idx = len(cur_section) + 2
          cur_section_is_useful = (
              cur_section in models.BSS_SECTIONS
              or cur_section in (models.SECTION_RODATA, models.SECTION_TEXT)
              or cur_section.startswith(models.SECTION_DATA))

      elif cur_section_is_useful:
        # Level 2 data match the "In" column. They specify object paths and
        # section names within objects, or '<internal>:...'.
        if level == 2:
          # E.g., 'path.o:(.text._name)' => ['path.o', '(.text._name)'].
          cur_obj, paren_value = tok.split(':')

          in_jump_table = '.L.cfi.jumptable' in paren_value
          if in_jump_table:
            # Store each CFI jump table as a Level 2 symbol, whose Level 3
            # details are discarded.
            jump_tables_count += 1
            cur_obj = ''  # Replaces 'lto.tmp' to prevent problem later.
            mangled_name = '** CFI jump table'
            cur_flags = 0  # Jump tables carry no hot/cold prefix flags.
          else:
            # E.g., '(.text.unlikely._name)' -> '_name'.
            mangled_name = paren_value[mangled_start_idx:-1]
            cur_flags = _FlagsFromMangledName(mangled_name)
            is_partial = True
            # As of 2017/11 LLD does not distinguish merged strings from other
            # merged data. Feature request is filed under:
            # https://bugs.llvm.org/show_bug.cgi?id=35248
            if cur_obj == '<internal>':
              if cur_section == '.rodata':
                # Treat all <internal> sections within .rodata as string
                # literals. Some may hold numeric constants or other data, but
                # there is currently no way to distinguish them.
                mangled_name = '** lld merge strings'
              else:
                # e.g. <internal>:(.text.thunk)
                mangled_name = '** ' + mangled_name

              is_partial = False
              cur_obj = None
            elif (cur_obj == 'lto.tmp' or 'thinlto-cache' in cur_obj
                  or '.lto.' in cur_obj):
              thin_map[address] = os.path.basename(cur_obj)
              cur_obj = None

          # Create a symbol here since there may be no ensuing Level 3 lines.
          # But if there are, then the symbol can be modified later as sym[-1].
          sym = models.Symbol(cur_section, size, address=address,
                              full_name=mangled_name, object_path=cur_obj,
                              flags=cur_flags)
          syms.append(sym)

          # Level 3 |address| is nested under Level 2, don't add |size|.
          next_usable_address = address

        # Level 3 data match the "Symbol" column. They specify symbol names or
        # special names such as '.L_MergeGlobals'. Annotations such as '$d',
        # '$t.42' also appear at Level 3, but they are consumed by |tokenizer|,
        # so don't appear here.
        elif level == 3:
          # Handle .L.cfi.jumptable.
          if in_jump_table:
            # Level 3 entries in CFI jump tables are thunks with mangled names.
            # Extracting them as symbols is not worthwhile; we only store the
            # Level 2 symbol, and print the count for verbose output. For
            # counting, '__typeid_' entries are excluded since they're likely
            # just annotations.
            if not tok.startswith('__typeid_'):
              jump_entries_count += 1
            continue

          # Ignore anything with '.L_MergedGlobals' prefix. This seems to only
          # happen for ARM (32-bit) builds.
          if tok.startswith('.L_MergedGlobals'):
            continue

          # Use |span| to decide whether to use a Level 3 line for Symbols.
          # This is useful for two purposes:
          # * This is a better indicator than |size|, which can be 0 for
          #   assembly functions.
          # * If multiple Level 3 lines have the same starting address, this
          #   causes all but the last line to have |span > 0|. This dedups
          #   lines with identical symbol names (why do they exist?). Note that
          #   this also skips legitimate aliases, but that's desired because
          #   nm.py (downstream) assumes no aliases already exist.
          if span > 0:
            stripped_tok = demangle.StripLlvmPromotedGlobalNames(tok)
            if len(tok) != len(stripped_tok):
              promoted_name_count += 1
              tok = stripped_tok
            tok = _NormalizeName(tok)

            # Handle special case where a partial symbol consumes bytes before
            # the first Level 3 symbol.
            if is_partial and syms[-1].address < address:
              # Truncate the partial symbol and leave it without |full_name|.
              # The data from the current line will form a new symbol.
              syms[-1].size = address - syms[-1].address
              next_usable_address = address
              is_partial = False

            if is_partial:
              syms[-1].full_name = tok
              syms[-1].size = size if size > 0 else min(syms[-1].size, span)
              next_usable_address = address + syms[-1].size
              is_partial = False
            elif address >= next_usable_address:
              if tok.startswith('__typeid_'):
                assert size == 1
                if tok.endswith('_byte_array'):
                  # CFI byte array table: |size| is inaccurate, so use |span|.
                  size_to_use = span
                else:
                  # Likely '_global_addr' or '_unique_member'. These should be:
                  # * Skipped since they're in CFI tables.
                  # * Suppressed (via |next_usable_address|) by another Level 3
                  #   symbol.
                  # Anything that makes it here would be an anomaly worthy of
                  # investigation, so print warnings.
                  logging.warning('Unrecognized __typeid_ symbol at %08X',
                                  address)
                  continue
              else:
                # Prefer |size|, and only fall back to |span| if |size == 0|.
                size_to_use = size if size > 0 else span
              sym = models.Symbol(cur_section, size_to_use, address=address,
                                  full_name=tok, flags=cur_flags)
              syms.append(sym)

              # Suppress symbols with overlapping |address|. This eliminates
              # labels from assembly sources.
              next_usable_address = address + size_to_use
              if cur_obj is not None:
                syms[-1].object_path = cur_obj

        else:
          logging.error('Problem line: %r', line)

    if promoted_name_count:
      logging.info('Found %d promoted global names', promoted_name_count)
    if jump_tables_count:
      logging.info('Found %d CFI jump tables with %d total entries',
                   jump_tables_count, jump_entries_count)
    return self._section_ranges, syms, {'thin_map': thin_map}


def _DetectLto(lines):
  """Scans LLD linker map file and returns whether LTO was used."""
  # It's assumed that the first line in |lines| was consumed to determine that
  # LLD was used. Seek 'thinlto-cache' prefix or the string '.lto' within an
  # "indicator section" as indicator for LTO.
  found_indicator_section = False
  # Potential names of "main section". Only one gets used.
  indicator_section_set = set(['.rodata', '.ARM.exidx'])
  start_pos = -1
  for line in lines:
    # Shortcut to avoid regex: The first line seen (second line in file) should
    # start a section, and start with '.', e.g.:
    #     194      194       13     1 .interp
    # Assign |start_pos| as position of '.', and trim everything before!
    if start_pos < 0:
      start_pos = line.index('.')
    if len(line) < start_pos:
      continue
    line = line[start_pos:]
    tok = line.lstrip()  # Allow whitespace at right.
    indent_size = len(line) - len(tok)
    if indent_size == 0:  # Section change.
      if found_indicator_section:  # Exit if just visited "main section".
        break
      if tok.strip() in indicator_section_set:
        found_indicator_section = True
    elif indent_size == 8:
      if found_indicator_section:
        if tok.startswith('thinlto-cache') or '.lto.' in tok:
          return True
  return False


def _DetectLinkerName(lines):
  """Heuristic linker detection from partial scan of the linker map.

  Args:
    lines: Iterable of lines from the linker map.

  Returns:
    A coded linker name.
  """
  first_line = next(lines)

  if first_line.startswith('Address'):
    return 'lld-lto_v0' if _DetectLto(lines) else 'lld_v0'

  if first_line.lstrip().startswith('VMA'):
    return 'lld-lto_v1' if _DetectLto(lines) else 'lld_v1'

  if first_line.startswith('Archive member'):
    return 'gold'

  raise Exception('Invalid map file: ' + first_line)


def ParseLines(lines):
  """Parses a linker map file given an iterable of its lines.

  Args:
    lines: Iterable of lines from the linker map, header line included.

  Returns:
    A tuple of (section_ranges, symbols, extras).

  Raises:
    Exception: If the linker cannot be identified, or is unsupported.
  """
  # Buffer 1000 lines for format detection.
  header = list(itertools.islice(lines, 1000))
  # Re-attach the buffered lines so the parser sees the whole stream.
  lines = itertools.chain(header, lines)
  linker_name = _DetectLinkerName(iter(header))
  logging.info('Detected map file of type %s', linker_name)
  if linker_name.startswith('lld'):
    inner_parser = MapFileParserLld(linker_name)
  elif linker_name == 'gold':
    inner_parser = MapFileParserGold()
  else:
    # In practice unreachable: _DetectLinkerName() returns only the names
    # handled above, or raises.
    raise Exception('.map file is from an unsupported linker.')

  next(lines)  # Consume the first line of headers.
  section_ranges, syms, extras = inner_parser.Parse(lines)
  for sym in syms:
    if sym.object_path and not sym.object_path.endswith(')'):
      # Don't want '' to become '.'.
      # Thin archives' paths will get fixed in |ar.CreateThinObjectPath|.
      sym.object_path = os.path.normpath(sym.object_path)
  return section_ranges, syms, extras


def ParseFile(path):
  """Parses the linker map file located at |path|.

  The file is opened via _OpenMaybeGzAsText(), which presumably also accepts
  a gzipped map (judging by the helper's name).

  Returns:
    A tuple of (section_ranges, symbols, extras).
  """
  with _OpenMaybeGzAsText(path) as map_file:
    return ParseLines(map_file)


def DeduceObjectPathsFromThinMap(raw_symbols, extras):
  """Uses Thin-LTO object paths to find object_paths of symbols.

  Mutates pathless symbols in |raw_symbols| in place when a unique object
  path can be deduced via the |address| -> |thin_obj| map in |extras|.
  """
  thin_map = extras.get('thin_map', None)  # |address| -> |thin_obj|.
  if not thin_map:  # Covers both missing and empty.
    logging.info('No thin-object-path found: Skipping object path deduction.')
    return

  # Invert the relationship: |thin_obj| -> set of known |object_path|s.
  logging.info('Building map of thin-object-path -> object path.')
  paths_by_thin_obj = collections.defaultdict(set)
  for sym in raw_symbols:
    if not sym.object_path:
      continue
    thin_obj = thin_map.get(sym.address)
    if thin_obj:
      paths_by_thin_obj[thin_obj].add(sym.object_path)

  # For each pathless symbol, translate |address| -> |thin_obj| ->
  # |object_paths|, and assign when the candidate is unique. Stats are
  # bucketed by min(len(object_paths), 2).
  # Example symbols this happens with: ".L.ref.tmp", "** outlined function".
  logging.info('Assigning object paths to using ThinLTO paths.')
  symbol_counts = [0, 0, 0]
  pss_totals = [0, 0, 0]
  for sym in raw_symbols:
    if sym.object_path:
      continue
    thin_obj = thin_map.get(sym.address)
    if not thin_obj:
      continue  # Ignore non-native symbols.
    candidates = paths_by_thin_obj.get(thin_obj)
    if candidates is None:
      bucket = 0
    else:
      bucket = min(len(candidates), 2)  # 2+ maps to 2.
      # Path aliases for bucket == 2 would not necessarily be correct: some
      # *other* symbol from the same .o file may carry the alias, but not
      # necessarily this one. So only assign unambiguous matches.
      if bucket == 1:
        sym.object_path = next(iter(candidates))
    symbol_counts[bucket] += 1
    pss_totals[bucket] += sym.pss

  # As of Mar 2019:
  #   No match: 2 symbols with total PSS = 20
  #   Assigned (1 object path): 1098 symbols with total PSS = 55454
  #   Ambiguous (2+ object paths): 2315 symbols with total PSS = 41941
  logging.info('Object path deduction results for pathless symbols:')
  logging.info('  No match: %d symbols with total PSS = %d', symbol_counts[0],
               pss_totals[0])
  logging.info('  Assigned (1 object path): %d symbols with total PSS = %d',
               symbol_counts[1], pss_totals[1])
  logging.info('  Ambiguous (2+ object paths): %d symbols with total PSS = %d',
               symbol_counts[2], pss_totals[2])


def main():
  """Command-line entry point: parses a map file, then dumps or explores it."""
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('linker_file', type=os.path.realpath)
  arg_parser.add_argument(
      '-v',
      '--verbose',
      default=0,
      action='count',
      help='Verbose level (multiple times for more)')
  arg_parser.add_argument('--dump', action='store_true')
  options = arg_parser.parse_args()

  # Each -v raises verbosity by one level: WARNING -> INFO -> DEBUG.
  logging.basicConfig(
      level=logging.WARNING - options.verbose * 10,
      format='%(levelname).1s %(relativeCreated)6d %(message)s')

  section_ranges, syms, extras = ParseFile(options.linker_file)

  if not options.dump:
    # Enter an interactive shell with the parsed data in scope.
    readline.parse_and_bind('tab: complete')
    banner = '\n'.join([
        '*' * 80,
        'Variables:',
        '  section_ranges: Map from section name to (address, size).',
        '  syms: Raw symbols parsed from the linker map file.',
        '  extras: Format-specific extra data.',
        '*' * 80,
    ])
    shell_vars = {
        'section_ranges': section_ranges,
        'syms': syms,
        'extras': extras
    }
    code.InteractiveConsole(shell_vars).interact(banner)
  else:
    print(section_ranges)
    for sym in syms:
      print(sym)


# Allow use both as an importable module and as a standalone script.
if __name__ == '__main__':
  main()