File: utils.py

package info (click to toggle)
pyinstaller 6.13.0%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 11,520 kB
  • sloc: python: 41,347; ansic: 11,334; makefile: 176; sh: 136; xml: 19
file content (834 lines) | stat: -rw-r--r-- 37,855 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
#-----------------------------------------------------------------------------
# Copyright (c) 2005-2023, PyInstaller Development Team.
#
# Distributed under the terms of the GNU General Public License (version 2
# or later) with exception for distributing the bootloader.
#
# The full license is in the file COPYING.txt, distributed with this software.
#
# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
#-----------------------------------------------------------------------------

import fnmatch
import glob
import hashlib
import io
import marshal
import os
import pathlib
import platform
import shutil
import struct
import subprocess
import sys
import zipfile

from PyInstaller import compat
from PyInstaller import log as logging
from PyInstaller.compat import EXTENSION_SUFFIXES, is_darwin, is_win, is_linux
from PyInstaller.config import CONF
from PyInstaller.exceptions import InvalidSrcDestTupleError
from PyInstaller.utils import misc

if is_win:
    from PyInstaller.utils.win32 import versioninfo

if is_darwin:
    import PyInstaller.utils.osx as osxutils

logger = logging.getLogger(__name__)

# -- Helpers for checking guts.
#
# NOTE: by _GUTS it is meant intermediate files and data structures that PyInstaller creates for bundling files and
# creating final executable.


def _check_guts_eq(attr_name, old_value, new_value, last_build):
    """
    Rebuild is required if values differ.
    """
    if old_value != new_value:
        logger.info("Building because %s changed", attr_name)
        return True
    return False


def _check_guts_toc_mtime(attr_name, old_toc, new_toc, last_build):
    """
    Guts check: a rebuild is required if any file listed in the old TOC has been modified after the last build.

    Use this for calculated/analysed values read from cache.
    """
    for _dest_name, src_name, _typecode in old_toc:
        if misc.mtime(src_name) <= last_build:
            continue
        logger.info("Building because %s changed", src_name)
        return True
    return False


def _check_guts_toc(attr_name, old_toc, new_toc, last_build):
    """
    Guts check: a rebuild is required if either the TOC content changed, or any file listed in the old TOC has been
    modified after the last build.

    Use this for input parameters.
    """
    if _check_guts_eq(attr_name, old_toc, new_toc, last_build):
        return True
    return _check_guts_toc_mtime(attr_name, old_toc, new_toc, last_build)


def add_suffix_to_extension(dest_name, src_name, typecode):
    """
    Given a TOC entry (dest_name, src_name, typecode), return the entry with dest_name adjusted so that an EXTENSION
    carries the source file's full shared-library suffix. Entries with any other typecode pass through unchanged.
    """
    if typecode != 'EXTENSION':
        # Only extension modules need their destination name adjusted.
        return dest_name, src_name, typecode

    if src_name.endswith(dest_name):
        # dest_name already matches the tail of src_name, so this entry was processed earlier.
        return dest_name, src_name, typecode

    # Turn the dotted module name into a relative path; this places C extensions in the Python-standard location.
    dest_name = dest_name.replace('.', os.sep)

    # Append the source file's library suffix, unless the destination already carries a recognized extension suffix
    # (rare, but possible).
    if os.path.splitext(dest_name)[1] not in EXTENSION_SUFFIXES:
        base_name = os.path.basename(dest_name)
        assert '.' not in base_name
        # For suffixes such as ``libzmq.cp36-win_amd64.pyd``, ``os.path.splitext`` would yield only ``.pyd``; instead,
        # take everything in the source basename past the module's base name.
        dest_name = dest_name + os.path.basename(src_name)[len(base_name):]

    return dest_name, src_name, typecode


def process_collected_binary(
    src_name,
    dest_name,
    use_strip=False,
    use_upx=False,
    upx_exclude=None,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
    strict_arch_validation=False
):
    """
    Process the collected binary using strip or UPX (or both), and apply any platform-specific processing. On macOS,
    this rewrites the library paths in the headers, and (re-)signs the binary. On-disk cache is used to avoid processing
    the same binary with same options over and over.

    In addition to given arguments, this function also uses CONF['cachedir'] and CONF['upx_dir'].

    Parameters
    ----------
    src_name : str
        Path to the collected binary on disk.
    dest_name : str
        Destination (in-application) name of the binary; also serves as the cache-index identifier.
    use_strip : bool
        Run ``strip`` on the binary.
    use_upx : bool
        Compress the binary with UPX (subject to the exclusion/auto-disable rules below).
    upx_exclude : list | None
        UPX exclusion patterns, matched against ``src_name``.
    target_arch : str | None
        macOS target architecture.
    codesign_identity : str | None
        macOS code-signing identity; ``None`` implies ad-hoc signing.
    entitlements_file : str | None
        Optional macOS entitlements file used during signing.
    strict_arch_validation : bool
        If True, re-raise macOS architecture-validation errors instead of collecting the binary as-is.

    Returns
    -------
    str
        Path of the file to collect: either the original ``src_name`` (no processing needed) or the processed copy
        in the binary cache.
    """
    from PyInstaller.config import CONF

    # We need to use cache in the following scenarios:
    #  * extra binary processing due to use of `strip` or `upx`
    #  * building on macOS, where we need to rewrite library paths in binaries' headers and (re-)sign the binaries.
    if not use_strip and not use_upx and not is_darwin:
        return src_name

    # Match against provided UPX exclude patterns.
    upx_exclude = upx_exclude or []
    if use_upx:
        src_path = pathlib.PurePath(src_name)
        for upx_exclude_entry in upx_exclude:
            # pathlib.PurePath.match() matches from right to left, and supports * wildcard, but does not support the
            # "**" syntax for directory recursion. Case sensitivity follows the OS default.
            if src_path.match(upx_exclude_entry):
                logger.info("Disabling UPX for %s due to match in exclude pattern: %s", src_name, upx_exclude_entry)
                use_upx = False
                break

    # Additional automatic disablement rules for UPX and strip.

    # On Windows, avoid using UPX with binaries that have control flow guard (CFG) enabled.
    if use_upx and is_win and versioninfo.pefile_check_control_flow_guard(src_name):
        logger.info('Disabling UPX for %s due to CFG!', src_name)
        use_upx = False

    # Avoid using UPX with Qt plugins, as it strips the data required by the Qt plugin loader.
    if use_upx and misc.is_file_qt_plugin(src_name):
        logger.info('Disabling UPX for %s due to it being a Qt plugin!', src_name)
        use_upx = False

    # On linux, if a binary has an accompanying HMAC or CHK file, avoid modifying it in any way.
    if (use_upx or use_strip) and is_linux:
        src_path = pathlib.Path(src_name)
        hmac_path = src_path.with_name(f".{src_path.name}.hmac")
        chk_path = src_path.with_suffix(".chk")
        if hmac_path.is_file():
            logger.info('Disabling UPX and/or strip for %s due to accompanying .hmac file!', src_name)
            use_upx = use_strip = False
        elif chk_path.is_file():
            logger.info('Disabling UPX and/or strip for %s due to accompanying .chk file!', src_name)
            use_upx = use_strip = False
        del src_path, hmac_path, chk_path

    # Exit early if no processing is required after above rules are applied.
    if not use_strip and not use_upx and not is_darwin:
        return src_name

    # Prepare cache directory path. Cache is tied to python major/minor version, but also to various processing options.
    pyver = f'py{sys.version_info[0]}{sys.version_info[1]}'
    arch = platform.architecture()[0]
    cache_dir = os.path.join(
        CONF['cachedir'],
        f'bincache{use_strip:d}{use_upx:d}{pyver}{arch}',
    )
    if target_arch:
        cache_dir = os.path.join(cache_dir, target_arch)
    if is_darwin:
        # Separate by codesign identity
        if codesign_identity:
            # Compute hex digest of codesign identity string to prevent issues with invalid characters.
            csi_hash = hashlib.sha256(codesign_identity.encode('utf-8'))
            cache_dir = os.path.join(cache_dir, csi_hash.hexdigest())
        else:
            cache_dir = os.path.join(cache_dir, 'adhoc')  # ad-hoc signing
        # Separate by entitlements
        if entitlements_file:
            # Compute hex digest of entitlements file contents
            with open(entitlements_file, 'rb') as fp:
                ef_hash = hashlib.sha256(fp.read())
            cache_dir = os.path.join(cache_dir, ef_hash.hexdigest())
        else:
            cache_dir = os.path.join(cache_dir, 'no-entitlements')
    os.makedirs(cache_dir, exist_ok=True)

    # Load cache index, if available
    cache_index_file = os.path.join(cache_dir, "index.dat")
    try:
        cache_index = misc.load_py_data_struct(cache_index_file)
    except FileNotFoundError:
        cache_index = {}
    except Exception:
        # Tell the user they may want to fix their cache... However, do not delete it for them; if it keeps getting
        # corrupted, we will never find out.
        logger.warning("PyInstaller bincache may be corrupted; use pyinstaller --clean to fix it.")
        raise

    # Look up the file in cache; use case-normalized destination name as identifier.
    cached_id = os.path.normcase(dest_name)
    cached_name = os.path.join(cache_dir, dest_name)
    src_digest = _compute_file_digest(src_name)

    if cached_id in cache_index:
        # If digest matches to the cached digest, return the cached file...
        if src_digest == cache_index[cached_id]:
            return cached_name

        # ... otherwise remove it.
        os.remove(cached_name)

    # Ensure parent path exists
    os.makedirs(os.path.dirname(cached_name), exist_ok=True)

    # Use `shutil.copyfile` to copy the file with default permissions bits, then manually set executable
    # bits. This way, we avoid copying permission bits and metadata from the original file, which might be too
    # restrictive for further processing (read-only permissions, immutable flag on FreeBSD, and so on).
    shutil.copyfile(src_name, cached_name)
    os.chmod(cached_name, 0o755)

    # Apply strip
    if use_strip:
        strip_options = []
        if is_darwin:
            # The default strip behavior breaks some shared libraries under macOS.
            strip_options = ["-S"]  # -S = strip only debug symbols.

        cmd = ["strip", *strip_options, cached_name]
        logger.info("Executing: %s", " ".join(cmd))
        try:
            p = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                check=True,
                errors='ignore',
                encoding='utf-8',
            )
            logger.debug("Output from strip command:\n%s", p.stdout)
        except subprocess.CalledProcessError as e:
            logger.warning("Failed to run strip on %r!", cached_name, exc_info=True)
            logger.warning("Output from strip command:\n%s", e.stdout)
        except Exception:
            logger.warning("Failed to run strip on %r!", cached_name, exc_info=True)

    # Apply UPX
    if use_upx:
        upx_exe = 'upx'
        upx_dir = CONF['upx_dir']
        if upx_dir:
            upx_exe = os.path.join(upx_dir, upx_exe)

        upx_options = [
            # Do not compress icons, so that they can still be accessed externally.
            '--compress-icons=0',
            # Use LZMA compression.
            '--lzma',
            # Quiet mode.
            '-q',
        ]
        if is_win:
            # Binaries built with Visual Studio 7.1 require --strip-loadconf or they will not compress.
            upx_options.append('--strip-loadconf')

        cmd = [upx_exe, *upx_options, cached_name]
        logger.info("Executing: %s", " ".join(cmd))
        try:
            p = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                check=True,
                errors='ignore',
                encoding='utf-8',
            )
            logger.debug("Output from upx command:\n%s", p.stdout)
        except subprocess.CalledProcessError as e:
            # NOTE: message fixed from "Failed to upx strip on %r!" (copy-paste from the strip section above).
            logger.warning("Failed to run upx on %r!", cached_name, exc_info=True)
            logger.warning("Output from upx command:\n%s", e.stdout)
        except Exception:
            logger.warning("Failed to run upx on %r!", cached_name, exc_info=True)

    # On macOS, we need to modify the given binary's paths to the dependent libraries, in order to ensure they are
    # relocatable and always refer to location within the frozen application. Specifically, we make all dependent
    # library paths relative to @rpath, and set @rpath to point to the top-level application directory, relative to
    # the binary's location (i.e., @loader_path).
    #
    # While modifying the headers invalidates existing signatures, we avoid removing them in order to speed things up
    # (and to avoid potential bugs in the codesign utility, like the one reported on macOS 10.13 in #6167).
    # The forced re-signing at the end should take care of the invalidated signatures.
    if is_darwin:
        try:
            osxutils.binary_to_target_arch(cached_name, target_arch, display_name=src_name)
            #osxutils.remove_signature_from_binary(cached_name)  # Disabled as per comment above.
            target_rpath = str(
                pathlib.PurePath('@loader_path', *['..' for level in pathlib.PurePath(dest_name).parent.parts])
            )
            osxutils.set_dylib_dependency_paths(cached_name, target_rpath)
            osxutils.sign_binary(cached_name, codesign_identity, entitlements_file)
        except osxutils.InvalidBinaryError:
            # Raised by osxutils.binary_to_target_arch when the given file is not a valid macOS binary (for example,
            # a linux .so file; see issue #6327). The error prevents any further processing, so just ignore it.
            pass
        except osxutils.IncompatibleBinaryArchError:
            # Raised by osxutils.binary_to_target_arch when the given file does not contain (all) required arch slices.
            # Depending on the strict validation mode, re-raise or swallow the error.
            #
            # Strict validation should be enabled only for binaries where the architecture *must* match the target one,
            # i.e., the extension modules. Everything else is pretty much a gray area, for example:
            #  * a universal2 extension may have its x86_64 and arm64 slices linked against distinct single-arch/thin
            #    shared libraries
            #  * a collected executable that is launched by python code via a subprocess can be x86_64-only, even though
            #    the actual python code is running on M1 in native arm64 mode.
            if strict_arch_validation:
                raise
            logger.debug("File %s failed optional architecture validation - collecting as-is!", src_name)
        except Exception as e:
            raise SystemError(f"Failed to process binary {cached_name!r}!") from e

    # Update cache index
    cache_index[cached_id] = src_digest
    misc.save_py_data_struct(cache_index_file, cache_index)

    return cached_name


def _compute_file_digest(filename):
    hasher = hashlib.sha1()
    with open(filename, "rb") as fp:
        for chunk in iter(lambda: fp.read(16 * 1024), b""):
            hasher.update(chunk)
    return bytearray(hasher.digest())


def _check_path_overlap(path):
    """
    Verify that the given output path does not overlap with WORKPATH or SPECPATH (i.e., that neither WORKPATH nor
    SPECPATH starts with the path), which could be caused by a faulty hand-edited specfile.

    Raise SystemExit on overlap; return True otherwise.
    """
    from PyInstaller.config import CONF

    error_count = 0
    if CONF['workpath'].startswith(path):
        logger.error('Specfile error: The output path "%s" contains WORKPATH (%s)', path, CONF['workpath'])
        error_count += 1
    if CONF['specpath'].startswith(path):
        logger.error('Specfile error: The output path "%s" contains SPECPATH (%s)', path, CONF['specpath'])
        error_count += 1

    if error_count > 0:
        # Abort the build; the user needs to fix their specfile.
        raise SystemExit(
            'ERROR: Please edit/recreate the specfile (%s) and set a different output name (e.g. "dist").' %
            CONF['spec']
        )
    return True


def _make_clean_directory(path):
    """
    (Re)create an empty directory at the given path, removing any pre-existing file or directory tree first
    (subject to the path-overlap check and, for directory trees, user confirmation via _rmtree).
    """
    if not _check_path_overlap(path):
        # _check_path_overlap() either raises SystemExit or returns True; guard anyway.
        return

    if os.path.isfile(path) or os.path.isdir(path):
        try:
            os.remove(path)
        except OSError:
            # Path is a directory (or otherwise not removable as a single file); remove the whole tree.
            _rmtree(path)

    os.makedirs(path, exist_ok=True)


def _rmtree(path):
    """
    Remove the given directory and all its contents. Unless the -y/--noconfirm option is set, the user is asked for
    confirmation first; when running non-interactively without -y, the build is aborted instead.
    """
    from PyInstaller.config import CONF

    noconfirm = CONF['noconfirm']
    if noconfirm:
        choice = 'y'
    elif not sys.stdout.isatty():
        # Non-interactive session and no -y option; refuse to remove anything.
        raise SystemExit(
            'ERROR: The output directory "%s" is not empty. Please remove all its contents or use the -y option (remove'
            ' output directory without confirmation).' % path
        )
    else:
        choice = input(
            'WARNING: The output directory "%s" and ALL ITS CONTENTS will be REMOVED! Continue? (y/N)' % path
        )

    if choice.strip().lower() != 'y':
        raise SystemExit('User aborted')

    if not noconfirm:
        print("On your own risk, you can use the option `--noconfirm` to get rid of this question.")
    logger.info('Removing dir %s', path)
    shutil.rmtree(path)


# TODO Refactor to prohibit empty target directories. As the docstring below documents, this function currently permits
# the second item of each 2-tuple in "hook.datas" to be the empty string, in which case the target directory defaults to
# the source directory's basename. However, this functionality is very fragile and hence bad. Instead:
#
# * An exception should be raised if such item is empty.
# * All hooks currently passing the empty string for such item (e.g.,
#   "hooks/hook-babel.py", "hooks/hook-matplotlib.py") should be refactored
#   to instead pass such basename.
def format_binaries_and_datas(binaries_or_datas, workingdir=None):
    """
    Convert the passed list of hook-style 2-tuples into a returned set of `TOC`-style 2-tuples.

    Elements of the passed list are 2-tuples `(source_dir_or_glob, target_dir)`.
    Elements of the returned set are 2-tuples `(target_file, source_file)`.
    For backwards compatibility, the order of elements in the former tuples are the reverse of the order of elements in
    the latter tuples!

    Parameters
    ----------
    binaries_or_datas : list
        List of hook-style 2-tuples (e.g., the top-level `binaries` and `datas` attributes defined by hooks) whose:
        * The first element is either:
          * A glob matching only the absolute or relative paths of source non-Python data files.
          * The absolute or relative path of a source directory containing only source non-Python data files.
        * The second element is the relative path of the target directory into which these source files will be
          recursively copied.

        If the optional `workingdir` parameter is passed, source paths may be either absolute or relative; else, source
        paths _must_ be absolute.
    workingdir : str
        Optional absolute path of the directory to which all relative source paths in the `binaries_or_datas`
        parameter will be prepended by (and hence converted into absolute paths) _or_ `None` if these paths are to be
        preserved as relative. Defaults to `None`.

    Returns
    ----------
    set
        Set of `TOC`-style 2-tuples whose:
        * First element is the absolute or relative path of a target file.
        * Second element is the absolute or relative path of the corresponding source file to be copied to this target
          file.
    """
    toc_entries = set()

    for src_entry, dest_dir in binaries_or_datas:
        # Reject empty source paths; these are typically the result of errors, and would cause implicit collection of
        # the whole current working directory, which is never a good idea.
        if not src_entry:
            raise InvalidSrcDestTupleError(
                (src_entry, dest_dir),
                "Empty SRC is not allowed when adding binary and data files, as it would result in collection of the "
                "whole current working directory."
            )
        if not dest_dir:
            raise InvalidSrcDestTupleError(
                (src_entry, dest_dir),
                "Empty DEST_DIR is not allowed - to collect files into application's top-level directory, use "
                f"{os.curdir!r}."
            )
        # Reject absolute target paths, as well as target paths that would end up pointing outside of the
        # application's top-level directory.
        if os.path.isabs(dest_dir):
            raise InvalidSrcDestTupleError((src_entry, dest_dir), "DEST_DIR must be a relative path!")
        if os.path.normpath(dest_dir).startswith('..'):
            raise InvalidSrcDestTupleError(
                (src_entry, dest_dir),
                "DEST_DIR must not point outside of application's top-level directory!",
            )

        # Anchor relative source paths at the working directory, if one was given.
        if workingdir and not os.path.isabs(src_entry):
            src_entry = os.path.join(workingdir, src_entry)
        src_entry = os.path.normpath(src_entry)

        # A path naming an existing file is taken verbatim; everything else is treated as a glob pattern.
        if os.path.isfile(src_entry):
            matched_sources = [src_entry]
        else:
            matched_sources = glob.glob(src_entry)

        if not matched_sources:
            raise SystemExit(f'ERROR: Unable to find {src_entry!r} when adding binary and data files.')

        for src_path in matched_sources:
            if os.path.isfile(src_path):
                # Single file: collect it directly under the target directory. Normalize the result to remove
                # redundant relative components (e.g., "trg/./file" -> "trg/file").
                toc_entries.add((
                    os.path.normpath(os.path.join(dest_dir, os.path.basename(src_path))),
                    os.path.normpath(src_path),
                ))
            elif os.path.isdir(src_path):
                # Directory: recursively collect all files it contains, mirroring the directory layout under the
                # target directory.
                for walk_dir, _walk_subdirs, walk_file_names in os.walk(src_path):
                    # os.walk() does *NOT* follow symlinks by default, so every visited directory should live inside
                    # the top-level source directory. (But let's make sure.)
                    assert walk_dir.startswith(src_path)

                    # Target directory for this level: the top-level target plus the source directory's path relative
                    # to the top-level source directory, normalized to drop redundant relative components.
                    dest_subdir = os.path.normpath(os.path.join(dest_dir, os.path.relpath(walk_dir, src_path)))

                    for walk_file_name in walk_file_names:
                        walk_file = os.path.join(walk_dir, walk_file_name)
                        if not os.path.isfile(walk_file):
                            continue
                        toc_entries.add((
                            os.path.normpath(os.path.join(dest_subdir, walk_file_name)), os.path.normpath(walk_file)
                        ))

    return toc_entries


def get_code_object(modname, filename, optimize):
    """
    Get the code-object for a module.

    This is a simplified non-performant version which circumvents __pycache__.

    Parameters
    ----------
    modname : str
        Fully-qualified module name (used only for log messages).
    filename : str | None
        Path to the module's .py or .pyc file; '-' or None marks a namespace package.
    optimize : int
        Byte-code optimization level passed to compile().
    """

    if filename in ('-', None):
        # This is a NamespacePackage, modulegraph marks them by using the filename '-'. (But wants to use None, so
        # check for None, too, to be forward-compatible.)
        logger.debug('Compiling namespace package %s', modname)
        txt = '#\n'
        code_object = compile(txt, filename, 'exec', optimize=optimize)
    else:
        _, ext = os.path.splitext(filename)
        ext = ext.lower()

        if ext == '.pyc':
            # The module is available in binary-only form. Read the contents of .pyc file using helper function, which
            # supports reading from either stand-alone or archive-embedded .pyc files.
            logger.debug('Reading code object from .pyc file %s', filename)
            pyc_data = _read_pyc_data(filename)
            # Skip the 16-byte .pyc header (magic, flags, timestamp/hash, size) to reach the marshalled code object.
            code_object = marshal.loads(pyc_data[16:])
        else:
            # Assume this is a source .py file, but allow an arbitrary extension (other than .pyc, which is taken in
            # the above branch). This allows entry-point scripts to have an arbitrary (or no) extension, as tested by
            # the `test_arbitrary_ext` in `test_basic.py`.
            logger.debug('Compiling python script/module file %s', filename)

            with open(filename, 'rb') as f:
                source = f.read()

            # If entry-point script has no suffix, append .py when compiling the source. In POSIX builds, the executable
            # has no suffix either; this causes issues with `traceback` module, as it tries to read the executable file
            # when trying to look up the code for the entry-point script (when current working directory contains the
            # executable).
            # NOTE: `ext` was already computed above; lower-casing does not affect the emptiness check, so the
            # original redundant re-computation of the extension has been removed.
            if not ext:
                logger.debug("Appending .py to compiled entry-point name...")
                filename += '.py'

            try:
                code_object = compile(source, filename, 'exec', optimize=optimize)
            except SyntaxError:
                # NOTE: log message fixed from "Sytnax error while compiling %s".
                logger.warning("Syntax error while compiling %s", filename)
                raise

    return code_object


def strip_paths_in_code(co, new_filename=None):
    """
    Anonymize the filename embedded in the given code object (and, recursively, in all code objects among its
    constants) by stripping known path prefixes.

    The candidate prefixes are the entries of sys.path and CONF['pathex']. When new_filename is None and no prefix
    matches co.co_filename, the code object is returned unchanged.
    """
    # Candidate prefixes to strip from embedded filenames; each is normalized to end with os.sep, and the longest
    # prefixes are tried first so that the most specific match wins.
    prefixes = sorted((os.path.join(p, '') for p in sys.path + CONF['pathex']), key=len, reverse=True)

    if new_filename is None:
        original_filename = os.path.normpath(co.co_filename)
        matched_prefix = next((p for p in prefixes if original_filename.startswith(p)), None)
        if matched_prefix is None:
            return co
        new_filename = original_filename[len(matched_prefix):]

    # Recurse into nested code objects (e.g., code of functions/classes defined within this code object).
    code_type = type(co)
    new_consts = tuple(
        strip_paths_in_code(const, new_filename) if isinstance(const, code_type) else const
        for const in co.co_consts
    )

    return co.replace(co_consts=new_consts, co_filename=new_filename)


def _should_include_system_binary(binary_tuple, exceptions):
    """
    Return True if the given binary_tuple describes a system binary that should be included.

    Exclude all system library binaries other than those with "lib-dynload" in the destination or "python" in the
    source, except for those matching the patterns in the exceptions list. Intended to be used from the Analysis
    exclude_system_libraries method.
    """
    dest = binary_tuple[0]
    if dest.startswith('lib-dynload'):
        return True
    src = binary_tuple[1]
    if fnmatch.fnmatch(src, '*python*'):
        return True
    if not src.startswith('/lib') and not src.startswith('/usr/lib'):
        return True
    for exception in exceptions:
        if fnmatch.fnmatch(dest, exception):
            return True
    return False


def compile_pymodule(name, src_path, workpath, optimize, code_cache=None):
    """
    Compile the pure-python module given by its fully-qualified `name` and source file `src_path` into a .pyc file
    under `workpath`, and return the path to the resulting file. Absolute path prefixes embedded in the resulting
    bytecode are stripped (anonymized).

    If a .pyc file with matching name already exists in the target working directory and its content would be
    unchanged, it is reused as-is. Source modules (.py) are compiled at the specified byte-code `optimize` level,
    unless a pre-compiled code object is available in the optional `code_cache`. Binary-only modules (.pyc) are read
    (either from a stand-alone file or from a parent zip archive) and post-processed.

    It is up to caller to ensure that the optional code-object cache contains only code-objects of target optimization
    level, and that if the specified working directory already contains .pyc files, that they were created with target
    optimization level.
    """

    # Derive the output .pyc location under workpath from the fully-qualified module name.
    name_parts = name.split(".")
    if "__init__" in src_path:
        # Package __init__ module: every component of the fully-qualified name becomes a parent directory, and the
        # module file itself is named "__init__".
        dir_parts, mod_basename = name_parts, "__init__"
    else:
        # Regular module: the last name component is the module name, the rest form the parent path.
        dir_parts, mod_basename = name_parts[:-1], name_parts[-1]
    pyc_path = os.path.join(workpath, *dir_parts, mod_basename + '.pyc')

    # Try the optional code-object cache first.
    code_object = code_cache.get(name, None) if code_cache else None

    if code_object is None:
        ext = os.path.splitext(src_path)[1].lower()
        if ext == '.py':
            # Source module: read and compile at the requested optimization level.
            with open(src_path, 'rb') as f:
                code_object = compile(f.read(), src_path, 'exec', optimize=optimize)
        elif ext == '.pyc':
            # Binary-only module: read the .pyc contents via the helper (handles both stand-alone and
            # archive-embedded files), then unmarshal the code object so its paths can be stripped.
            pyc_data = _read_pyc_data(src_path)
            code_object = marshal.loads(pyc_data[16:])
        else:
            raise ValueError(f"Invalid python module file {src_path}; unhandled extension {ext}!")

    # Anonymize paths embedded in the code object.
    code_object = strip_paths_in_code(code_object)

    # Serialize the complete .pyc image into memory first, so that an existing on-disk file can be compared against it
    # and rewritten only when different. This avoids needless cache invalidation in downstream targets (PKG, COLLECT).
    with io.BytesIO() as pyc_stream:
        pyc_stream.write(compat.BYTECODE_MAGIC)
        pyc_stream.write(struct.pack('<I', 0b01))  # PEP-552: hash-based pyc, check_source=False
        pyc_stream.write(b'\00' * 8)  # Zero the source hash
        marshal.dump(code_object, pyc_stream)
        pyc_data = pyc_stream.getvalue()

    if os.path.isfile(pyc_path):
        with open(pyc_path, 'rb') as fh:
            if fh.read() == pyc_data:
                # Existing file is identical; reuse it.
                return pyc_path

    # Create parent directories as needed, then write the new .pyc file.
    os.makedirs(os.path.dirname(pyc_path), exist_ok=True)
    with open(pyc_path, 'wb') as fh:
        fh.write(pyc_data)

    return pyc_path


def _read_pyc_data(filename):
    """
    Helper for reading data from .pyc files. Supports both stand-alone and archive-embedded .pyc files. Used by
    `compile_pymodule` and `get_code_object` helper functions.

    Raises FileNotFoundError if the file cannot be located (neither stand-alone nor inside a parent zip archive), and
    ValueError if the bytecode magic does not match the running python's.
    """
    src_file = pathlib.Path(filename)

    if src_file.is_file():
        # Stand-alone .pyc file.
        pyc_data = src_file.read_bytes()
    else:
        # Check if .pyc file is stored in a .zip archive, as is the case for stdlib modules in embeddable
        # python on Windows.
        parent_zip_file = misc.path_to_parent_archive(src_file)
        if parent_zip_file is not None and zipfile.is_zipfile(parent_zip_file):
            with zipfile.ZipFile(parent_zip_file, 'r') as zip_archive:
                # NOTE: zip entry names must be in POSIX format, even on Windows!
                zip_entry_name = str(src_file.relative_to(parent_zip_file).as_posix())
                pyc_data = zip_archive.read(zip_entry_name)
        else:
            raise FileNotFoundError(f"Cannot find .pyc file {filename!r}!")

    # Verify the python version. This check must apply to both stand-alone and archive-embedded .pyc files (it was
    # previously performed only for the latter), and the error message must identify the offending file.
    if pyc_data[:4] != compat.BYTECODE_MAGIC:
        raise ValueError(f"The .pyc module {filename!r} was compiled for incompatible version of python!")

    return pyc_data


def postprocess_binaries_toc_pywin32(binaries):
    """
    Process the given `binaries` TOC list to apply work around for `pywin32` package, fixing the target directory
    for collected extensions.
    """
    # Files collected from `win32` or `pythonwin` may end up in the top-level directory, because `pywin32.pth` puts
    # both directories on `sys.path` and they therefore appear as top-level locations. Since these extensions may be
    # linked against each other, move them back into their original sub-directories so that modulegraph-discovered
    # extensions and binaries found by link-time dependency analysis share a consistent layout. `pywin32_system32` is
    # handled the same way, just in case.
    PYWIN32_SUBDIRS = {'win32', 'pythonwin', 'pywin32_system32'}

    output_toc = []
    for dest_name, src_name, typecode in binaries:
        source_parent = pathlib.PurePath(src_name).parent.name
        # Relocate only top-level entries whose source file lives in one of the pywin32 sub-directories.
        if pathlib.PurePath(dest_name).parent == pathlib.PurePath('.') and source_parent.lower() in PYWIN32_SUBDIRS:
            dest_name = str(pathlib.PurePath(source_parent) / pathlib.PurePath(dest_name))

        output_toc.append((dest_name, src_name, typecode))

    return output_toc


def postprocess_binaries_toc_pywin32_anaconda(binaries):
    """
    Process the given `binaries` TOC list to apply work around for Anaconda `pywin32` package, fixing the location
    of collected `pywintypes3X.dll` and `pythoncom3X.dll`.
    """
    # Anaconda's `pywin32` package ships three copies of `pywintypes3X.dll` and `pythoncom3X.dll`, in (relative to
    # the environment root):
    # - Library/bin
    # - Lib/site-packages/pywin32_system32
    # - Lib/site-packages/win32
    #
    # Which copy the dependency scanner picks up (and where directory-layout preservation places it) depends on which
    # `pywin32` modules are imported and in what order. Rather than trying to out-guess that, redirect the offending
    # copies into `pywin32_system32` here, and let the final TOC normalization collapse any duplicates.
    major, minor = sys.version_info[0], sys.version_info[1]
    DLL_CANDIDATES = {
        f"pywintypes{major}{minor}.dll",
        f"pythoncom{major}{minor}.dll",
    }

    DUPLICATE_DIRS = {
        pathlib.PurePath('.'),
        pathlib.PurePath('win32'),
    }

    output_toc = []
    for dest_name, src_name, typecode in binaries:
        # Divert based on the destination base name and its parent directory.
        dest_path = pathlib.PurePath(dest_name)
        if dest_path.name.lower() in DLL_CANDIDATES and dest_path.parent in DUPLICATE_DIRS:
            dest_name = str(pathlib.PurePath("pywin32_system32") / dest_path.name)

        output_toc.append((dest_name, src_name, typecode))

    return output_toc


def create_base_library_zip(filename, modules_toc, code_cache=None):
    """
    Create a zip archive with python modules that are needed during python interpreter initialization.
    """
    with zipfile.ZipFile(filename, 'w') as zf:
        for name, src_path, typecode in modules_toc:
            # Fetch the code object from the optional cache, or compile it at the optimization level implied by the
            # module's typecode.
            code = code_cache.get(name, None) if code_cache is not None else None
            if code is None:
                optim_level = {'PYMODULE': 0, 'PYMODULE-1': 1, 'PYMODULE-2': 2}[typecode]
                code = get_code_object(name, src_path, optimize=optim_level)

            # Map the dotted module name onto an archive path.
            dest_name = name.replace('.', os.sep)
            if os.path.splitext(os.path.basename(src_path))[0] == '__init__':
                # Packages carry an implied `__init__` filename that must be made explicit.
                dest_name += os.sep + '__init__'
            dest_name += '.pyc'  # Always .pyc, regardless of optimization level.

            # Assemble the .pyc image in memory: magic, PEP-552 flags (hash-based pyc, check_source=False), a zeroed
            # source hash (matching `building.utils.compile_pymodule`), and the marshalled, path-stripped code object.
            code = strip_paths_in_code(code)
            with io.BytesIO() as fc:
                fc.write(compat.BYTECODE_MAGIC)
                fc.write(struct.pack('<I', 0b01))
                fc.write(b'\00' * 8)
                marshal.dump(code, fc)
                pyc_image = fc.getvalue()

            # Store under a default-timestamp ZipInfo for deterministic builds.
            zf.writestr(zipfile.ZipInfo(dest_name), pyc_image)