File: build_cache.py

package info (click to toggle)
charliecloud 0.43-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,084 kB
  • sloc: python: 6,021; sh: 4,284; ansic: 3,863; makefile: 598
file content (1454 lines) | stat: -rw-r--r-- 63,340 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
# Note on how we use Git:
#
# Git is extremely flexible and can be configured in many ways, including
# various configuration files [1] as well as environment variables [2]. We do
# our best to use a fully-isolated Git that brings along no external
# configuration the user or system may have, by excluding configuration files
# other than Charliecloud’s and clearing the environment.
#
# Another gotcha that is not (yet?) documented is $PATH. git(1) re-executes
# itself in the same way that it was invoked; e.g., if you invoke it with
# plain “git”, which looks up the binary in the path, then sub-invocations
# will do the same and look up “git” again [3]. If somehow you use different
# paths to find the outer and inner Git — which is easy to do accidentally
# with subprocess — you can run a mixed-version Git, which is bad (see #1606).
# We work around this by looking up git(1) once and then calling it by its
# absolute path, with an empty environment including unset $PATH.
#
# Alternately, we could have sanitized the environment more carefully, passing
# through $PATH and perhaps other variables. This seemed difficult to do
# correctly (e.g., should we keep $LD_LIBRARY_PATH?), and it seemed unlikely
# that Git would be executing programs other than Git that weren’t readily
# available in the standard paths or using any non-standard features.
#
# [1]: https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration
# [2]: https://git-scm.com/book/en/v2/Git-Internals-Environment-Variables
# [3]: https://lore.kernel.org/git/E7D87B07-C416-4A58-8726-CCDA0907AC66@lanl.gov/t/#u

import configparser
import datetime
import glob
import hashlib
import itertools
import os
import pickle
import re
import shutil
import stat
import sys
import tempfile
import textwrap

import charliecloud as ch
import filesystem as fs
import image as im
import pull


## Constants ##

# Required versions.
DOT_MIN = (2, 30, 1)
GIT_MIN = (2, 28, 1)
GIT2DOT_MIN = (0, 8, 3)

# Git configuration. Note some of these are overridden in specific commands.
# Documentation for these variables: https://git-scm.com/docs/git-config
GIT_CONFIG = {
   # Do parallel checkouts with all cores.
   "checkout.workers":       "-1",
   # Length of shortened commit digests. I don’t follow the math well, but if
   # unconfigured the minimum is 7 and we do see more (e.g. in a repo with
   # 213,000 objects). I hope 12 is sufficient; maybe that’s enough for 16
   # million objects?
   "core.abbrev":             "12",
   # Prioritize write speed over data safety; i.e., increase the risk of cache
   # corruption on system crash while (hopefully) decreasing write speed.
   "core.fsync":             "none",
   # We want to keep access to commits on deleted branches so they are still
   # available for cache hits. This setting is necessary but not sufficient;
   # see branch_delete() below.
   "core.logAllRefUpdates":  "true",
   # Try to maximize “git add” speed.
   "core.looseCompression":  "0",
   # Enable incremental indexes [1]; it should speed things like “git add”.
   # [1]: https://git-scm.com/docs/git-update-index
   "core.splitIndex":        "true",
   # “ctime” marks when the file *or its inode* was last changed. Twiddling
   # the various metadata will alter this, so Git shouldn’t use it when
   # deciding if a file may have changed.
   "core.trustctime":        "false",
   # Enable the “untracked cache” [1], which saves directory mtimes to
   # eliminate the need to re-stat(2) in some cases.
   # [1]: https://git-scm.com/docs/git-update-index
   "core.untrackedCache":    "true",
   # Quick-and-dirty results suggest that commit is not faster after garbage
   # collection, and checkout is actually a little faster if *not* garbage
   # collected. Therefore, it’s not a high priority to run garbage collection.
   # Further, I would assume garbaging a lot of files rather than a few gives
   # better opportunities for delta compression. Our most file-ful example
   # image is obspy at about 50K files.
   "gc.auto":                "100000",
   # Leave packs larger than this alone during automatic GC. This is to avoid
   # excessive resource consumption during GC the user didn’t ask for.
   "gc.bigPackThreshold":    "12G",
   # Anything unreachable from a named branch or the reflog is unavailable to
   # the build cache, so we may as well delete it immediately. However, there
   # might be a concurrent Git operation in progress, so don’t use “now”.
   "gc.pruneExpire":         "12.hours.ago",
   # Use the newest index version, which does “simple pathname compression
   # that reduces index size by 30%-50% on large repositories” [1].
   # [1]: https://git-scm.com/docs/git-update-index
   "index.version":          "4",
   # Print logs in short format by default. This helps ensure consistency
   # across different systems and git versions.
   "log.decorate":           "short",
   # States on the reflog are available to the build cache, but the default
   # prune time is 90 and 30 days respectively, which seems too long.
   #"gc.reflogExpire":        "14.days.ago",  # changed my mind
   # In some quick-and-dirty tests (see issue #1412), pack.compression=1 is
   # 50% faster than the default 6 at the cost of 6% more size, while
   # Compression 0 is twice as fast but also over twice the size; 9 doubles
   # the time with no space savings. 1 seems like the right balance.
   "pack.compression":       "1",
   # These two are guesses based on the fact that HPC machines tend to have a
   # lot of memory and more caching is faster.
   "pack.deltaCacheLimit":   "4096",
   "pack.deltaCacheSize":    "1G",
   # These two are guesses based on [1] and its links, particularly [2].
   # [1]: https://stackoverflow.com/questions/28720151
   # [2]: https://web.archive.org/web/20170526024841/https://vcscompare.blogspot.com/2008/06/git-repack-parameters.html
   "pack.depth":             "36",
   "pack.window":            "24",
   # Our Git repo is purely local, so it doesn’t really matter who owns the
   # commits. Set these in case the user hasn’t configured them and they can’t
   # be guessed. (Issue #1535.)
   "user.email":             "charlie@localhost",
   "user.name":              "Charlie",
   # Always fail if Git doesn’t know who the user is, rather than guessing if
   # possible. Makes #1535 happen for everyone.
   "user.useConfigOnly":     "true",
}

# Placeholder for Git hash values that are unknown. This deliberately does not
# support str operations (e.g., indexing), so trying those will fail loudly.
GIT_HASH_UNKNOWN = -1


## Globals ##

# The active build cache.
cache = None

# Path to DOT output (.gv and .pdf will be appended)
dot_base = None

# Absolute path of Git binary we’re using.
git = None

# Default path within image to metadata pickle.
PICKLE_PATH = fs.Path("ch/git.pickle")


## Functions ##

def have_deps(required=True):
   """Return True if dependencies for the build cache are present, False
      otherwise. Note this does not include the --dot debugging option; that
      checks its own dependencies when invoked.

      This function also figures out which Git to use and sets the appropriate
      variables."""
   global git
   git = shutil.which("git")
   if (git is None):
      (ch.FATAL if required else ch.VERBOSE)("no git(1) found")
      return False
   # As of 2.34.1, we get: "git version 2.34.1\n".
   return ch.version_check([git, "--version"], GIT_MIN, required=required)

def have_dot():
   ch.version_check(["git2dot.py", "--version"], GIT2DOT_MIN)
   ch.version_check(["dot", "-V"], DOT_MIN)

def init(cli):
   # At this point --bucache is what the user wanted, either directly or via
   # --no-cache. If it’s None, chose the right default; otherwise, try what
   # the user asked for and fail if we can’t do it.
   if (cli.bucache != ch.Build_Mode.DISABLED):
      ok = have_deps(False)
      if (cli.bucache is None):
         cli.bucache = ch.Build_Mode.ENABLED if ok else ch.Build_Mode.DISABLED
         ch.VERBOSE("using default build cache mode")
      if (cli.bucache != ch.Build_Mode.DISABLED and not ok):
         ch.FATAL("insufficient Git for build cache mode: %s"
                  % cli.bucache.value)
   # Set cache appropriately. We could also do this with a factory method, but
   # that seems overkill.
   global cache
   if (cli.bucache == ch.Build_Mode.ENABLED):
      cache = Enabled_Cache(cli.cache_large)
   elif (cli.bucache == ch.Build_Mode.REBUILD):
      cache = Rebuild_Cache(cli.cache_large)
   elif (cli.bucache == ch.Build_Mode.DISABLED):
      cache = Disabled_Cache(cli.cache_large)
   else:
      assert False, "unreachable"
   ch.VERBOSE("build cache mode: %s" % cache)
   # DOT output path
   try:
      global dot_base
      dot_base = cli.dot
   except AttributeError:
      pass

def md5(data=b""):
   try:
      return hashlib.md5(data, usedforsecurity=False)
   except TypeError:  # assume no usedforsecurity argument
      return hashlib.md5(data)


## Supporting classes ##

class File_Metadata:

   # This class holds metadata we care about for a file (in the general sense,
   # not necessarily a regular file), along with methods for dealing with that
   # metadata for the build cache. This includes re-creating some files from
   # their metadata, for files that Git can’t store or breaks.
   #
   # Importantly, this class must support un-pickling of old versions of
   # itself, to support existing build caches on upgrade. At present, we do
   # this attribute-by-attribute, without explicit versioning. We omit
   # __slots__ so old versions with since-deleted attributes can be unpickled.
   #
   # Note that ctime can’t be restored: https://unix.stackexchange.com/a/36105
   #
   # Attributes corresponding directly to inode(7) fields (and pickled):
   #
   #   atime_ns
   #   mtime_ns
   #   mode
   #
   #   size .......... Size of the file in bytes. Note large file thresholds
   #                   vary between builds, so this attribute should not be
   #                   used to determine if a file is in large file storage.
   #                   WARNING: May be -1 if read from an old pickle.
   #
   # Other attributes stored in pickle:
   #
   #   children ...... Insertion-ordered mapping from child names to their
   #                   File_Metadata objects. Empty if non-directory, or upon
   #                   object creation (i.e., caller must assemble the tree).
   #
   #   dont_restore .. If True, file should not be restored (e.g., RPM
   #                   database cache files).
   #
   #   large_name .... If non-None, file is stored out-of-band as a large file
   #                   with this name. This is a function of all the
   #                   attributes that make file large-unique, i.e., two files
   #                   with the same large_name are considered the same file.
   #
   #   hardlink_to ... If non-None, file is a hard link to this other file,
   #                   stored as a Path object relative to the image root.
   #
   #   xattrs ........ Extended attributes of the file, stored as a dictionary.
   #                   The keys take the form “namespace.name”, where
   #                   “namespace” is the namespace the xattr belongs to (e.g.
   #                   “user” or “system”) and “name” is the actual name of the
   #                   xattr. The value in the dictionary is the value assigned
   #                   to the xattr.
   #
   # Attributes not stored (recomputed on unpickle):
   #
   #   image_root .... Absolute path to the image directory to which path is
   #                   relative. That is, image_root // path is the absolute
   #                   version of path.
   #
   #   path .......... Path to the file within the image, relative to the
   #                   image root. Empty for the image root itself. For
   #                   example, an image’s “/bin/true” has path “bin/true”.
   #
   #   path_abs ...... Absolute path to the file (under the host root).
   #
   #   st ............ Stat object for the file. Absent after un-pickling.

   def __init__(self, image_root, path):
      # Note: Constructor not called during unpickle.
      self.image_root = image_root
      self.path = path
      self.path_abs = image_root // path
      self.st = self.path_abs.stat(False)
      self.stat_cache_update()
      self.children = dict()
      self.dont_restore = False
      self.hardlink_to = None
      self.large_name = None
      self.xattrs = dict()
      if ch.xattrs_save:
         for xattr in ch.ossafe("can’t list xattrs: %s" % self.path_abs,
                                os.listxattr, self.path_abs,
                                follow_symlinks=False):
            self.xattrs[xattr] = \
               ch.ossafe(("can’t get xattr: %s: %s"
                                       % (self.path_abs, xattr)),
                         os.getxattr, self.path_abs, xattr,
                         follow_symlinks=False)

   @classmethod
   def git_prepare(class_, image_root, large_file_thresh,
                   path=None, hardlinks=None):
      """Recursively walk the given image root, prepare it for a Git commit,
         and return the resulting File_Metadata tree describing it. This is
         mostly reversed by git_restore_walk(); anything not is noted.

         path is the path relative to image_root currently being examined;
         hardlinks is a dictionary used to track what hard link groups have
         been seen already. External callers should pass None.

         For each file, in this order:

           1. Record file metadata, specifically mode and timestamps, because
              Git does not save metadata beyond a limited executable bit.
              (More may be saved in the future; see issue #1287.)

              This captures FIFOs (named pipes), which are ignored by Git.

              Exception: Sockets are ignored. Like FIFOs, sockets are ignored
              by Git, but there isn’t a meaningful way to re-create them.
              Their presence in a container image that is not in use, which
              this image shouldn’t be, most likely reflects a bug in
              something. We do print a warning in this case.

           2. For directories, record the number of children. Git does not
              save empty directories, so this is used to re-create them.

           3. For devices, exit with error. Such files should not appear in
              unprivileged container images, so their presence means something
              is wrong.

           4. For non-directories with link count greater than 1 (i.e., hard
              links), do nothing when the first link is encountered, but
              second and subsequent links are deleted, to be restored on
              checkout. (Git splits multiply-linked files into separate
              files, and directories cannot be hard-linked [1].)

           5. Files special due to their name:

              a. Names starting in “.git” are special to Git. Therefore,
                 except at the root where “.git” files support the cache’s Git
                 worktree, rename them to begin with “.weirdal_” instead.

              b. Files matching the pattern /var/lib/rpm/__db.* are Berkeley
                 DB database support files used by RPM. Sometimes, something
                 mishandles the last-modified dates on these files, fooling
                 Git into thinking they have not been modified, and so they
                 don’t get committed or restored, which confuses BDB/RPM.
                 Fortunately, they can be safely deleted, and that’s a simple
                 workaround, so we do it. See issue #1351.

         Return the File_Metadata tree, and if write is True, also save it in
         “ch/git.pickle”.

         [1]: https://en.wikipedia.org/wiki/Hard_link#Limitations"""
      # Setup.
      if (path is None):
         assert (hardlinks is None)
         path = fs.Path()
         hardlinks = dict()
      fm = class_(image_root, path)
      if (fm.path == im.GIT_DIR):
         # skip Git stuff at image root
         fm.dont_restore = True
         return fm
      # Ensure minimum permissions. Some tools like to make files without
      # necessary owner permissions, because root ignores the permissions bits
      # (CAP_DAC_OVERRIDE). See e.g. #1765.
      fm.st = fm.path_abs.chmod_min(fm.st)
      fm.stat_cache_update()
      # Validate file type and recurse if needed. (Don’t use os.walk() because
      # it’s iterative, and our algorithm is better expressed recursively.)
      if   (   stat.S_ISREG(fm.mode)
            or stat.S_ISLNK(fm.mode)
            or stat.S_ISFIFO(fm.mode)):
         # RPM databases get corrupted. Easy fix is delete them. See #1351.
         if (path.match("var/lib/rpm/__db.*")):
            ch.VERBOSE("deleting, see issue #1351: %s" % path)
            fm.path_abs.unlink()
            fm.dont_restore = True
            return fm
      elif (   stat.S_ISSOCK(fm.mode)):
         ch.WARNING("socket in image, deleting: %s" % path)
         fm.path_abs.unlink()
         fm.dont_restore = True
         return fm
      elif (   stat.S_ISCHR(fm.mode)
            or stat.S_ISBLK(fm.mode)):
         ch.FATAL("device files invalid in image: %s" % path)
      elif (   stat.S_ISDIR(fm.mode)):
         entries = sorted(fm.path_abs.listdir())
         for i in entries:
            # Recurse
            fm.children[i] = class_.git_prepare(image_root, large_file_thresh,
                                                path // i, hardlinks)
      else:
         ch.FATAL("unexpected file type in image: %x: %s"
                  % (stat.IFMT(fm.mode), path))
      # Deal with hard links (directories can’t be hard-linked).
      if (fm.st.st_nlink > 1 and not stat.S_ISDIR(fm.mode)):
         if ((fm.st.st_dev, fm.st.st_ino) in hardlinks):
            ch.DEBUG("hard link: deleting subsequent: %d %d %s"
                     % (fm.st.st_dev, fm.st.st_ino, path))
            fm.hardlink_to = hardlinks[(fm.st.st_dev, fm.st.st_ino)]
            fm.path_abs.unlink()
            return fm
         else:
            ch.DEBUG("hard link: recording first: %d %d %s"
                     % (fm.st.st_dev, fm.st.st_ino, path))
            hardlinks[(fm.st.st_dev, fm.st.st_ino)] = path
      # Deal with large files. This comparison is a little sloppy (no files
      # named “git.pickle” are large, not just the one in /ch), but it works
      # for now.
      if (    fm.size >= large_file_thresh
          and stat.S_ISREG(fm.mode)
          and fm.path.name != PICKLE_PATH.name
          and fm.hardlink_to is None):
         fm.large_name = fm.large_prepare()
      else:
         fm.large_name = None
      # Remove empty directories. Git will ignore them, including leaving them
      # there when switching the worktree to a different branch, which is bad.
      if (fm.empty_dir_p):
         fm.path_abs.rmdir()
         return fm
      # Remove FIFOs for the same reason.
      if (stat.S_ISFIFO(fm.mode)):
         fm.path_abs.unlink()
         return fm
      # Rename if necessary.
      if (not path.git_compatible_p):
         ch.DEBUG("renaming: %s -> %s" % (path, path.git_escaped))
         fm.path_abs.rename(fm.path_abs.git_escaped)
      # Done.
      return fm

   @classmethod
   def unpickle(self, image_root, data=None):
      if (data is None):
         data = (image_root // PICKLE_PATH).file_read_all(text=False)
      fm_tree = pickle.loads(data)
      fm_tree.unpickle_fix(image_root, path=fs.Path("."))
      return fm_tree

   def __getstate__(self):
      return { a:v for (a,v) in self.__dict__.items()
                   if (a not in { "image_root", "path", "path_abs", "st" }) }

   @property
   def empty_dir_p(self):
      """True if I represent either an empty directory, or a directory that
         contains only children where empty_dir_p is true. E.g., the root of a
         directory tree containing only empty directories returns true."""
      # In principle this could do a lot of recursion, but in practice I’m
      # guessing it’s not too much.
      if (not stat.S_ISDIR(self.mode)):
         return False  # not a directory
      # True if no children (truly empty directory), or each child is unstored
      # or empty_dir_p (recursively empty directory tree).
      return all((c.unstored or c.empty_dir_p) for c in self.children.values())

   @property
   def unstored(self):
      """True if I represent something not stored, either ignored by Git or
         deleted by us before committing."""
      return (   stat.S_ISFIFO(self.mode)
              or stat.S_ISSOCK(self.mode)
              or self.large_name is not None
              or self.hardlink_to is not None)

   def get(self, path):
      "Return the File_Metadata object at path."
      fm = self
      for name in path.parts:
         fm = fm.children[name]
      return fm

   def git_restore(self, quick):
      #ch.TRACE(self.str_for_log())  # output is extreme even for TRACE?
      # Do-nothing case. Exclude RPM databases explicitly because old caches
      # can have them left over without being tagged don’t restore.
      if (self.dont_restore or self.path.match("var/lib/rpm/__db.*")):
         if (not quick and self.path != im.GIT_DIR):
            ch.INFO("ignoring un-restorable file: /%s" % self.path)
         return
      # Make sure I exist, and with the correct name.
      if (self.hardlink_to is not None):
         # This relies on prepare and restore having the same traversal order,
         # so the first (stored) link is always available by the time we get
         # to subsequent (unstored) links.
         target = self.image_root // self.hardlink_to
         ch.DEBUG("hard link: restoring: %s -> %s" % (self.path_abs, target))
         ch.ossafe("can’t hardlink: %s -> %s" % (self.path_abs,
                                                       target),
                   os.link, target, self.path_abs, follow_symlinks=False)
      elif (self.large_name is not None):
         self.large_restore()
      elif (self.empty_dir_p):
         ch.ossafe("can’t mkdir: %s" % self.path, os.mkdir, self.path_abs)
      elif (stat.S_ISFIFO(self.mode)):
         ch.ossafe("can’t make FIFO: %s" % self.path, os.mkfifo, self.path_abs)
      elif (not self.path.git_compatible_p):
         self.path_abs.git_escaped.rename(self.path_abs)
      for (xattr, val) in self.xattrs.items():
         self.path_abs.setxattr(xattr, val)
      # Recurse children.
      if (len(self.children) > 0):
         for child in self.children.values():
            child.git_restore(quick)
      # Restore my metadata.
      if ((   not quick                      # Git broke metadata
           or self.hardlink_to is not None   # we just made the hardlink
           or stat.S_ISDIR(self.mode)        # maybe just created or modified
           or stat.S_ISFIFO(self.mode))      # we just made the FIFO
          and not stat.S_ISLNK(self.mode)):  # can’t not follow symlinks
         ch.ossafe("can’t restore times: %s" % self.path_abs, os.utime,
                   self.path_abs, ns=(self.atime_ns, self.mtime_ns))
         ch.ossafe("can’t restore mode: %s" % self.path_abs, os.chmod,
                   self.path_abs, stat.S_IMODE(self.mode))

   def large_name_get(self):
      "Return my name for use in large file storage."
      assert (self.size >= 0)
      h = md5()
      for attr in ("mtime_ns", "mode", "size", "path"):
         h.update(bytes(repr(getattr(self, attr)).encode("UTF-8")))
      # The digest is unique, but add an encoded path to aid debugging.
      return (  h.hexdigest() + "%"
              + str(self.path).replace("/", "%"))[:ch.FILENAME_MAX_CHARS]

   def large_names(self):
      "Return a set containing the large names of myself and all descendants."
      if (self.large_name is None):
         names = set()
      else:
         names = { self.large_name }
      for c in self.children.values():
         names |= c.large_names()
      return names

   def large_prepare(self):
      """Move my file to large file storage, or delete it if it already
         exists, then return the appropriate large name."""
      large_name = self.large_name_get()
      target = ch.storage.build_large_path(large_name)
      if (target.exists()):
         op = "found"
         self.path_abs.unlink()
      else:
         op = "moving to"
         self.path_abs.rename(target)
      ch.DEBUG("large file: %s: %s: %s" % (self.path, op, large_name))
      return large_name

   def large_restore(self):
      "Restore large file from OOB storage."
      target = ch.storage.build_large_path(self.large_name)
      ch.DEBUG("large file: %s: copying: %s" % (self.path_abs, self.large_name))
      fs.copy(target, self.path_abs)

   def pickle(self):
      (self.image_root // PICKLE_PATH) \
         .file_write(pickle.dumps(self, protocol=4))

   def stat_cache_update(self):
      for attr in ("atime_ns", "mtime_ns", "mode", "size"):
         setattr(self, attr, getattr(self.st, "st_" + attr))

   def str_for_log(self):
      # Truncate reported time to seconds.
      fmt = "%Y-%m-%d.%H:%M:%S"
      mstr = datetime.datetime.fromtimestamp(self.mtime_ns // 1e9).strftime(fmt)
      astr = datetime.datetime.fromtimestamp(self.mtime_ns // 1e9).strftime(fmt)
      return ("%s%s %s [%d %d %s %s %s %s %s] %s %s"
              % ("  " * len(self.path), stat.filemode(self.mode),
                 self.path.name, self.size, len(self.children),
                 "dont_restore" if self.dont_restore else "-",
                 "empty_dir" if self.empty_dir_p else "-",
                 "unstored" if self.unstored else "-",
                 mstr, astr,
                 self.hardlink_to if self.hardlink_to else "-",
                 self.large_name if self.large_name else "-"))

   def unpickle_fix(self, image_root, path):
      "Does no I/O."
      # old: large_name, size, xattrs: no such attribute
      if (not (hasattr(self, "large_name"))):
         self.large_name = None
      if (not (hasattr(self, "size"))):
         self.size = -1
      if (not (hasattr(self, "xattrs"))):
         self.xattrs = dict()
      # old: hardlink_to: stored as string
      if (isinstance(self.hardlink_to, str)):
         self.hardlink_to = fs.Path(self.hardlink_to)
      # old: children, name: just a list, and instances know their names
      if (isinstance(self.children, list)):
         children_new = dict()
         for c in self.children:
            children_new[c.name] = c
            delattr(c, "name")
         self.children = children_new
      # all: set non-stored attributes
      self.image_root = image_root
      self.path = path
      self.path_abs = image_root // path
      # recurse
      for (name, child) in self.children.items():
         child.unpickle_fix(image_root, path // name)

   def update(self, path):
      "Recompute the File_Metadata object at path in the tree rooted by me."
      # FIXME: Can’t handle anything other than regular, non-large files that
      # don’t need renaming.
      assert (stat.S_ISDIR(self.mode) and len(path) >= 1)
      fm = self
      for name in path.parts[:-1]:
         fm = fm.children[name]
      fm.children[path.name] = self.__class__(self.image_root, path)
      assert (stat.S_ISREG(fm.children[path.name].mode))


class State_ID:

   __slots__ = ('id_')

   def __init__(self, id_):
      # Constructor should only be called internally, so verify id_ type.
      assert (isinstance(id_, bytes))
      self.id_ = id_

   @classmethod
   def from_parent(class_, psid, input_):
      """Return the State_ID corresponding to parent State_ID psid and data
         input_ describing the transition, which can be either bytes or str."""
      h = md5(psid.id_)
      if (isinstance(input_, str)):
         input_ = input_.encode("UTF-8")
      h.update(input_)
      return class_(h.digest())

   @classmethod
   def from_text(class_, text):
      """The argument is stringified; then this string must end with 32 hex
         digits, possibly interspersed with other characters. Any preceding
         characters are ignored."""
      text = re.sub(r"[^0-9A-Fa-f]", "", str(text))[-32:]
      if (len(text) < 32):
         ch.FATAL("state ID too short: %s" % text)
      try:
         b = bytes.fromhex(text)
      except ValueError:
         ch.FATAL("state ID: malformed hex: %s" % text);
      return class_(b)

   def __eq__(self, other):
      return self.id_ == other.id_

   def __hash__(self):
      return hash(self.id_)

   def __str__(self):
      s = self.id_.hex().upper()
      return ":".join((s[0:4], s[4:8], s[8:16], s[16:24], s[24:32]))

   @property
   def short(self):
      return str(self)[:4]


## Core classes ##

class Enabled_Cache:

   root_id = State_ID.from_text("4A6F:73C3:A9204361:7061626C:616E6361")

   __slots__ = ("bootstrap_ct",
                "file_metadata",
                "large_threshold")

   def __init__(self, large_threshold):
      self.bootstrap_ct = 0
      self.large_threshold = large_threshold
      if (not os.path.isdir(self.root)):
         self.root.mkdir()
      ls = self.root.listdir()
      if (len(ls) == 0):
         self.bootstrap()      # empty; initialize a new cache
      elif (not {"HEAD", "objects", "refs"} <= ls):
         # Non-empty but not an existing cache.
         # See: https://git-scm.com/docs/gitrepository-layout
         ch.FATAL("storage broken: not a build cache: %s" % self.root)
      else:
         self.configure()         # updates config if needed
      self.worktrees_fix()

   @staticmethod
   def branch_name_ready(ref):
      return ref.for_path

   @staticmethod
   def branch_name_unready(ref):
      return ref.for_path + "#"

   @staticmethod
   def commit_hash_p(commit_ish):
      """Return True if commit_ish looks like a commit hash, False otherwise.
         Note this is a text-based heuristic only. It will return True for
         hashes that don’t exist in the repo, and false positives for
         branch/tag names that look like hashes."""
      return (re.search(r"^[0-9a-f]{7,}$", commit_ish) is not None)

   def __str__(self):
      return ("enabled (large=%g)" % self.large_threshold)

   @property
   def root(self):
      return ch.storage.build_cache

   def adopt(self, img):
      self.worktree_adopt(img, "root")
      img.metadata_load()
      img.metadata_save()
      log = "IMPORT %s" % img.ref
      sid = self.sid_from_parent(self.root_id, log)
      gh = self.commit(img.unpack_path, sid, log, [])
      self.ready(img)
      return (sid, gh)

   def bootstrap(self):
      ch.INFO("initializing empty build cache")
      self.bootstrap_ct += 1
      # Initialize bare repo. Don’t use wrapper because the build cache
      # doesn’t exist yet.
      ch.cmd_quiet([git, "init", "--bare", "-b", "root", self.root], env={})
      self.configure()
      # Create empty root commit. This is done in a strange way with no real
      # working directory at all, because (1) cloning the bucache doesn’t
      # clone the config, which we care about, and (2) worktrees cannot be
      # used on empty repositories.
      # See: https://stackoverflow.com/a/29396902/396038
      try:
         with tempfile.TemporaryDirectory(prefix="weirdal.") as td:
            env = { "GIT_DIR": self.root,
                    "GIT_WORK_TREE": td,
                    "GIT_INDEX_FILE": "%s/bootstrap-index" % td }
            self.git(["read-tree", "--empty"], env=env)
            # Note: complaints about empty commits go to stdout, not stderr.
            self.git(["commit", "--allow-empty",
                                "-m", "ROOT\n\n%s" % self.root_id], env=env)
      except OSError as x:
         ch.FATAL("can’t create or delete temporary directory: %s: %s"
                  % (x.filename, x.strerror))

   def branch_delete(self, branch):
      """Delete branch branch if it exists; otherwise, do nothing. This
         removes only the branch ref; its commits remain until garbage
         collected."""
      # Note: in a typical Git working directory, HEAD has followed the branch
      # around, so when we delete a branch ref *and necessarily its reflog
      # too*, that branch’s commits remain accessible via HEAD’s reflog until
      # the reflog entries expire. However, in our case, it’s the worktree
      # HEAD that did the following, and that reflog goes away when the
      # worktree is deleted, so the branch’s commits become inaccessible
      # immediately upon branch deletion. Here, the first “update-ref”
      # shenanigan logs the branch tip in the bare repo’s HEAD reflog, keeping
      # the commits accessible. The second puts HEAD back where it was.
      branches = [branch]
      if (self.ready_p(branch) and (self.cached_p(branch))):
         branches.append(self.unready_of(branch))
         # Tag deleted branch. This is allows images to be recovered with
         # “undelete.” Note that the “-f” flag overwrites existing tags with the
         # same name, meaning we only track the most recently deleted branch.
         self.git(["tag", "-a", "-f", "&%s" % branch, branch, "-m", "''"])
      for brnch in branches:
         if (self.git(["show-ref", "--quiet", "--heads", brnch],
                     fail_ok=True).returncode == 0): # branch found
            head_old = self.git(["rev-parse", "HEAD"]).stdout.strip()
            self.git(["update-ref", "HEAD", brnch])
            self.git(["update-ref", "HEAD", head_old])
            self.git(["branch", "-D", brnch])


   def branch_nocheckout(self, src_ref, dest):
      """Create ready branch for Ref src_ref pointing to dest, which can
         be either an Ref or a Git commit reference (as a string)."""
      if (isinstance(dest, im.Reference)):
         dest = self.branch_name_ready(dest)
      # Some versions of Git won’t let us update a branch that’s already
      # checked out, so detach that worktree if it exists.
      src_img = im.Image(src_ref)
      if (src_img.unpack_exist_p):
         self.git(["checkout", "--detach"], cwd=src_img.unpack_path)
      self.git(["branch", "-f", self.branch_name_ready(src_ref), dest])

   def cached_p(self, git_id):
      """True iff image corresponding to “git_id” is in the cache."""
      return self.find_commit(git_id)[1] != None

   def checkout(self, image, git_hash, base_image):
      # base_image used in other subclasses
      self.worktree_add(image, git_hash)
      self.git_restore(image.unpack_path, [], False)

   def checkout_ready(self, image, git_hash, base_image=None):
      """“checkout()” followed by “ready()” is an operation that appears several
         times throughout the code, so we wrap it here."""
      self.checkout(image, git_hash, base_image)
      self.ready(image)

   def commit(self, path, sid, msg, files):
      # Commit image at path into the build cache. If set files is empty, any
      # and all image content may have been changed. Otherwise, assume files
      # lists the only files in the image that have changed. These must be
      # relative paths relative to the image root (i.e., path), may only be
      # regular files, and get none of the special handling we have for
      # general image content.
      #
      # WARNING: files must be empty for the first image commit.
      self.git_prepare(path, files)
      t = ch.Timer()
      if (len(files) == 0):
         git_files = ["-A"]
      else:
         git_files = list(files) + ["ch/git.pickle"]
      self.git(["add"] + git_files, cwd=path)
      t.log("prepared index")
      t = ch.Timer()
      self.git(["commit", "-q", "--allow-empty",
                          "-m", "%s\n\n%s" % (msg, sid)], cwd=path)
      t.log("committed")
      # “git commit” does print the new commit’s hash without “-q”, but it
      # also prints every file committed, which is rather enormous for us.
      # Therefore, retrieve the hash separately.
      cp = self.git(["rev-parse", "--short", "HEAD"], cwd=path)
      git_hash = cp.stdout.strip()
      self.git_restore(path, files, True)
      return git_hash

   def commit_find_deleted(self, git_id):
         deleted = self.git(["log", "--format=%h%n%B", "-n", "1",
                           "&%s" % git_id],fail_ok=True)
         if (deleted.returncode == 0):
            # Commit was previously deleted but is still cached. Get info.
            sid = State_ID.from_text(deleted.stdout)
            commit = deleted.stdout.split("\n", maxsplit=1)[0]
         else:
            sid = None
            commit = None
         return (sid, commit)

   def configure(self):
      # Configuration.
      path = self.root // "config"
      fp = path.open("r+")
      config = configparser.ConfigParser()
      config.read_file(fp, source=path)
      changed = False
      for (k, v) in GIT_CONFIG.items():
         (s, k) = k.lower().split(".", maxsplit=1)
         if (config.get(s, k, fallback=None) != v):
            changed = True
            try:
               config.add_section(s)
            except configparser.DuplicateSectionError:
               pass
            config.set(s, k, v)
      if (changed):
         ch.VERBOSE("writing updated Git config")
         fp.seek(0)
         fp.truncate()
         ch.ossafe("can’t write Git config: %s" % path, config.write, fp)
      ch.close_(fp)
      # Ignore list entries:
      #
      #   1. Git has no default gitignore, but cancel any global gitignore rules
      #      the user might have. https://stackoverflow.com/a/26681066
      #
      #   2. The oddly-named GIT_DIR.
      #
      # It is easier to just write the list we want every time, rather than
      # trying to figure out if an update is needed.
      (self.root // "info/exclude").file_write("!*\n%s\n" % im.GIT_DIR)
      # Remove old .gitignore files from all commits. While there are nice
      # tools to do this (e.g. “git filter-repo”), we don’t want to depend on
      # an external tool. Thus, the options seem to be “filter-branch” or
      # “export” followed by “import”. Around half of the “filter-branch” man
      # page is devoted to explaining why not to use it, so we use the latter.
      #
      # NOTE: Without --reflog, “fast-export” will omit commits on unnamed
      # branches (i.e., accessible only via reflog), but with it, we get
      # “commit” instructions in the stream with no branch name, which
      # “fast-import” won’t accept. Therefore, we just delete those commits,
      # to prevent deleted .gitignore files from creeping back in. (I couldn’t
      # figure out how to fix this for “filter-branch” either, which I didn’t
      # want to use anyway for the reasons above.)
      if (ch.storage.bucache_needs_ignore_upgrade.exists()):
         ch.INFO("upgrading build cache to v6+, some cached data may be lost",
                 "see release notes for v0.32")
         text = self.git(["fast-export", "--no-data", "--", "--all"],
                         encoding="UTF-8").stdout
         #fs.Path("/tmp/old").file_write(text)
         # There is a bug in Git that loses files that become directories [1].
         # We work around this by moving delete commands within each commit to
         # be first. This makes a number of assumptions about the output of
         # “fast-export” that are true only for us, e.g. that it’s all
         # line-based, including data like commit messages.
         #
         # [1]: https://lore.kernel.org/git/6486D136-23D8-4C90-AEDA-DD037A5CD2B5@lanl.gov/T/#t
         lines = text.split("\n")
         data_p = re.compile(r"^[DM] ")
         i = 0
         while i < len(lines):
            if (data_p.search(lines[i])):
               j = i + 1
               while (data_p.search(lines[j])):
                  j += 1
               lines[i:j] = sorted(lines[i:j], key=lambda x: x[0])
               i = j - 1
            i += 1
         text = "\n".join(lines)
         text = re.sub(r"^(D|M [0-7]+ [0-9a-f]+) \.(git|weirdal_)ignore$",
                       r"#\g<0>", text, flags=re.MULTILINE)
         #fs.Path("/tmp/new").file_write(text)
         self.git(["fast-import", "--force"], input=text)
         self.git(["reflog", "expire", "--all", "--expire=now"])
         ch.storage.bucache_needs_ignore_upgrade.unlink()

   def find_commit(self, git_id):
      """Return (state ID, commit) of commit-ish git_id, or (None, None) if it
         doesn’t exist."""
      # Note abbreviated commit hash %h is automatically long enough to avoid
      # collisions.
      cp = self.git(["log", "--format=%h%n%B", "-n", "1", git_id],
                    fail_ok=True)
      if (cp.returncode == 0):  # branch exists
         sid = State_ID.from_text(cp.stdout)
         commit = cp.stdout.split("\n", maxsplit=1)[0]
      else:
         sid = None
         commit = None
      ch.VERBOSE("commit-ish %s: %s %s" % (git_id, commit, sid))
      return (sid, commit)

   def find_deleted_image(self, image):
      return self.commit_find_deleted(image.ref.for_path)

   def find_image(self, image):
      """Return (state ID, commit) of branch tip for image, or (None, None) if
         no such branch."""
      return self.find_commit(image.ref.for_path)

   def find_sid(self, sid, branch):
      """Return the hash of the commit matching State_ID, or None if no such
         commit exists. First search branch, then if not found, the
         entire repo including commits not reachable from any branch."""
      commit = self.find_sid_(sid, branch)
      if (commit is None):
         commit = self.find_sid_(sid)
      ch.VERBOSE("commit for %s: %s" % (sid, commit))
      return commit

   def find_sid_(self, sid, branch=None):
      """Return the hash of the most recent commit matching State_ID sid, or
         None if no such commit exists. If branch is given, search only that
         branch; otherwise, search the entire repo, including commits not
         reachable from any branch."""
      argv = ["log", "--grep", sid, "-F", "--format=%h", "-n", "1"]
      if (branch is not None):
         fail_ok = True
         argv += [branch]
      else:
         fail_ok = False
         argv += ["--all", "--reflog"]
      cp = self.git(argv, fail_ok=fail_ok)
      if (cp.returncode != 0 or len(cp.stdout) == 0):
         return None
      else:
         return cp.stdout.split(maxsplit=1)[0]

   def garbageinate(self):
      ch.INFO("collecting cache garbage")
      t = ch.Timer()
      # Expire the reflog with a recent time instead of now in case there is a
      # parallel Git operation in progress.
      self.git(["-c", "gc.bigPackthreshold=0", "-c", "gc.pruneExpire=now",
                "-c", "gc.reflogExpire=now", "gc"], quiet=False)
      t.log("collected garbage")
      t = ch.Timer()
      digests = self.git(["rev-list", "--all", "--reflog",
                                      "--date-order"]).stdout.split("\n")
      assert (digests[-1] == "")  # trailing newline
      digests[-2:] = []           # discard root commit and trailing newline
      p = ch.Progress("enumerating large files", "commits", 1, len(digests))
      larges_used = set()
      for d in digests:
         data = self.git(["show", "%s:%s" % (d, PICKLE_PATH)],
                         encoding=None).stdout
         fm = File_Metadata.unpickle(fs.Path("/DUMMY"), data)
         larges_used |= fm.large_names()
         p.update(1)
      p.done()
      t.log("enumerated large files")
      t = ch.Timer()
      ch.INFO("found %d large files used; deleting others" % len(larges_used))
      for l in ch.storage.build_large.listdir():
         if (l not in larges_used):
            (ch.storage.build_large // l).unlink()
      t.log("deleted unused large files")

   def git(self, argv, cwd=None, quiet=True, *args, **kwargs):
      """Run the given git(1) command with appropriate environment and return
         the resulting CompletedProcess object. If cwd is None, run with CWD
         set to the build cache bare repo; otherwise, it must be the path to
         an unpacked image. If quiet is true, read Git’s stdout and return it
         in cp.stdout; otherwise, leave Git’s stdout unchanged. Any additional
         arguments are passed through to ch.cmd_stdout()."""
      if (cwd is None):
         cwd = self.root
      else:
         if ("env" not in kwargs):
            kwargs["env"] = dict()
         kwargs["env"].update({ "GIT_DIR": str(cwd // im.GIT_DIR),
                                "GIT_WORK_TREE": str(cwd) })
      return (ch.cmd_stdout if quiet else ch.cmd)([git] + argv, cwd=cwd,
                                                  *args, **kwargs)

   def git_prepare(self, unpack_path, files, write=True):
      """Prepare unpack_path for Git operations (see
         File_Metadata.git_prepare() for lots of details). If files is None,
         regenerate self.file_metadata by walking the directory tree.
         Otherwise, update metadata only for files in files."""
      t = ch.Timer()
      if (len(files) == 0):
         self.file_metadata = File_Metadata.git_prepare(unpack_path,
                                                        self.large_threshold)
      else:
         for path in files:
            self.file_metadata.update(path)
      t.log("gathered file metadata")
      if (write):
         self.file_metadata.pickle()

   def git_restore(self, unpack_path, files, quick):
      """Opposite of git_prepare. If files is non-empty, only restore those
         files. If quick, assuming that unpack_path is unchanged since
         file_metadata was collected earlier in this process, rather than the
         directory being checked out from Git, i.e., only restore things that
         we broke in git_prepare() (e.g., renaming .git files), not things
         that Git breaks (e.g., file permissions). Otherwise (i.e., not
         quick), read the File_Metadata tree the pickled file and do a full
         restore. This method will dirty the Git working directory."""
      t = ch.Timer()
      if (not quick):
         self.file_metadata = File_Metadata.unpickle(unpack_path)
      if (len(files) == 0):
         self.file_metadata.git_restore(quick)
      else:
         for path in files:
            self.file_metadata.get(path).git_restore(quick)
      t.log("restored file metadata (%s)" % ("quick" if quick else "full"))

   def pull_eager(self, img, src_ref, last_layer=None):
      """Pull image, always checking if the repository version is newer. This
         is the pull operation invoked from the command line."""
      pullet = pull.Image_Puller(img, src_ref)
      pullet.download()  # will use dlcache if appropriate
      dl_sid = self.sid_from_parent(self.root_id, pullet.sid_input)
      dl_git_hash = self.find_sid(dl_sid, img.ref.for_path)
      if (dl_git_hash is not None):
         # Downloaded image is in cache, check it out.
         ch.INFO("pulled image: found in build cache")
         # Remove tag for previously deleted branch, if it exists.
         self.tag_delete(img.ref.for_path, fail_ok=True)
         self.checkout_ready(img, dl_git_hash)
      else:
         # Unpack and commit downloaded image. This also creates the worktree.
         ch.INFO("pulled image: adding to build cache")
         self.pull_lazy(img, src_ref, last_layer, pullet)

   def pull_lazy(self, img, src_ref, last_layer=None, pullet=None):
      """Pull img from src_ref if it does not exist in the build cache, i.e.,
         do not ask the registry if there is a newer version. This is the pull
         operation invoked by FROM. If pullet is not None, use that
         Image_Puller and do not download anything (i.e., assume
         Image_Puller.download() has already been called)."""
      if (pullet is None):
         # a young hen, especially one less than one year old
         pullet = pull.Image_Puller(img, src_ref)
         pullet.download()
      pullet.unpack(last_layer)
      sid = self.sid_from_parent(self.root_id, pullet.sid_input)
      pullet.done()
      self.worktree_adopt(img, "root")
      commit = self.commit(img.unpack_path, sid, "PULL %s" % src_ref, [])
      self.ready(img)
      if (img.ref != src_ref):
         self.branch_nocheckout(src_ref, img.ref)
      return (sid, commit)

   def ready(self, image):
      (_, git_hash) = self.find_deleted_image(image)
      if (not (git_hash is None)):
         self.tag_delete(image.ref.for_path) # Branch was deleted.
      self.git(["checkout", "-B", self.branch_name_ready(image.ref)],
               cwd=image.unpack_path)
      self.branch_delete(self.branch_name_unready(image.ref))

   def ready_p(self, branch):
      return (not branch.endswith("#"))

   def reset(self):
      if (self.bootstrap_ct >= 1):
         ch.WARNING("not resetting brand-new cache")
      else:
         # Kill any Git garbage collection that may be running, to avoid race
         # conditions while deleting the cache (see issue #1406). Open
         # directly to avoid a TOCTOU race.
         pid_path = ch.storage.build_cache // "gc.pid"
         try:
            fp = open(pid_path, "rt", encoding="UTF-8")
            text = ch.ossafe("can’t read: %s" % pid_path, fp.read)
            pid = int(text.split()[0])
            ch.INFO("stopping build cache garbage collection, PID %d" % pid)
            ch.kill_blocking(pid)
            ch.close_(fp)
         except FileNotFoundError:
            # no PID file, therefore no GC running
            pass
         except OSError as x:
            ch.FATAL("can’t open GC PID file: %s: %s" % (pid_path, x.strerror))
         # Delete images that are worktrees referring back to the build cache.
         ch.INFO("deleting build cache")
         for d in ch.storage.unpack_base.listdir():
            dotgit = ch.storage.unpack_base // d // im.GIT_DIR
            if (os.path.exists(dotgit)):
               ch.VERBOSE("deleting cached image: %s" % d)
               (ch.storage.unpack_base // d).rmtree()
         # Delete build cache.
         self.root.rmtree()
         ch.storage.build_large.rmtree()
         # Create new.
         self.root.mkdir()
         ch.storage.build_large.mkdir()
         self.bootstrap()

   def rollback(self, path):
      """Restore path to the last committed state, including both tracked and
         untracked files."""
      ch.INFO("something went wrong, rolling back ...")
      self.git_prepare(path, [], write=False)
      t = ch.Timer()
      self.git(["reset", "--hard", "HEAD"], cwd=path, quiet=False)
      self.git(["clean", "-fdq"], cwd=path, quiet=False)
      t.log("reverted worktree")
      self.git_restore(path, [], False)

   def sid_from_parent(self, *args):
      # This lets us intercept the call and return None in disabled mode.
      return State_ID.from_parent(*args)

   def status_char(self, miss):
      "Return single character to indicate whether miss is true or not."
      if (miss is None):
         return " "
      elif (miss):
         return "."
      else:
         return "*"

   def summary_print(self):
      # state IDs
      msgs = self.git(["log", "--all", "--reflog", "--format=format:%b"]).stdout
      states = set()
      for msg in msgs.splitlines():
         if (msg != ""):
            states.add(State_ID.from_text(msg))
      # branches (FIXME: how to count unnamed branch tips?)
      image_ct = self.git(["branch", "--list"]).stdout.count("\n")
      # file count and size on disk
      (file_ct, byte_ct) = fs.Path(self.root).du()
      commit_ct = int(self.git(["rev-list", "--all", "--reflog",
                                            "--count"]).stdout)
      (file_ct, file_suffix) = ch.si_decimal(file_ct)
      (byte_ct, byte_suffix) = ch.si_binary_bytes(byte_ct)
      # print it
      print("named images:   %5d" % image_ct)
      print("state IDs:      %5d" % len(states))
      print("large files:    %5d" % len(ch.storage.build_large.listdir()))
      print("commits:        %5d" % commit_ct)
      print("internal files: %5d %s" % (file_ct, file_suffix))
      print("disk used:      %5d %s" % (byte_ct, byte_suffix))
      # some information directly from Git
      if (ch.log_level >= ch.Log_Level.VERBOSE):
         out = self.git(["count-objects", "-vH"]).stdout
         print("Git statistics:")
         print(textwrap.indent(out, "  "), end="")
         out = (self.root // "config").file_read_all()
         print("Git config:")
         print(textwrap.indent(out, "  "), end="")

   def tag_delete(self, tag, *args, **kwargs):
      """Delete specified git tag. Used for recovering deleted branches."""
      return self.git(["tag", "-d", "&%s" % tag], *args, **kwargs)

   def tree_dot(self):
      have_dot()
      path_gv = fs.Path(dot_base + ".gv")
      path_pdf = fs.Path(dot_base + ".pdf")
      if (not path_gv.is_absolute()):
         path_gv = os.getcwd() // path_gv
         path_pdf = os.getcwd() // path_pdf
      ch.VERBOSE("writing %s" % path_gv)
      ch.cmd_quiet(
["git2dot.py",
 "--range", "--all --reflog --topo-order",
 "--font-name", "Nimbus Mono",
 "-d", 'graph[rankdir="TB", bgcolor="white"]',
 "-d", 'edge[dir=forward, arrowsize=0.5]',
 "--bedge", '[color=gray80, dir=none]',
 "--bnode", '[label="{label}", shape=box, height=0.20, color=gray80]',
 "--cnode", '[label="{label}", shape=box, color=black, fillcolor=white]',
 "--mnode", '[label="{label}", shape=box, color=black, fillcolor=white]',
 "-D", "@SID@", "([0-9A-F]{4}):",
 "-l", "@SID@|%s", str(path_gv)], cwd=self.root)
      ch.VERBOSE("writing %s" % path_pdf)
      ch.cmd_quiet(["dot", "-Tpdf", "-o%s" % path_pdf, str(path_gv)])

   def tree_print(self):
      # Note the percent codes are interpreted by Git.
      # See: https://git-scm.com/docs/git-log#_pretty_formats
      args = ["log", "--graph", "--all", "--reflog", "--topo-order"]
      if (ch.log_level == ch.Log_Level.INFO):
         # ref names, subject (instruction), branch heads.
         fmt = "%C(auto)%d %Creset%<|(77,trunc)%s"
         args.append("--decorate-refs=refs/heads")
      else:
         # ref names, short commit hash, subject (instruction), body (state ID)
         # FIXME: The body contains a trailing newline I can’t figure out how
         # to remove.
         fmt = "%C(auto)%d%C(yellow) %h %Creset%s %b"
      self.git(args + ["--format=%s" % fmt], quiet=False)
      print()  # blank line to separate from summary

   def unpack_delete(self, image, missing_ok=False):
      """Wrapper for Image.unpack_delete() that first detaches the work tree's
         head. If we delete an image's unpack path without first detaching HEAD,
         the corresponding work tree must also be deleted before the bucache
         branch. This involves multiple calls to worktrees_fix(), which is
         clunky, so we use this method instead."""
      if (not image.unpack_exist_p and missing_ok):
         return
      (_, commit) = self.find_commit(image.ref.for_path)
      if (commit is not None):
         # Off with her head!
         self.git(["checkout", "%s" % commit], cwd=image.unpack_path)
      image.unpack_delete()

   def unready_of(self, branch):
      if (self.ready_p(branch)):
         return branch + "#"
      else:
         return branch

   def worktree_add(self, image, base):
      if (image.unpack_cache_linked):
         self.git_prepare(image.unpack_path, [], write=False)  # clean worktree
         head = self.worktree_head(image)
         ch.VERBOSE("%s: head %s, want %s" % (image.unpack_path, head, base))
         if (self.commit_hash_p(base) and base == head):
            ch.INFO("using existing image")
         else:
            ch.INFO("updating existing image ...")
            t = ch.Timer()
            self.git(["checkout", "-B", self.branch_name_unready(image.ref),
                      base], cwd=image.unpack_path)
            t.log("adjusted worktree")
      else:
         ch.INFO("copying image from cache ...")
         image.unpack_clear()
         t = ch.Timer()
         self.git(["worktree", "add", "-f", "-B",
                   self.branch_name_unready(image.ref),
                   image.unpack_path, base])
         # Move GIT_DIR from default location to where we want it.
         git_dir_default = image.unpack_path // ".git"
         git_dir_new = image.unpack_path // im.GIT_DIR
         git_dir_new.parent.mkdir()
         git_dir_default.rename(git_dir_new)
         t.log("created worktree")

   def worktree_adopt(self, image, base):
      """Create a new worktree with the contents of existing directory
         image.unpack_path. Note shenanigans because “git worktree add”
         *cannot* use an existing directory but shutil.copytree *must* create
         its own directory (until Python 3.8, and we have to support 3.6). So
         we use some renaming."""
      if (os.path.isdir(ch.storage.image_tmp)):
         ch.WARNING("temporary image still exists, deleting",
                    "maybe a previous command crashed?")
         ch.storage.image_tmp.rmtree()
      image.unpack_path.rename(ch.storage.image_tmp)
      self.worktree_add(image, base)
      (image.unpack_path // im.GIT_DIR).rename(   ch.storage.image_tmp
                                               // im.GIT_DIR)
      image.unpack_path.rmtree()
      ch.storage.image_tmp.rename(image.unpack_path)

   def worktree_head(self, image):
      cp = self.git(["rev-parse", "--short", "HEAD"],
                    fail_ok=True, cwd=image.unpack_path)
      if (cp.returncode != 0):
         return None
      else:
         return cp.stdout.strip()

   def worktrees_fix(self):
      """Git stores pointers (paths) both from the main repository to each
         worktree, and in the other direction from each worktree back to the
         main repo. These are absolute paths, so if the storage directory gets
         moved, they need updating. Also, worktrees can disappear without
         telling Git. This method cleans all that up.

         This method does roughly the same thing as “git worktree repair” and
         “git worktree prune”, but we do it manually [1,2] because we know more
         about what is going on than Git does: (1) which images are worktrees
         vs. plain directories; (2) which images changed from worktree to plain
         directory w/o telling Git; (3) where the worktrees and main repo are
         relative to one another.

         In particular, I don’t see a simple way to trust the exit code of
         “git worktree repair” without doing most of this work first anyway.

         [1]: https://git-scm.com/docs/git-worktree
         [2]: https://git-scm.com/docs/gitrepository-layout"""
      t = ch.Timer()
      wt_actuals = { fs.Path(i).parts[-(len(im.GIT_DIR)+1)]
                     for i in glob.iglob(str(   ch.storage.unpack_base
                                             // "*" // im.GIT_DIR)) }
      wt_gits =    { fs.Path(i).name
                     for i in glob.iglob("%s/worktrees/*" % self.root) }
      # Unlink images that think they are in Git but are not. This should not
      # happen, but it does, and I wasn’t able to figure out how it happened.
      wt_gits_orphaned = wt_actuals - wt_gits
      for img_dir in wt_gits_orphaned:
         link = ch.storage.unpack_base // img_dir // im.GIT_DIR
         ch.WARNING("image erroneously marked cached, fixing: %s" % link,
                    ch.BUG_REPORT_PLZ)
         link.unlink()
      wt_actuals -= wt_gits_orphaned
      # Delete worktree data for images that no longer exist or aren’t
      # Git-enabled any more.
      wt_gits_deleted = wt_gits - wt_actuals
      for wt in wt_gits_deleted:
         (ch.storage.build_cache // "worktrees" // wt).rmtree()
      ch.VERBOSE("deleted %d stale worktree metadatas" % len(wt_gits_deleted))
      wt_gits -= wt_gits_deleted
      # Validate that the pointers are in sync now.
      if (wt_gits != wt_actuals):
         ch.ERROR("found images -> cache links: %s" % " ".join(wt_actuals))
         ch.ERROR("found cache -> images links: %s" % " ".join(wt_gits))
         ch.FATAL("build cache is desynchronized, cannot proceed",
                  ch.BUG_REPORT_PLZ)
      # If storage directory moved, repair all the paths.
      if (len(wt_gits) > 0):
         wt_dir_stored = fs.Path((   ch.storage.build_cache
                                  // "worktrees"
                                  // next(iter(wt_gits))
                                  // "gitdir").file_read_all())
         if (not wt_dir_stored.is_relative_to(ch.storage.root)):
            for wt in wt_actuals:
               wt_repo_dir = ch.storage.build_cache // "worktrees" // wt
               wt_img_git = ch.storage.unpack_base // wt // im.GIT_DIR
               wt_img_git.file_write("gitdir: %s\n" % str(wt_repo_dir))
               (wt_repo_dir // "gitdir").file_write(str(wt_img_git) + "\n")
            ch.VERBOSE("fixed %d worktrees" % len(wt_actuals))
      t.log("re-linked worktrees")


class Rebuild_Cache(Enabled_Cache):

   def __str__(self):
      return ("rebuild (large=%g)" % self.large_threshold)

   def find_sid(self, sid, branch):
      return None


class Disabled_Cache(Rebuild_Cache):

   def __init__(self, *args):
      pass

   def __str__(self):
      return "disabled"

   def branch_nocheckout(self, src_ref, dest):
      pass

   def checkout(self, image, git_hash, base_image):
      ch.INFO("copying image ...")
      image.unpack_clear()
      image.copy_unpacked(base_image)

   def commit(self, path, *args):
      self.permissions_fix(path)
      return None

   def find_image(self, *args):
      return (None, None)

   def permissions_fix(self, path):
      # Some distributions create unreadable files; e.g., CentOS 7 after
      # installing “openssh”:
      #
      #   $ ls -lh /scratch/reidpr.ch/img/centos_7ch/usr/bin/ssh-agent
      #   ---x--s--x 1 reidpr reidpr 374K Nov 24  2021 [...]/ssh-agent
      #
      # This makes the image un-copyable, so it can’t be used as a base image.
      #
      # Enabled_Cache takes care of this in git_prepare(), and
      # --force=fakeroot bypasses it in some other way I haven’t looked into.
      for (dir_, subdirs, files) in ch.walk(path):
         for i in itertools.chain(subdirs, files):
            (dir_ // i).chmod_min()

   def pull_lazy(self, img, src_ref, last_layer=None, pullet=None):
      if (pullet is None and os.path.exists(img.unpack_path)):
         ch.VERBOSE("base image already exists, skipping pull")
      else:
         if (pullet is None):
            pullet = pull.Image_Puller(img, src_ref)
            pullet.download()
         img.unpack_clear()
         pullet.unpack(last_layer)
         pullet.done()
      return (None, None)

   def ready(self, *args):
      pass

   def rollback(self, path):
      self.permissions_fix(path)

   def sid_from_parent(self, *args):
      return None

   def worktree_add(self, *args):
      pass

   def worktree_adopt(self, *args):
      pass

   def worktrees_prune(self, *args):
      pass