File: coalesce.py

package info (click to toggle)
cde 0.1%2Bgit9-g551e54d-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 11,176 kB
  • sloc: ansic: 75,885; sh: 4,282; python: 1,006; perl: 438; makefile: 297; lisp: 44; java: 5
file content (153 lines) | stat: -rw-r--r-- 4,616 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# dependencies: bsdiff, md5sum

import os, sys, stat
from subprocess import *

# Directory tree to analyse (the cde-root/ of a package), taken from the
# first command-line argument.  Validate it explicitly rather than with
# 'assert', which is stripped when Python runs with -O, and give a usage
# message instead of a bare IndexError when the argument is missing.
if len(sys.argv) < 2:
  sys.exit("usage: %s <cde-root directory>" % sys.argv[0])

my_root = sys.argv[1]
if not os.path.isdir(my_root):
  sys.exit("error: %s is not a directory" % my_root)

# When True, print a per-file trace and a labelled total; when False, only
# the total number of saved bytes is printed.
DEBUG = False

# Base names that are never considered for coalescing: they take forever to
# search through and don't result in much savings.
ignores = ['__init__.py', '__init__.pyc']

# Index of the files found on the native system.
# Key:   base filename
# Value: list of directories in which a file with that name was found
system_filenames = {}

# System directories that are scanned (recursively) to build the index.
basedirs = ['/bin', '/lib', '/lib64', '/usr/bin', '/usr/lib', '/usr/lib64']

for top in basedirs:
  # not every system has every one of these directories
  if not os.path.isdir(top):
    continue
  for (cur_dir, _subdirs, names) in os.walk(top):
    for name in names:
      system_filenames.setdefault(name, []).append(cur_dir)


# Candidate pairs for coalescing.
# Key:   full path to a regular file within cde-root/
# Value: list of candidate native-system files (full paths, regular files
#        only) that the key might be coalesced with
coalescing_candidates = {}

for (pkg_dir, _subdirs, names) in os.walk(my_root):
  for name in names:
    if name in ignores:
      continue

    # A file is worth examining if the system has a file with exactly the
    # same name or, failing that, if it looks like a library ('lib...').
    exact = name in system_filenames
    if not (exact or name.startswith('lib')):
      continue

    pkg_path = os.path.join(pkg_dir, name)
    # lstat, not stat: symlinks must not be followed
    if not stat.S_ISREG(os.lstat(pkg_path).st_mode):
      continue

    if exact:
      sys_names = [name]
    else:
      # Fuzzy search for libraries: take everything up to AND including the
      # first '-' or '.' and look for system files sharing that prefix, which
      # picks up 'variants' of a library name.  Keeping the separator avoids
      # spurious matches like libc-2.7.so and libcrypto.so.0.9.8.
      cuts = [pos for pos in (name.find('-'), name.find('.')) if pos >= 0]
      if cuts:
        cut = min(cuts)
      else:
        cut = len(name) + 1 # no separator at all: the prefix is the whole name
      prefix = name[:cut + 1]
      sys_names = [k for k in system_filenames if k.startswith(prefix)]

    for sys_name in sys_names:
      for sys_dir in system_filenames[sys_name]:
        sys_path = os.path.join(sys_dir, sys_name)
        # only regular files (NOT symlinks) are candidates
        if stat.S_ISREG(os.lstat(sys_path).st_mode):
          coalescing_candidates.setdefault(pkg_path, []).append(sys_path)


# NOTE: every print below uses the single-argument form print("..." % ...),
# which produces the same output under Python 2 and Python 3.
if DEBUG:
  print("%d candidates for coalescing" % len(coalescing_candidates))


# Running total, in bytes, of the best per-file savings found below.
total_savings = 0

# Scratch file that receives each candidate patch written by bsdiff.
# NOTE(review): this is a fixed, predictable path in /tmp, so two concurrent
# runs would overwrite each other's patch; consider the tempfile module.
patch_path = '/tmp/cur.patch'

# For each packaged file x, find the system file y that would save the most
# bytes if x were stored as a bsdiff patch against y (or dropped entirely when
# the two are identical).  Identical files are detected with md5sum first,
# because bsdiff does horribly on identical inputs.
for (x, y_lst) in coalescing_candidates.items():
  best_match = None
  best_match_savings = 0

  if DEBUG: print("Trying: %s" % x)

  for y in y_lst:

    (stdout, stderr) = Popen(['md5sum', y, x], stdout=PIPE, stderr=PIPE).communicate()

    # md5sum prints one "<digest>  <path>" line per file it could read, in
    # argument order.  A digest is prefixed with a backslash when the file
    # name needed escaping, so strip that before comparing.  The output is
    # handled as bytes so that this works on both Python 2 and 3.
    digests = [ln.split()[0].lstrip(b'\\')
               for ln in stdout.split(b'\n') if ln.strip()]

    # sometimes you don't have permissions to read one (or both) of the
    # files, so there are fewer than two digests; just move on ...
    if len(digests) != 2:
      continue
    (y_md5, x_md5) = digests

    pkg_size = os.stat(x).st_size

    if x_md5 == y_md5:
      if DEBUG: print("EXACT MATCH! %s" % y)
      best_match = y
      best_match_savings = pkg_size # the whole file can be dropped
      break # break out of this loop altogether

    # remove any stale patch so that a failed bsdiff run cannot be mistaken
    # for a successful one
    if os.path.exists(patch_path):
      os.remove(patch_path)
    # pass in the system's version of the file as the first arg
    bsdiff = Popen(['./bsdiff', y, x, patch_path], stdout=PIPE, stderr=PIPE)
    bsdiff.communicate()

    # sometimes bsdiff fails: either it reports an error through its exit
    # status (possibly leaving a partial patch behind) or it writes no patch
    if bsdiff.returncode != 0 or not os.path.exists(patch_path):
      sys.stderr.write("Error in bsdiff: %s %s\n" % (y, x))
      continue

    savings = pkg_size - os.stat(patch_path).st_size
    if savings < 0:
      # the patch is bigger than the file itself, so y is useless for x
      if DEBUG: print("WARNING: %s" % y)
    elif savings > best_match_savings:
      best_match_savings = savings
      best_match = y
      if DEBUG: print("  better: %s" % y)

  if DEBUG: print("Best match: %s" % best_match)
  if DEBUG: print("  bytes saved: %s" % best_match_savings)
  if DEBUG: print("---")
  total_savings += best_match_savings


# don't leave the scratch patch behind
if os.path.exists(patch_path):
  os.remove(patch_path)

if DEBUG:
  print("Total saved bytes: %s" % total_savings)
else:
  print(total_savings)