File: find_file_by_chksum.py

package info (click to toggle)
worker 5.2.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 14,300 kB
  • sloc: cpp: 131,702; sh: 5,166; yacc: 1,616; makefile: 786; lex: 278; python: 85; xml: 45
file content (106 lines) | stat: -rwxr-xr-x 3,560 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/python3
#
# This script outputs all files matching a given checksum

from __future__ import print_function
import os
import hashlib
from optparse import OptionParser

class FindByChksum:
    """Find files below a directory whose checksum starts with a given value.

    The directory tree is walked recursively, every regular file is hashed
    with the selected algorithm, and a file is reported when its hex digest
    starts with the requested checksum. A complete digest is simply the
    longest possible prefix, so full and partial checksums are both handled
    by the same comparison. An empty checksum is a prefix of every digest
    and therefore matches every file that could be hashed.
    """

    # Supported checksum types mapped to their hashlib constructors. Storing
    # the constructors (not instances) means only the requested algorithm is
    # ever instantiated and every file gets a fresh hasher.
    _HASHERS = {
        'sha256': hashlib.sha256,
        'md5': hashlib.md5,
    }

    # Number of bytes requested per read() call while hashing.
    _CHUNK_SIZE = 4096

    def __init__( self, options, args ):
        """Store the search configuration.

        options must provide the attributes verbose, chksum, type and
        hash_limit (hash_limit may be an int or a numeric string; a value
        <= 0 means "hash the whole file"). args[0] is the directory to
        search.

        Raises IndexError if args is empty and ValueError if hash_limit
        cannot be converted to an integer.
        """
        self.__options = options
        self.__args = args

        self.__verbose = options.verbose
        # hexdigest() always yields lowercase hex, so normalize the wanted
        # checksum once here; otherwise an uppercase or whitespace-padded
        # value could never match.
        self.__chksum = ( options.chksum or "" ).strip().lower()
        self.__type = options.type
        self.__hash_limit = int( options.hash_limit )

        self.__dir = args[0]

    def get_hash( self, filename ):
        """Return the hex digest of filename, or None if it cannot be hashed.

        Only the first hash_limit bytes are hashed when the limit is
        positive. None is returned when filename is not a regular file,
        when the configured checksum type is not supported, or when the
        file cannot be opened or read.
        """
        if not os.path.isfile( filename ):
            return None

        make_hasher = self._HASHERS.get( self.__type )
        if make_hasher is None:
            return None

        hasher = make_hasher()
        limit = self.__hash_limit
        bytes_read = 0
        try:
            with open( filename, "rb" ) as fh:
                while limit <= 0 or bytes_read < limit:
                    want = self._CHUNK_SIZE
                    if limit > 0:
                        # never read past the configured limit
                        want = min( want, limit - bytes_read )
                    buf = fh.read( want )
                    if not buf:
                        break
                    hasher.update( buf )
                    bytes_read += len( buf )
        except ( IOError, OSError ):
            # Unreadable or vanished file: report "no checksum" so the
            # caller skips it instead of aborting the whole scan.
            return None
        return hasher.hexdigest()

    def get_file_list( self, base_dir ):
        """Return the paths of all files below base_dir matching the checksum.

        Files for which no checksum could be computed are skipped. Progress
        messages are printed when verbose mode is enabled.
        """
        res = []
        for base, _dirs, files in os.walk( base_dir ):
            for f in files:
                full_name = os.path.join( base, f )

                if self.__verbose:
                    print( "Processing %s" % ( full_name ) )

                h = self.get_hash( full_name )
                if h is None:
                    if self.__verbose:
                        print( "Skip %s due to failed chksum" % ( full_name ) )
                    continue

                if self.__verbose:
                    print( "Got chksum %s for %s" % ( h, full_name ) )

                # startswith() also covers the exact-match case
                if h.startswith( self.__chksum ):
                    res.append( full_name )
        return res

    def find( self ):
        """Search the configured directory and print one matching path per line."""
        if self.__verbose:
            print( "Finding in %s" % ( self.__dir ) )

        for f in self.get_file_list( self.__dir ):
            print( f )

def build_parser():
    """Build the command line parser for this script.

    Options: -v/--verbose (flag), -l/--limit (integer number of bytes to
    hash per file, default 0), -s/--sum (checksum or checksum prefix,
    default empty) and -t/--type (sha256 or md5, default sha256). Invalid
    values for --limit or --type make optparse print a usage error and
    exit instead of failing later with a traceback or silently matching
    nothing.
    """
    parser = OptionParser( usage = "usage: %prog [options] <dir>" )
    parser.add_option( "-v", "--verbose",
                       action  = "store_true",
                       dest    = "verbose",
                       default = False,
                       help    = "verbose output" )
    parser.add_option( "-l", "--limit",
                       action  = "store",
                       type    = "int",
                       dest    = "hash_limit",
                       default = 0,
                       help    = "set number of bytes used for hashing" )
    parser.add_option( "-s", "--sum",
                       action  = "store",
                       dest    = "chksum",
                       default = "",
                       help    = "set the chksum (or prefix of it) for comparison" )
    parser.add_option( "-t", "--type",
                       action  = "store",
                       type    = "choice",
                       choices = [ "sha256", "md5" ],
                       dest    = "type",
                       default = "sha256",
                       help    = "set the chksum type (sha256, md5)" )
    return parser

if __name__ == "__main__":
    parser = build_parser()

    ( options, args ) = parser.parse_args()

    # Without a directory the search has nothing to work on; report a
    # usage error instead of crashing with an IndexError.
    if not args:
        parser.error( "missing <dir> argument" )

    s = FindByChksum( options, args )

    s.find()