1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
import queue
import pickle
import os
import time
class Cache(queue.Queue):
    """
    Store the requests and responses made during a scan.

    wig uses a cache to store the requests and responses made during a
    scan. This helps limit the number of requests that it makes, as a
    request for a resource is only made once.

    To further limit the number of requests, wig saves a copy of the
    cache to disk and will reuse it for scans run within 24 hours.

    The class derives from ``queue.Queue`` but ``_init`` replaces the
    underlying container with a ``dict``, so instances are used as a
    mapping of path -> response that borrows the queue's ``mutex``;
    the inherited ``put()``/``get()`` expect a deque-like container and
    are not meant to be used.

    Cache files are named ``<host>_-_<unix seconds>.cache`` and live in
    ``cache_dir``.
    """

    # pieces of a cache file name: <host><separator><save time><extension>
    _SEPARATOR = '_-_'
    _EXTENSION = '.cache'

    def _init(self, maxsize):
        """
        Set up the cache state; invoked by ``queue.Queue.__init__``.

        ``maxsize`` is accepted because the base class passes it, but it
        is not used. As a side effect the cache directory is created if
        missing and cache files older than ``cache_ttl`` are deleted.
        """
        self.queue = dict()
        self.host = None
        self.cache_dir = './cache/'
        self.cache_name = ''

        # whole seconds since the epoch, kept as a string because it is
        # embedded in the cache file name
        self.now = str(int(time.time()))
        self.printer = None

        # only load cache data that is newer than this
        # (currently this is set for 24 hours)
        self.cache_ttl = 60*60*24

        # check if cache dir exists - create if not
        self._check_or_create_cache()

        # check if there are caches that are older than ttl
        self._remove_old_caches()

    def __getitem__(self, path):
        """Return the response stored for ``path`` (``KeyError`` if absent)."""
        return self.queue[path]

    def __setitem__(self, path, response):
        """Store ``response`` under ``path`` while holding the mutex."""
        with self.mutex:
            self.queue[path] = response

    def __contains__(self, url):
        """Return True if ``url`` has an entry in the cache."""
        with self.mutex:
            return url in self.queue

    @staticmethod
    def _parse_cache_file_name(cache_file):
        """
        Split a cache file name into ``(hostname, save_time)``.

        Returns ``None`` when ``cache_file`` is not a well-formed cache
        file name (wrong extension, missing separator or a non-numeric
        save time), so callers can skip stray files instead of crashing.
        The split is done on the LAST separator so that a host part which
        itself contains ``_-_`` is still handled.
        """
        if not cache_file.endswith(Cache._EXTENSION):
            return None

        stem = cache_file[:-len(Cache._EXTENSION)]
        hostname, separator, save_time = stem.rpartition(Cache._SEPARATOR)
        if not separator:
            return None

        try:
            return hostname, int(save_time)
        except ValueError:
            return None

    def _host_part_of_cache_name(self):
        """Return the host part of ``cache_name`` ('' if no host is set)."""
        return self.cache_name.rpartition(self._SEPARATOR)[0]

    def _check_or_create_cache(self):
        """Create the cache directory if it does not exist yet."""
        # exist_ok avoids failing if the directory appears between a
        # separate existence check and the creation
        os.makedirs(self.cache_dir, exist_ok=True)

    def _remove_old_caches(self):
        """Delete every cache file in ``cache_dir`` older than ``cache_ttl``."""
        # bail if the directory does not exist
        if not os.path.exists(self.cache_dir):
            return None

        now = int(self.now)
        for cache_file in os.listdir(self.cache_dir):
            # skip anything that is not a well-formed cache file
            parsed = self._parse_cache_file_name(cache_file)
            if parsed is None:
                continue

            # check the age of the cache, and remove it if older than ttl
            _, save_time = parsed
            if now - save_time > self.cache_ttl:
                os.remove(os.path.join(self.cache_dir, cache_file))

    def _get_name_for_cache_file(self):
        """
        Return the path the cache should be saved to.

        If a cache file for the current host already exists its path is
        returned (it will be overwritten); otherwise a new path built
        from ``cache_name`` is returned. The cache directory is created
        if it is missing.
        """
        self._check_or_create_cache()

        own_host = self._host_part_of_cache_name()
        for cache_file in os.listdir(self.cache_dir):
            # skip anything that is not a well-formed cache file
            parsed = self._parse_cache_file_name(cache_file)
            if parsed is None:
                continue

            # reuse the file if the cache is for the host
            if parsed[0] == own_host:
                return os.path.join(self.cache_dir, cache_file)

        # if there aren't any previous cache files, generate a
        # new name for the cache
        return os.path.join(self.cache_dir, self.cache_name)

    def set_host(self, host):
        """
        Set the host being scanned and derive the cache file name from it.

        '/' is dropped and ':' is replaced by '..' to build the host part
        of the file name.
        """
        self.host = host
        host_part = self.host.replace('/', '').replace(':', '..')
        self.cache_name = host_part + self._SEPARATOR + self.now + self._EXTENSION

    def get_num_urls(self):
        """Return the number of distinct ``id`` values among the responses."""
        return len({response.id for response in self.queue.values()})

    def get_urls(self):
        """Return a list of all cached keys."""
        return list(self.queue)

    def get_responses(self):
        """Return a list of all cached responses."""
        return list(self.queue.values())

    def save(self):
        """
        Pickle the cache to disk for later use.

        This helps limit the amount of requests made when scanning the
        same site multiple times. Pickling and write errors are reported
        through ``printer`` (when set) rather than raised; errors from
        opening the file propagate to the caller.
        """
        with self.mutex:
            file_name = self._get_name_for_cache_file()

            # serialize BEFORE opening the file: opening with 'wb'
            # truncates, so a pickling failure would otherwise destroy a
            # previously saved, still valid cache
            try:
                payload = pickle.dumps(self.queue)
            except Exception:
                if self.printer:
                    self.printer.print_debug_line('Error saving cache', 1)
                return None

            with open(file_name, 'wb') as cache_file:
                try:
                    cache_file.write(payload)
                except Exception:
                    if self.printer:
                        self.printer.print_debug_line('Error saving cache', 1)
                else:
                    if self.printer:
                        self.printer.print_debug_line('Saved cache to: %s' % (file_name, ), 1)

    def load(self):
        """
        Load previously saved cache entries for the current host.

        Every cache file in ``cache_dir`` whose host part matches and
        whose age is below ``cache_ttl`` is merged into the in-memory
        cache. Does nothing if no host is set or the directory is
        missing. Load errors are reported through ``printer`` (when set)
        rather than raised.
        """
        # bail if the host is not set
        if self.host is None:
            return None

        # bail if the directory does not exist
        if not os.path.exists(self.cache_dir):
            return None

        own_host = self._host_part_of_cache_name()
        now = int(self.now)

        # search the cache dir
        for cache_file in os.listdir(self.cache_dir):
            # skip anything that is not a well-formed cache file
            parsed = self._parse_cache_file_name(cache_file)
            if parsed is None:
                continue

            # only use the file if it's for the host and not too old
            hostname, save_time = parsed
            if hostname != own_host or now - save_time >= self.cache_ttl:
                continue

            file_name = os.path.join(self.cache_dir, cache_file)
            try:
                # NOTE(security): unpickling can execute arbitrary code;
                # this trusts that nobody else can write to cache_dir
                with open(file_name, 'rb') as handle:
                    data = pickle.load(handle)
                for path in data:
                    self[path] = data[path]
            except Exception:
                if self.printer:
                    self.printer.print_debug_line('Error loading cache', 1)
            else:
                if self.printer:
                    self.printer.print_debug_line('Loaded cache from: %s' % (cache_file, ), 1)
|