File: scoring.py

package info (click to toggle)
zxcvbn-python 4.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,880 kB
  • sloc: python: 2,222; makefile: 3
file content (424 lines) | stat: -rw-r--r-- 14,299 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
from math import log, factorial

import re

from .adjacency_graphs import ADJACENCY_GRAPHS

from decimal import Decimal


def calc_average_degree(graph):
    average = 0

    for key, neighbors in graph.items():
        average += len([n for n in neighbors if n])
    average /= float(len(graph.items()))

    return average


BRUTEFORCE_CARDINALITY = 10
MIN_GUESSES_BEFORE_GROWING_SEQUENCE = 10000
MIN_SUBMATCH_GUESSES_SINGLE_CHAR = 10
MIN_SUBMATCH_GUESSES_MULTI_CHAR = 50

MIN_YEAR_SPACE = 20
REFERENCE_YEAR = 2017


def nCk(n, k):
    """http://blog.plover.com/math/choose.html"""
    if k > n:
        return 0
    if k == 0:
        return 1

    r = 1
    for d in range(1, k + 1):
        r *= n
        r /= d
        n -= 1

    return r


# ------------------------------------------------------------------------------
# search --- most guessable match sequence -------------------------------------
# ------------------------------------------------------------------------------
#
# takes a sequence of overlapping matches, returns the non-overlapping sequence with
# minimum guesses. the following is a O(l_max * (n + m)) dynamic programming algorithm
# for a length-n password with m candidate matches. l_max is the maximum optimal
# sequence length spanning each prefix of the password. In practice it rarely exceeds 5 and the
# search terminates rapidly.
#
# the optimal "minimum guesses" sequence is here defined to be the sequence that
# minimizes the following function:
#
#    g = l! * Product(m.guesses for m in sequence) + D^(l - 1)
#
# where l is the length of the sequence.
#
# the factorial term is the number of ways to order l patterns.
#
# the D^(l-1) term is another length penalty, roughly capturing the idea that an
# attacker will try lower-length sequences first before trying length-l sequences.
#
# for example, consider a sequence that is date-repeat-dictionary.
#  - an attacker would need to try other date-repeat-dictionary combinations,
#    hence the product term.
#  - an attacker would need to try repeat-date-dictionary, dictionary-repeat-date,
#    ..., hence the factorial term.
#  - an attacker would also likely try length-1 (dictionary) and length-2 (dictionary-date)
#    sequences before length-3. assuming at minimum D guesses per pattern type,
#    D^(l-1) approximates Sum(D^i for i in [1..l-1]
#
# ------------------------------------------------------------------------------
def most_guessable_match_sequence(password, matches, _exclude_additive=False):
    n = len(password)

    # partition matches into sublists according to ending index j
    matches_by_j = [[] for _ in range(n)]
    try:
        for m in matches:
            matches_by_j[m['j']].append(m)
    except TypeError:
        pass
    # small detail: for deterministic output, sort each sublist by i.
    for lst in matches_by_j:
        lst.sort(key=lambda m1: m1['i'])

    optimal = {
        # optimal.m[k][l] holds final match in the best length-l match sequence
        # covering the password prefix up to k, inclusive.
        # if there is no length-l sequence that scores better (fewer guesses)
        # than a shorter match sequence spanning the same prefix,
        # optimal.m[k][l] is undefined.
        'm': [{} for _ in range(n)],

        # same structure as optimal.m -- holds the product term Prod(m.guesses
        # for m in sequence). optimal.pi allows for fast (non-looping) updates
        # to the minimization function.
        'pi': [{} for _ in range(n)],

        # same structure as optimal.m -- holds the overall metric.
        'g': [{} for _ in range(n)],
    }

    # helper: considers whether a length-l sequence ending at match m is better
    # (fewer guesses) than previously encountered sequences, updating state if
    # so.
    def update(m, l):
        k = m['j']
        pi = estimate_guesses(m, password)
        if l > 1:
            # we're considering a length-l sequence ending with match m:
            # obtain the product term in the minimization function by
            # multiplying m's guesses by the product of the length-(l-1)
            # sequence ending just before m, at m.i - 1.
            pi = pi * Decimal(optimal['pi'][m['i'] - 1][l - 1])
        # calculate the minimization func
        g = factorial(l) * pi
        if not _exclude_additive:
            g += MIN_GUESSES_BEFORE_GROWING_SEQUENCE ** (l - 1)

        # update state if new best.
        # first see if any competing sequences covering this prefix, with l or
        # fewer matches, fare better than this sequence. if so, skip it and
        # return.
        for competing_l, competing_g in optimal['g'][k].items():
            if competing_l > l:
                continue
            if competing_g <= g:
                return

        # this sequence might be part of the final optimal sequence.
        optimal['g'][k][l] = g
        optimal['m'][k][l] = m
        optimal['pi'][k][l] = pi

    # helper: evaluate bruteforce matches ending at k.
    def bruteforce_update(k):
        # see if a single bruteforce match spanning the k-prefix is optimal.
        m = make_bruteforce_match(0, k)
        update(m, 1)
        for i in range(1, k + 1):
            # generate k bruteforce matches, spanning from (i=1, j=k) up to
            # (i=k, j=k). see if adding these new matches to any of the
            # sequences in optimal[i-1] leads to new bests.
            m = make_bruteforce_match(i, k)
            for l, last_m in optimal['m'][i - 1].items():
                l = int(l)

                # corner: an optimal sequence will never have two adjacent
                # bruteforce matches. it is strictly better to have a single
                # bruteforce match spanning the same region: same contribution
                # to the guess product with a lower length.
                # --> safe to skip those cases.
                if last_m.get('pattern', False) == 'bruteforce':
                    continue

                # try adding m to this length-l sequence.
                update(m, l + 1)

    # helper: make bruteforce match objects spanning i to j, inclusive.
    def make_bruteforce_match(i, j):
        return {
            'pattern': 'bruteforce',
            'token': password[i:j + 1],
            'i': i,
            'j': j,
        }

    # helper: step backwards through optimal.m starting at the end,
    # constructing the final optimal match sequence.
    def unwind(n):
        if n == 0:
            # return empty list for zero-length password
            return []

        optimal_match_sequence = []
        k = n - 1
        # find the final best sequence length and score
        l = None
        g = float('inf')
        for candidate_l, candidate_g in optimal['g'][k].items():
            if candidate_g < g:
                l = candidate_l
                g = candidate_g

        while k >= 0:
            m = optimal['m'][k][l]
            optimal_match_sequence.insert(0, m)
            k = m['i'] - 1
            l -= 1

        return optimal_match_sequence

    for k in range(n):
        for m in matches_by_j[k]:
            if m['i'] > 0:
                for l in optimal['m'][m['i'] - 1]:
                    l = int(l)
                    update(m, l + 1)
            else:
                update(m, 1)
        bruteforce_update(k)

    optimal_match_sequence = unwind(n)
    optimal_l = len(optimal_match_sequence)

    # corner: empty password
    if len(password) == 0:
        guesses = 1
    else:
        guesses = optimal['g'][n - 1][optimal_l]

    # final result object
    return {
        'password': password,
        'guesses': guesses,
        'guesses_log10': log(guesses, 10),
        'sequence': optimal_match_sequence,
    }


def estimate_guesses(match, password):
    if match.get('guesses', False):
        return Decimal(match['guesses'])

    min_guesses = 1
    if len(match['token']) < len(password):
        if len(match['token']) == 1:
            min_guesses = MIN_SUBMATCH_GUESSES_SINGLE_CHAR
        else:
            min_guesses = MIN_SUBMATCH_GUESSES_MULTI_CHAR

    estimation_functions = {
        'bruteforce': bruteforce_guesses,
        'dictionary': dictionary_guesses,
        'spatial': spatial_guesses,
        'repeat': repeat_guesses,
        'sequence': sequence_guesses,
        'regex': regex_guesses,
        'date': date_guesses,
    }

    guesses = estimation_functions[match['pattern']](match)
    match['guesses'] = max(guesses, min_guesses)
    match['guesses_log10'] = log(match['guesses'], 10)

    return Decimal(match['guesses'])


def bruteforce_guesses(match):
    guesses = BRUTEFORCE_CARDINALITY ** len(match['token'])
    # small detail: make bruteforce matches at minimum one guess bigger than
    # smallest allowed submatch guesses, such that non-bruteforce submatches
    # over the same [i..j] take precedence.
    if len(match['token']) == 1:
        min_guesses = MIN_SUBMATCH_GUESSES_SINGLE_CHAR + 1
    else:
        min_guesses = MIN_SUBMATCH_GUESSES_MULTI_CHAR + 1

    return max(guesses, min_guesses)


def dictionary_guesses(match):
    # keep these as properties for display purposes
    match['base_guesses'] = match['rank']
    match['uppercase_variations'] = uppercase_variations(match)
    match['l33t_variations'] = l33t_variations(match)
    reversed_variations = match.get('reversed', False) and 2 or 1

    return match['base_guesses'] * match['uppercase_variations'] * \
        match['l33t_variations'] * reversed_variations


def repeat_guesses(match):
    return match['base_guesses'] * Decimal(match['repeat_count'])


def sequence_guesses(match):
    first_chr = match['token'][:1]
    # lower guesses for obvious starting points
    if first_chr in ['a', 'A', 'z', 'Z', '0', '1', '9']:
        base_guesses = 4
    else:
        if re.compile(r'\d').match(first_chr):
            base_guesses = 10  # digits
        else:
            # could give a higher base for uppercase,
            # assigning 26 to both upper and lower sequences is more
            # conservative.
            base_guesses = 26
    if not match['ascending']:
        base_guesses *= 2

    return base_guesses * len(match['token'])


def regex_guesses(match):
    char_class_bases = {
        'alpha_lower': 26,
        'alpha_upper': 26,
        'alpha': 52,
        'alphanumeric': 62,
        'digits': 10,
        'symbols': 33,
    }
    if match['regex_name'] in char_class_bases:
        return char_class_bases[match['regex_name']] ** len(match['token'])
    elif match['regex_name'] == 'recent_year':
        # conservative estimate of year space: num years from REFERENCE_YEAR.
        # if year is close to REFERENCE_YEAR, estimate a year space of
        # MIN_YEAR_SPACE.
        year_space = abs(int(match['regex_match'].group(0)) - REFERENCE_YEAR)
        year_space = max(year_space, MIN_YEAR_SPACE)

        return year_space


def date_guesses(match):
    year_space = max(abs(match['year'] - REFERENCE_YEAR), MIN_YEAR_SPACE)
    guesses = year_space * 365
    if match.get('separator', False):
        guesses *= 4

    return guesses


KEYBOARD_AVERAGE_DEGREE = calc_average_degree(ADJACENCY_GRAPHS['qwerty'])
# slightly different for keypad/mac keypad, but close enough
KEYPAD_AVERAGE_DEGREE = calc_average_degree(ADJACENCY_GRAPHS['keypad'])

KEYBOARD_STARTING_POSITIONS = len(ADJACENCY_GRAPHS['qwerty'].keys())
KEYPAD_STARTING_POSITIONS = len(ADJACENCY_GRAPHS['keypad'].keys())


def spatial_guesses(match):
    if match['graph'] in ['qwerty', 'dvorak']:
        s = KEYBOARD_STARTING_POSITIONS
        d = KEYBOARD_AVERAGE_DEGREE
    else:
        s = KEYPAD_STARTING_POSITIONS
        d = KEYPAD_AVERAGE_DEGREE
    guesses = 0
    L = len(match['token'])
    t = match['turns']
    # estimate the number of possible patterns w/ length L or less with t turns
    # or less.
    for i in range(2, L + 1):
        possible_turns = min(t, i - 1) + 1
        for j in range(1, possible_turns):
            guesses += nCk(i - 1, j - 1) * s * pow(d, j)
    # add extra guesses for shifted keys. (% instead of 5, A instead of a.)
    # math is similar to extra guesses of l33t substitutions in dictionary
    # matches.
    if match['shifted_count']:
        S = match['shifted_count']
        U = len(match['token']) - match['shifted_count']  # unshifted count
        if S == 0 or U == 0:
            guesses *= 2
        else:
            shifted_variations = 0
            for i in range(1, min(S, U) + 1):
                shifted_variations += nCk(S + U, i)
            guesses *= shifted_variations

    return guesses


START_UPPER = re.compile(r'^[A-Z][^A-Z]+$')
END_UPPER = re.compile(r'^[^A-Z]+[A-Z]$')
ALL_UPPER = re.compile(r'^[^a-z]+$')
ALL_LOWER = re.compile(r'^[^A-Z]+$')


def uppercase_variations(match):
    word = match['token']

    if ALL_LOWER.match(word) or word.lower() == word:
        return 1

    for regex in [START_UPPER, END_UPPER, ALL_UPPER]:
        if regex.match(word):
            return 2

    U = sum(1 for c in word if c.isupper())
    L = sum(1 for c in word if c.islower())
    variations = 0
    for i in range(1, min(U, L) + 1):
        variations += nCk(U + L, i)

    return variations


def l33t_variations(match):
    if not match.get('l33t', False):
        return 1

    variations = 1

    for subbed, unsubbed in match['sub'].items():
        # lower-case match.token before calculating: capitalization shouldn't
        # affect l33t calc.
        chrs = list(match['token'].lower())
        S = sum(1 for chr in chrs if chr == subbed)
        U = sum(1 for chr in chrs if chr == unsubbed)
        if S == 0 or U == 0:
            # for this sub, password is either fully subbed (444) or fully
            # unsubbed (aaa) treat that as doubling the space (attacker needs
            # to try fully subbed chars in addition to unsubbed.)
            variations *= 2
        else:
            # this case is similar to capitalization:
            # with aa44a, U = 3, S = 2, attacker needs to try unsubbed + one
            # sub + two subs
            p = min(U, S)
            possibilities = 0
            for i in range(1, p + 1):
                possibilities += nCk(U + S, i)
            variations *= possibilities

    return variations