1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
# coding:utf-8
import re
import string
import random
import six
from .. import *
from nose.tools import assert_equal, assert_not_equal, ok_
from nose.tools import nottest
from flanker.addresslib import validate
from flanker.addresslib import corrector
# Matches a line whose first non-whitespace character is '#'. The fixture
# loops below use it to skip comment lines in the typo test sets.
COMMENT = re.compile(r'''\s*#''')
@nottest
def generate_mutated_string(source_str, num):
    """Return a copy of ``source_str`` with ``num`` characters substituted.

    ``num`` distinct positions are chosen at random and each one is
    overwritten with a distinct lowercase ASCII letter that does not occur
    anywhere in ``source_str``, so every chosen position really changes.

    Raises ``ValueError`` (from ``random.sample``) if ``num`` is larger than
    ``len(source_str)`` or than the number of unused lowercase letters.
    """
    # Build the replacement pool with an explicit filter. The earlier
    # Python 3 branch called ``str.translate(source_str + '.')``, which
    # treats its argument as a lookup table indexed by code point rather
    # than as a set of characters to delete; nothing was removed, so a
    # "mutation" could silently reproduce the original character.
    excluded = set(source_str)
    rchars = [c for c in string.ascii_lowercase if c not in excluded]

    positions = random.sample(range(len(source_str)), num)
    replacements = random.sample(rchars, num)

    letters = list(source_str)
    for position, new_char in zip(positions, replacements):
        letters[position] = new_char
    return ''.join(letters)
@nottest
def generate_longer_string(source_str, num):
    """Return ``source_str`` with ``num`` random characters prepended.

    Each prepended character is a lowercase ASCII letter that does not occur
    anywhere in ``source_str``; letters may repeat within the prefix.

    Raises ``IndexError`` (from ``random.choice``) if ``num`` is positive and
    ``source_str`` already contains every lowercase ASCII letter.
    """
    # Build the candidate pool with an explicit filter. The earlier
    # Python 3 branch called ``str.translate(source_str + '.')``, which
    # treats its argument as a lookup table indexed by code point rather
    # than as a set of characters to delete, so no letter was excluded.
    excluded = set(source_str)
    rchars = [c for c in string.ascii_lowercase if c not in excluded]

    prefix = [random.choice(rchars) for _ in range(num)]
    return ''.join(prefix) + source_str
@nottest
def generate_shorter_string(source_str, num):
    """Return ``source_str`` truncated so its last ``num`` characters are cut.

    The end of the slice is computed as ``len(source_str) - num``, so
    ``num == 0`` returns the string unchanged.
    """
    cut_at = len(source_str) - num
    return source_str[:cut_at]
@nottest
def domain_generator(size=6, chars=string.ascii_letters + string.digits):
    """Return a random ``.com`` domain name.

    The label in front of ``.com`` consists of ``size`` characters, each
    drawn independently with ``random.choice`` from ``chars``.
    """
    label = ''.join([random.choice(chars) for _ in range(size)])
    return label + '.com'
def test_domain_typo_valid_set():
    """Check suggestions against the DOMAIN_TYPO_VALID_TESTS fixture.

    Every non-blank, non-comment fixture line is split on ``','``; the first
    field is used as the mistyped domain and the second as the expected
    correction. The test passes when more than 90% of the suggestions
    returned by ``validate.suggest_alternate`` equal the expected address.
    """
    hits = 0
    attempts = 0
    print('')

    for raw_line in DOMAIN_TYPO_VALID_TESTS.split('\n'):
        entry = raw_line.strip()

        # blank lines and '#' comment lines carry no test case
        if entry == '' or COMMENT.match(entry):
            continue

        fields = entry.split(',')
        typo_addr = 'username@' + fields[0]
        expected_addr = 'username@' + fields[1]

        suggestion = validate.suggest_alternate(typo_addr)
        if suggestion != expected_addr:
            print('did not match: {0}, {1}'.format(typo_addr, suggestion))
        else:
            hits += 1
        attempts += 1

    # require better than 90% accuracy
    accuracy = float(hits) / attempts
    report = 'external valid: accuracy: {0}, correct: {1}, total: {2}'
    print(report.format(accuracy, hits, attempts))
    ok_(accuracy > 0.90)
def test_domain_typo_invalid_set():
    """Check that DOMAIN_TYPO_INVALID_TESTS domains get no suggestion.

    Every non-blank, non-comment fixture line is used as a domain. A case
    counts as correct when ``validate.suggest_alternate`` returns ``None``
    for it. The test passes when more than 90% of the cases are correct.
    """
    sugg_correct = 0
    sugg_total = 0
    print('')

    for line in DOMAIN_TYPO_INVALID_TESTS.split('\n'):
        line = line.strip()

        # skip over empty lines and comment lines
        if not line or COMMENT.match(line):
            continue

        test_str = 'username@' + line
        sugg_str = validate.suggest_alternate(test_str)

        # None is a singleton: compare by identity, not equality (PEP 8)
        if sugg_str is None:
            sugg_correct += 1
        else:
            print('incorrect correction: {0}, {1}'.format(test_str, sugg_str))
        sugg_total += 1

    # ensure that we have greater than 90% accuracy
    accuracy = float(sugg_correct) / sugg_total
    print('external invalid: accuracy: {0}, correct: {1}, total: {2}'
          .format(accuracy, sugg_correct, sugg_total))
    ok_(accuracy > 0.90)
# For the remaining tests, the accuracy is significantly lower than
# the above because the corrector is tuned to real typos that occur,
# while what we have below are random mutations. Also, because
# these tests are non-deterministic, it's better to have a
# lower threshold to ensure that tests don't fail during deployment
# due to an outlier. Realistic numbers for all these tests should easily
# be above the 80% accuracy range.
def test_suggest_alternate_mutations_valid():
    """Check recovery of common domains after random character substitution.

    For 1 and then 2 substituted characters, 100 domains are drawn at random
    from ``corrector.MOST_COMMON_DOMAINS`` and mutated with
    ``generate_mutated_string``. The test passes when more than 60% of the
    suggestions equal the original address.
    """
    hits = 0
    attempts = 0
    print('')

    for typo_count in range(1, 3):
        for _ in range(100):
            picked = random.choice(corrector.MOST_COMMON_DOMAINS)
            expected_addr = 'username@' + picked
            mutated_addr = 'username@' + generate_mutated_string(
                picked, typo_count)

            if validate.suggest_alternate(mutated_addr) == expected_addr:
                hits += 1
            attempts += 1

    # require better than 60% accuracy
    accuracy = float(hits) / attempts
    report = 'mutations valid: accuracy: {0}, correct: {1}, total: {2}'
    print(report.format(accuracy, hits, attempts))
    ok_(accuracy > 0.60)
def test_suggest_alternate_longer_valid():
    """Check recovery of common domains after random characters are prepended.

    For 1 and then 2 extra characters, 100 domains are drawn at random from
    ``corrector.MOST_COMMON_DOMAINS`` and lengthened with
    ``generate_longer_string``. The test passes when more than 60% of the
    suggestions equal the original address.
    """
    hits = 0
    attempts = 0
    print('')

    for extra_chars in range(1, 3):
        for _ in range(100):
            picked = random.choice(corrector.MOST_COMMON_DOMAINS)
            expected_addr = 'username@' + picked
            longer_addr = 'username@' + generate_longer_string(
                picked, extra_chars)

            if validate.suggest_alternate(longer_addr) == expected_addr:
                hits += 1
            attempts += 1

    # require better than 60% accuracy
    accuracy = float(hits) / attempts
    report = 'longer valid: accuracy: {0}, correct: {1}, total: {2}'
    print(report.format(accuracy, hits, attempts))
    ok_(accuracy > 0.60)
def test_suggest_alternate_shorter_valid():
    """Check recovery of common domains after trailing characters are cut.

    For 1 and then 2 removed characters, 100 domains are drawn at random
    from ``corrector.MOST_COMMON_DOMAINS`` and shortened with
    ``generate_shorter_string``. The test passes when more than 60% of the
    suggestions equal the original address.
    """
    hits = 0
    attempts = 0
    print('')

    for dropped_chars in range(1, 3):
        for _ in range(100):
            picked = random.choice(corrector.MOST_COMMON_DOMAINS)
            expected_addr = 'username@' + picked
            shorter_addr = 'username@' + generate_shorter_string(
                picked, dropped_chars)

            if validate.suggest_alternate(shorter_addr) == expected_addr:
                hits += 1
            attempts += 1

    # require better than 60% accuracy
    accuracy = float(hits) / attempts
    report = 'shorter valid: accuracy: {0}, correct: {1}, total: {2}'
    print(report.format(accuracy, hits, attempts))
    ok_(accuracy > 0.60)
def test_suggest_alternate_invalid():
    """Check that randomly generated domains get no suggestion.

    For each label size from 3 through 9, 100 domains are produced by
    ``domain_generator``. A case counts as correct when
    ``validate.suggest_alternate`` returns ``None`` for it. The test passes
    when more than 60% of the cases are correct.
    """
    sugg_correct = 0
    sugg_total = 0
    print('')

    for size in range(3, 10):
        for _ in range(100):
            domain = domain_generator(size)
            orig_str = 'username@' + domain
            sugg_str = validate.suggest_alternate(orig_str)

            # None is a singleton: compare by identity, not equality (PEP 8)
            if sugg_str is None:
                sugg_correct += 1
            else:
                print('did not match: {0}, {1}'.format(orig_str, sugg_str))
            sugg_total += 1

    # ensure that we have greater than 60% accuracy
    accuracy = float(sugg_correct) / sugg_total
    print('alternative invalid: accuracy: {0}, correct: {1}, total: {2}'
          .format(accuracy, sugg_correct, sugg_total))
    ok_(accuracy > 0.60)
|