#!/usr/bin/python3
"""
This application converts the various text files stored in the source-data
directory into a pickled python object to be used by the random data
generator scripts
Copyright (C) 2007 Chris Moffitt
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
import csv
import string
import pickle
import random
import os
data_dir = os.path.join(os.path.dirname(__file__), "source-data")
simple_files_to_process = ['street-names.txt', 'street-types.txt', 'latin-words.txt',
                           'email-domains.txt', 'job-titles.txt', 'company-names.txt',
                           'company-types.txt',
                           'equipment-manufacturers.txt', 'equipment-models.txt']
def load_files():
    # Process zip codes
    all_zips = {}
    reader = csv.reader(open(os.path.join(data_dir, "zip-codes.txt")))
    for row in reader:
        data = [string.capwords(row[3]), row[4]]
        all_zips[row[0]] = data
    output = open('source-data.pkl', 'wb')
    pickle.dump(all_zips, output)

    # Process area codes: each line starts with a two-letter state abbreviation
    # and a colon, followed by a comma-separated list of area codes. Strip the
    # spaces and newline, drop the "XX:" prefix and split the rest on commas.
    area_code_file = open(os.path.join(data_dir, "area-codes.txt"))
    state_area_codes = {}
    for line in area_code_file:
        clean_line = line.replace(' ', '').rstrip('\n')
        state_area_codes[line.split(':')[0]] = clean_line[3:].split(',')
    pickle.dump(state_area_codes, output)
    area_code_file.close()

    # Process last names: keep only the first whitespace-separated field
    # of each line (the name itself) and normalize its capitalization.
    last_names = []
    last_name_file = open(os.path.join(data_dir, "last-name.txt"))
    for line in last_name_file:
        clean_line = line.rstrip('\n')
        last_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(last_names, output)
    last_name_file.close()

    # Process male first names
    male_first_names = []
    male_first_name_file = open(os.path.join(data_dir, "male-first-name.txt"))
    for line in male_first_name_file:
        clean_line = line.rstrip('\n')
        male_first_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(male_first_names, output)
    male_first_name_file.close()

    # Process female first names
    female_first_names = []
    female_first_name_file = open(os.path.join(data_dir, "female-first-name.txt"))
    for line in female_first_name_file:
        clean_line = line.rstrip('\n')
        female_first_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(female_first_names, output)
    female_first_name_file.close()

    # Process the simple one-item-per-line files; each file becomes its own list
    for f in simple_files_to_process:
        temp = []
        sample_file = open(os.path.join(data_dir, f))
        for line in sample_file:
            clean_line = line.rstrip('\n')
            temp.append(clean_line)
        pickle.dump(temp, output)
        sample_file.close()
        temp = []

    output.close()
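
# The downstream generator scripts are not included in this file; the helper
# below is only an illustrative sketch of how source-data.pkl could be read
# back. Its name and return shape are assumptions, but the pickle.load() calls
# must happen in the same order the objects were dumped by load_files() above.
def read_source_data(path='source-data.pkl'):
    with open(path, 'rb') as pkl:
        all_zips = pickle.load(pkl)
        state_area_codes = pickle.load(pkl)
        last_names = pickle.load(pkl)
        male_first_names = pickle.load(pkl)
        female_first_names = pickle.load(pkl)
        # One list was dumped per simple file, in list order
        simple_lists = {name: pickle.load(pkl) for name in simple_files_to_process}
    return (all_zips, state_area_codes, last_names, male_first_names,
            female_first_names, simple_lists)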
if __name__ == "__main__":
    response = input("Type 'yes' to reload the data from source files and create a new source file: ").lower()
    if response == 'yes':
        load_files()