#!/usr/bin/python3
"""
This application converts the various text files stored in the source-data
directory into a pickled python object to be used by the random data
generator scripts
Copyright (C) 2007 Chris Moffitt
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
import csv
import string
import pickle
import random
import os
data_dir = os.path.join(os.path.dirname(__file__), "source-data")
simple_files_to_process = ['street-names.txt', 'street-types.txt', 'latin-words.txt',
                           'email-domains.txt', 'job-titles.txt', 'company-names.txt',
                           'company-types.txt',
                           'equipment-manufacturers.txt', 'equipment-models.txt']
def load_files():
    # Process zip codes
    all_zips = {}
    reader = csv.reader(open(os.path.join(data_dir, "zip-codes.txt")))
    for row in reader:
        data = [string.capwords(row[3]), row[4]]
        all_zips[row[0]] = data
    output = open('source-data.pkl', 'wb')
    pickle.dump(all_zips, output)

    # Process area codes: each line starts with a two-letter state abbreviation
    # and a colon, followed by a comma-separated list of area codes. Strip the
    # spaces and newline, drop the "XX:" prefix and split the rest on commas.
    area_code_file = open(os.path.join(data_dir, "area-codes.txt"))
    state_area_codes = {}
    for line in area_code_file:
        clean_line = line.replace(' ', '').rstrip('\n')
        state_area_codes[line.split(':')[0]] = clean_line[3:].split(',')
    pickle.dump(state_area_codes, output)
    area_code_file.close()

    # Process last names: keep only the first whitespace-separated field
    # of each line (the name itself) and normalize its capitalization.
    last_names = []
    last_name_file = open(os.path.join(data_dir, "last-name.txt"))
    for line in last_name_file:
        clean_line = line.rstrip('\n')
        last_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(last_names, output)
    last_name_file.close()

    # Process male first names
    male_first_names = []
    male_first_name_file = open(os.path.join(data_dir, "male-first-name.txt"))
    for line in male_first_name_file:
        clean_line = line.rstrip('\n')
        male_first_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(male_first_names, output)
    male_first_name_file.close()

    # Process female first names
    female_first_names = []
    female_first_name_file = open(os.path.join(data_dir, "female-first-name.txt"))
    for line in female_first_name_file:
        clean_line = line.rstrip('\n')
        female_first_names.append(string.capwords(clean_line.split(' ')[0]))
    pickle.dump(female_first_names, output)
    female_first_name_file.close()

    # Process the simple one-item-per-line files; each file becomes its own list
    for f in simple_files_to_process:
        temp = []
        sample_file = open(os.path.join(data_dir, f))
        for line in sample_file:
            clean_line = line.rstrip('\n')
            temp.append(clean_line)
        pickle.dump(temp, output)
        sample_file.close()
        temp = []

    output.close()
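
# The downstream generator scripts are not included in this file; the helper
# below is only an illustrative sketch of how source-data.pkl could be read
# back. Its name and return shape are assumptions, but the pickle.load() calls
# must happen in the same order the objects were dumped by load_files() above.
def read_source_data(path='source-data.pkl'):
    with open(path, 'rb') as pkl:
        all_zips = pickle.load(pkl)
        state_area_codes = pickle.load(pkl)
        last_names = pickle.load(pkl)
        male_first_names = pickle.load(pkl)
        female_first_names = pickle.load(pkl)
        # One list was dumped per simple file, in list order
        simple_lists = {name: pickle.load(pkl) for name in simple_files_to_process}
    return (all_zips, state_area_codes, last_names, male_first_names,
            female_first_names, simple_lists)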
if __name__ == "__main__":
    response = input("Type 'yes' to reload the data from source files and create a new source file: ").lower()
    if response == 'yes':
        load_files()