1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
|
"""California housing dataset.
The original database is available from StatLib
http://lib.stat.cmu.edu/datasets/
The data contains 20,640 observations on 9 variables.
This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupancy, latitude, and longitude, in that order.
References
----------
Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.
"""
# Authors: Peter Prettenhofer
# License: BSD 3 clause
from os.path import dirname, exists, join
from os import makedirs, remove
import tarfile
import numpy as np
import logging
from .base import get_data_home
from .base import _fetch_remote
from .base import _pkl_filepath
from .base import RemoteFileMetadata
from ..utils import Bunch
from ..utils import _joblib
# Upstream location of the original data:
# http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz
#
# Metadata for the archive that is actually downloaded (the figshare URL
# below). The checksum is a 64-hex-digit digest, split over two literals
# only to keep the lines short (presumably SHA-256 -- confirm against
# ``_fetch_remote``).
ARCHIVE = RemoteFileMetadata(
    filename='cal_housing.tgz',
    url='https://ndownloader.figshare.com/files/5976036',
    checksum=('aaa5c9a6afe2225cc2aed2723682ae40'
              '3280c4a3695a2ddda4ffb5d8215ea681'),
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def fetch_california_housing(data_home=None, download_if_missing=True,
                             return_X_y=False):
    """Load the California housing dataset (regression).

    ==============   ==============
    Samples total             20640
    Dimensionality                8
    Features                   real
    Target           real 0.15 - 5.
    ==============   ==============

    Read more in the :ref:`User Guide <california_housing_dataset>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False.
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : ndarray, shape [20640, 8]
        Each row corresponding to the 8 feature values in order.

    dataset.target : numpy array of shape (20640,)
        Each value corresponds to the average house value in units of 100,000.

    dataset.feature_names : array of length 8
        Array of ordered feature names used in the dataset.

    dataset.DESCR : string
        Description of the California housing dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20

    Raises
    ------
    IOError
        If the cached data file does not exist and ``download_if_missing``
        is False.

    Notes
    ------

    This dataset consists of 20,640 samples and 8 features (plus the
    target value).
    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)

    filepath = _pkl_filepath(data_home, 'cal_housing.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        # Lazy logger arguments: the message is only formatted when the
        # INFO level is actually enabled.
        logger.info('Downloading Cal. housing from %s to %s',
                    ARCHIVE.url, data_home)

        archive_path = _fetch_remote(ARCHIVE, dirname=data_home)

        # The archive only needs to stay open while the text file is parsed.
        with tarfile.open(mode="r:gz", name=archive_path) as f:
            cal_housing = np.loadtxt(
                f.extractfile('CaliforniaHousing/cal_housing.data'),
                delimiter=',')

        # Columns are not in the same order compared to the previous
        # URL resource on lib.stat.cmu.edu. After this reordering,
        # column 0 holds the target and columns 1-8 hold the inputs.
        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
        cal_housing = cal_housing[:, columns_index]

        # Cache the reordered array, then drop the downloaded archive.
        _joblib.dump(cal_housing, filepath, compress=6)
        remove(archive_path)
    else:
        cal_housing = _joblib.load(filepath)

    feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms",
                     "Population", "AveOccup", "Latitude", "Longitude"]

    # ``data`` is a view on ``cal_housing``; the in-place updates below only
    # touch the array obtained during this call, never the cached file.
    target, data = cal_housing[:, 0], cal_housing[:, 1:]

    # At this point data[:, 5] still holds the households count. It must be
    # overwritten last because the two divisions before it depend on it.

    # avg rooms = total rooms / households
    data[:, 2] /= data[:, 5]

    # avg bed rooms = total bed rooms / households
    data[:, 3] /= data[:, 5]

    # avg occupancy = population / households
    data[:, 5] = data[:, 4] / data[:, 5]

    # target in units of 100,000
    target = target / 100000.0

    if return_X_y:
        return data, target

    # The description is only needed for the Bunch, so it is read after the
    # ``return_X_y`` short-circuit to avoid a pointless file read.
    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
        descr = dfile.read()

    return Bunch(data=data,
                 target=target,
                 feature_names=feature_names,
                 DESCR=descr)
|