File: core.py

package info (click to toggle)
python-ulmo 0.8.8%2Bdfsg1-1.1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,064 kB
  • sloc: python: 6,135; makefile: 144; sh: 5
file content (381 lines) | stat: -rw-r--r-- 13,512 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
    ulmo.ncdc.cirs.core
    ~~~~~~~~~~~~~~~~~~~

    This module provides direct access to the `National Climatic Data Center`_
    `Climate Index Reference Sequential (CIRS)`_ drought dataset.

    .. _National Climatic Data Center: http://www.ncdc.noaa.gov
    .. _Climate Index Reference Sequential (CIRS): http://www1.ncdc.noaa.gov/pub/data/cirs/
"""
from builtins import str
from builtins import range
from past.builtins import basestring
import distutils.version
import os.path

import pandas

from ulmo import util


# Local cache directory where downloaded CIRS files are stored.
CIRS_DIR = util.get_ulmo_dir('ncdc/cirs')

# Sentinel strings that NCDC uses to mark missing values in the fixed-width
# files, keyed by element code.  The exact textual form (decimal places,
# trailing period) matters because values are matched as strings before
# numeric conversion.
NO_DATA_VALUES = {
    'cddc': '-9999.',
    'hddc': '-9999.',
    'pcpn': '-9.99',
    'pdsi': '-99.99',
    'phdi': '-99.99',
    'pmdi': '-99.99',
    'sp01': '-99.99',
    'sp02': '-99.99',
    'sp03': '-99.99',
    'sp06': '-99.99',
    'sp09': '-99.99',
    'sp12': '-99.99',
    'sp24': '-99.99',
    'tmpc': '-99.90',
    'zndx': '-99.99',

}


def get_data(elements=None, by_state=False, location_names='abbr', as_dataframe=False, use_file=None):
    """Retrieves data.

    Parameters
    ----------
    elements : ``None``, str or list
        The element(s) for which to get data for. If ``None`` (default), then
        all elements are used. An individual element is a string, but a list or
        tuple of them can be used to specify a set of elements.  Elements are:
            * 'cddc': Cooling Degree Days
            * 'hddc': Heating Degree Days
            * 'pcpn': Precipitation
            * 'pdsi': Palmer Drought Severity Index
            * 'phdi': Palmer Hydrological Drought Index
            * 'pmdi': Modified Palmer Drought Severity Index
            * 'sp01': 1-month Standardized Precipitation Index
            * 'sp02': 2-month Standardized Precipitation Index
            * 'sp03': 3-month Standardized Precipitation Index
            * 'sp06': 6-month Standardized Precipitation Index
            * 'sp09': 9-month Standardized Precipitation Index
            * 'sp12': 12-month Standardized Precipitation Index
            * 'sp24': 24-month Standardized Precipitation Index
            * 'tmpc': Temperature
            * 'zndx': ZNDX
    by_state : bool
        If False (default), divisional data will be retrieved. If True, then
        state-level data will be retrieved.
    location_names : str or ``None``
        This parameter defines what (if any) type of names will be added to the
        values. If set to 'abbr' (default), then abbreviated location names
        will be used. If 'full', then full location names will be used. If set
        to None, then no location name will be added and the only identifier
        will be the location_codes (this is the most memory-conservative
        option).
    as_dataframe : bool
        If ``False`` (default), a list of values dicts is returned. If ``True``,
        a dict with element codes mapped to equivalent pandas.DataFrame objects
        will be returned. The pandas dataframe is used internally, so setting
        this to ``True`` is faster as it skips a somewhat expensive
        serialization step.
    use_file : ``None``, file-like object or str
        If ``None`` (default), then data will be automatically retrieved from
        the web. If a file-like object or a file path string, then the file will
        be used to read data from. This is intended to be used for reading in
        previously-downloaded versions of the dataset.

    Returns
    -------
    data : list or pandas.DataFrame
        A list of value dicts or a pandas.DataFrame containing data. See
        the ``as_dataframe`` parameter for more.

    Raises
    ------
    ValueError
        If ``elements`` is an empty sequence.
    """
    # str is sufficient here: on py3, past.builtins.basestring is just str.
    if isinstance(elements, str):
        elements = [elements]
    elif elements is None:
        # NO_DATA_VALUES carries exactly one entry per known element, so its
        # keys are the canonical element list (sorted for a stable order).
        elements = sorted(NO_DATA_VALUES)

    if not elements:
        # Previously this fell through and crashed with an opaque
        # AttributeError on None.reset_index(); fail early instead.
        raise ValueError("'elements' must contain at least one element code")

    df = None

    for element in elements:
        element_file = _get_element_file(use_file, element, elements, by_state)

        element_df = _get_element_data(element, by_state, element_file, location_names)

        # Index on the identifying columns so successive elements join as
        # additional value columns rather than duplicated rows.
        keys = ['location_code', 'year', 'month']
        for append_key in ['division', 'state', 'state_code']:
            if append_key in element_df.columns:
                keys.append(append_key)
        element_df.set_index(keys, inplace=True)

        if df is None:
            df = element_df
        else:
            df = df.join(element_df, how='outer')

    df = df.reset_index()
    df = _resolve_location_names(df, location_names, by_state)

    if as_dataframe:
        return df
    # 'records' orientation yields one dict per row directly, avoiding the
    # transpose in the old list(df.T.to_dict().values()) idiom.
    return df.to_dict(orient='records')


def _get_element_data(element, by_state, use_file, location_names):
    """Fetch and parse one element's data file into a dataframe.

    When *use_file* is provided the data is read from it directly;
    otherwise the most recent matching file is downloaded from the NCDC
    FTP site and cached under ``CIRS_DIR``.
    """
    if use_file:
        url, path = None, None
    else:
        url = _get_url(element, by_state)
        path = os.path.join(CIRS_DIR, url.rsplit('/', 1)[-1])

    with util.open_file_for_url(url, path, use_file=use_file) as fh:
        return _parse_values(fh, by_state, location_names, element)


def _get_element_file(use_file, element, elements, by_state):
    if isinstance(use_file, basestring):
        if os.path.basename(use_file) == '':
            if len(elements) > 1:
                assert ValueError(
                    "'use_file' must be a path to a directory if using "
                    "'use_file' with multiple elements")

            return use_file + _get_filename(element, by_state, os.path.dirname(use_file))

    return use_file


def _get_filename(element, by_state, dir_path):
    """Return the name of the best matching data file found in *dir_path*."""
    candidates = os.listdir(dir_path)
    return _most_recent(candidates, element, by_state)


def _get_url(element, by_state):
    """Build the FTP URL of the matching data file for *element*."""
    base_url = "ftp://ftp.ncdc.noaa.gov/pub/data/cirs/climdiv/"
    listing = util.dir_list(base_url)
    return base_url + _most_recent(listing, element, by_state)


def _most_recent(files, element, by_state):
    geographic_extent = 'st' if by_state else 'dv'
    match_str = 'climdiv-{element}{geographic_extent}'.format(
        element=element,
        geographic_extent=geographic_extent,
    )
    matches = [s for s in files if s.startswith(match_str)]
    return sorted(matches, key=_file_key)[0]


def _file_key(filename):
    version_str = filename.split('-')[2][1:]
    return distutils.version.StrictVersion(version_str)


def _parse_values(file_handle, by_state, location_names, element):
    """Parse a fixed-width CIRS file into a long-format dataframe.

    Each input row carries one location/year plus twelve monthly value
    fields; the result is melted so each output row is one
    (location, year, month) observation with the value column named after
    *element*.  Rows matching the element's no-data sentinel are dropped.
    (*location_names* is accepted for call-site symmetry but unused here.)
    """
    # Identifying fields differ between the two file layouts; the element
    # code occupies columns 4-6 in both but duplicates the filename, so it
    # is not parsed.  State files have a 3-digit location code and no
    # division field.
    if by_state:
        id_columns = [
            ('location_code', 0, 3, None),
            ('year', 6, 10, None),
        ]
    else:
        id_columns = [
            ('location_code', 0, 2, None),
            ('division', 2, 4, None),
            ('year', 6, 10, None),
        ]

    year_end = id_columns[-1][2]

    # Twelve 7-character monthly fields follow the year; name them by month
    # number (as strings) so melt() can turn them into a 'month' column.
    month_columns = [
        (str(month), year_end + 7 * month - 6, year_end + 7 * month, None)
        for month in range(1, 13)
    ]

    parsed = util.parse_fwf(
        file_handle,
        id_columns + month_columns,
        na_values=[NO_DATA_VALUES.get(element)],
    )

    id_vars = [name for name, _, _, _ in id_columns]
    melted = pandas.melt(parsed, id_vars=id_vars)\
        .rename(columns={'variable': 'month'})

    melted.month = melted.month.astype(int)

    # Drop rows whose value was a no-data sentinel (now NaN).
    melted = melted[melted['value'].notnull()]

    return melted.rename(columns={'value': element})


def _resolve_location_names(df, location_names, by_state):
    if location_names is None:
        return df
    elif location_names not in ('abbr', 'full'):
        raise ValueError("location_names should be set to either None, 'abbr' or 'full'")
    else:
        locations = _states_regions_dataframe()[location_names]
        with_locations = df.join(locations, on='location_code')

        if by_state:
            return with_locations.rename(columns={
                location_names: 'location',
            })
        else:
            return with_locations.rename(columns={
                location_names: 'state',
                'location_code': 'state_code',
            })


def _states_regions_dataframe():
    """returns a dataframe indexed by state/region code with columns for the
    name and abbreviation (abbr) to use
    """
    STATES_REGIONS = {
        # code: (full name, abbreviation)
        1: ("Alabama", "AL"),
        2: ("Arizona", "AZ"),
        3: ("Arkansas", "AR"),
        4: ("California", "CA"),
        5: ("Colorado", "CO"),
        6: ("Connecticut", "CT"),
        7: ("Delaware", "DE"),
        8: ("Florida", "FL"),
        9: ("Georgia", "GA"),
        10: ("Idaho", "ID"),
        11: ("Illinois", "IL"),
        12: ("Indiana", "IN"),
        13: ("Iowa", "IA"),
        14: ("Kansas", "KS"),
        15: ("Kentucky", "KY"),
        16: ("Louisiana", "LA"),
        17: ("Maine", "ME"),
        18: ("Maryland", "MD"),
        19: ("Massachusetts", "MA"),
        20: ("Michigan", "MI"),
        21: ("Minnesota", "MN"),
        22: ("Mississippi", "MS"),
        23: ("Missouri", "MO"),
        24: ("Montana", "MT"),
        25: ("Nebraska", "NE"),
        26: ("Nevada", "NV"),
        27: ("New Hampshire", "NH"),
        28: ("New Jersey", "NJ"),
        29: ("New Mexico", "NM"),
        30: ("New York", "NY"),
        31: ("North Carolina", "NC"),
        32: ("North Dakota", "ND"),
        33: ("Ohio", "OH"),
        34: ("Oklahoma", "OK"),
        35: ("Oregon", "OR"),
        36: ("Pennsylvania", "PA"),
        37: ("Rhode Island", "RI"),
        38: ("South Carolina", "SC"),
        39: ("South Dakota", "SD"),
        40: ("Tennessee", "TN"),
        41: ("Texas", "TX"),
        42: ("Utah", "UT"),
        43: ("Vermont", "VT"),
        44: ("Virginia", "VA"),
        45: ("Washington", "WA"),
        46: ("West Virginia", "WV"),
        47: ("Wisconsin", "WI"),
        48: ("Wyoming", "WY"),
        101: ("Northeast Region", "ner"),
        102: ("East North Central Region", "encr"),
        103: ("Central Region", "cr"),
        104: ("Southeast Region", "ser"),
        105: ("West North Central Region", "wncr"),
        106: ("South Region", "sr"),
        107: ("Southwest Region", "swr"),
        108: ("Northwest Region", "nwr"),
        109: ("West Region", "wr"),
        110: ("National (contiguous 48 States)", "national"),

        # The following are the range of code values for the National Weather Service Regions, river basins, and agricultural regions.
        111: ("NWS: Great Plains", "nws:gp"),
        115: ("NWS: Southern Plains and Gulf Coast", "nws:spgc"),
        120: ("NWS: US Rockies and Westward", "nws:usrw"),
        121: ("NWS: Eastern Region", "nws:er"),
        122: ("NWS: Southern Region", "nws:sr"),
        123: ("NWS: Central Region", "nws:cr"),
        124: ("NWS: Western Region", "nws:wr"),
        201: ("NWS: Pacific Northwest Basin", "nws:pnwb"),
        202: ("NWS: California River Basin", "nws:crb"),
        203: ("NWS: Great Basin", "nws:gb"),
        204: ("NWS: Lower Colorado River Basin", "nws:lcrb"),
        205: ("NWS: Upper Colorado River Basin", "nws:urcb"),
        206: ("NWS: Rio Grande River Basin", "nws:rgrb"),
        207: ("NWS: Texas Gulf Coast River Basin", "nws:tgcrb"),
        208: ("NWS: Arkansas-White-Red Basin", "nws:awrb"),
        209: ("NWS: Lower Mississippi River Basin", "nws:lmrb"),
        210: ("NWS: Missouri River Basin", "nws:mrb"),
        211: ("NWS: Souris-Red-Rainy Basin", "nws:srrb"),
        212: ("NWS: Upper Mississippi River Basin", "nws:umrb"),
        213: ("NWS: Great Lakes Basin", "nws:glb"),
        214: ("NWS: Tennessee River Basin", "nws:trb"),
        215: ("NWS: Ohio River Basin", "nws:ohrb"),
        216: ("NWS: South Atlantic-Gulf Basin", "nws:sagb"),
        217: ("NWS: Mid-Atlantic Basin", "nws:mab"),
        218: ("NWS: New England Basin", "nws:neb"),
        220: ("NWS: Mississippi River Basin & Tributaties (N. of Memphis, TN",
              "nws:mrbt"),

        # below( codes are weighted by area)
        250: ("Area: Spring Wheat Belt", "area:swb"),
        255: ("Area: Primary Hard Red Winter Wheat Belt", "area:phrwwb"),
        256: ("Area: Winter Wheat Belt", "area:wwb"),
        260: ("Area: Primary Corn and Soybean Belt", "area:pcsb"),
        261: ("Area: Corn Belt", "area:cb"),
        262: ("Area: Soybean Belt", "area:sb"),
        265: ("Area: Cotton Belt", "area:cb"),

        # below( codes are weighted by productivity)
        350: ("Prod: Spring Wheat Belt", "prod:swb"),
        356: ("Prod: Winter Wheat Belt", "prod:wwb"),
        361: ("Prod: Corn Belt", "prod:cb"),
        362: ("Prod: Soybean Belt", "prod:sb"),
        365: ("Prod: Cotton Belt", "prod:cb"),

        # below( codes are for percent productivity in the Palmer Z Index categories)
        450: ("% Prod: Spring Wheat Belt", "%prod:swb"),
        456: ("% Prod: Winter Wheat Belt", "%prod:wwb"),
        461: ("% Prod: Corn Belt", "%prod:cb"),
        462: ("% Prod: Soybean Belt", "%prod:sb"),
        465: ("% Prod: Cotton Belt", "%prod:cb"),
    }
    return pandas.DataFrame(STATES_REGIONS).T.rename(columns={0: 'full', 1: 'abbr'})