File: design_climate.py

package info (click to toggle)
python-fluids 1.0.27-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 13,384 kB
  • sloc: python: 59,459; f90: 1,033; javascript: 49; makefile: 47
file content (919 lines) | stat: -rw-r--r-- 36,260 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
"""Chemical Engineering Design Library (ChEDL). Utilities for process modeling.
Copyright (C) 2016, 2017 Caleb Bell <Caleb.Andrew.Bell@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


__all__ = ['get_clean_isd_history', 'IntegratedSurfaceDatabaseStation',
           'get_closest_station', 'get_station_year_text', 'gsod_day_parser',
           'StationDataGSOD', 'heating_degree_days', 'cooling_degree_days', 'stations',
#           'geopy_geolocator', 'geopy_cache', 'SimpleGeolocatorCache',
           'geocode']

import datetime
import gzip
import os
from calendar import isleap
from collections import namedtuple
from io import BytesIO as StringIO
from urllib.request import urlopen

import numpy as np
from scipy.spatial import cKDTree

from fluids.constants import inch, knot, mile

try:  # pragma: no cover
    from appdirs import user_config_dir
    data_dir = user_config_dir('fluids')
except ImportError:  # pragma: no cover
    data_dir = ''

try:  # pragma: no cover
    # No point loading cPickle or sqlite for this reason
    import sqlite3

    import geopy
    from geopy.location import Location
except ImportError:  # pragma: no cover
    geopy = None
    Location = None

# Geopy cache/lookup layer, also requires appdirs for caching, can work without
geolocator = None
geolocator_user_agent = 'fluids'
geolocator_disk_cache_name = 'simple_geolocator_cache.sqlite3'
geolocator_disk_cache_loc = os.path.join(data_dir, geolocator_disk_cache_name)

simple_geopy_cache = None

geopy_missing_msg = """Geocoder module `geopy` is required for this
functionality."""

def geopy_geolocator():
    """Lazy loader for geocoder from geopy.

    This currently loads the `Nominatim` geocode and returns an instance of it,
    taking ~2 us.
    """
    global geolocator
    if geolocator is None:
        try:
            from geopy.geocoders import Nominatim
        except ImportError:
            return None
        geolocator = Nominatim(user_agent=geolocator_user_agent)
        return geolocator
    return geolocator


def geopy_cache():
    """Lazy loader for the singleton `SimpleGeolocatorCache`.

    This creates a sqlite database if one does not exist and initializes a
    connection to it.
    """
    global simple_geopy_cache
    if simple_geopy_cache is None:
        simple_geopy_cache = SimpleGeolocatorCache(geolocator_disk_cache_loc)
        return simple_geopy_cache
    return simple_geopy_cache


class SimpleGeolocatorCache:
    """Very basic on-disk address -> (lat, lon) cache, using Python's sqlite
    database for on-disk persistence.

    Offers very reasonable performance compared to online lookups.
    """

    def __init__(self, file_name):
        self.connection = conn = sqlite3.connect(file_name)
        cursor = self.connection.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS geopy ( '
                       'address STRING PRIMARY KEY, latitude real, longitude real )')
        self.connection.commit()

    def cached_address(self, address):
        cursor = self.connection.cursor()
        cursor.execute('SELECT latitude, longitude FROM geopy WHERE address=?', (address, ))
        res = cursor.fetchone()
        if res is None:
            return None
        return res

    def cache_address(self, address, latitude, longitude):
        cursor = self.connection.cursor()
        cursor.execute('INSERT INTO geopy(address, latitude, longitude) VALUES(?, ?, ?)',
                       (address, latitude, longitude))
        self.connection.commit()


def geocode(address):
    """Query function to obtain a latitude and longitude from a location string
    such as `Houston, TX` or`Colombia`. This uses an online lookup, currently
    wrapping the `geopy` library, and providing an on-disk cache of queries.

    Parameters
    ----------
    address : str
        Search string to retrieve the location, [-]

    Returns
    -------
    latitude : float
        Latitude of address, [degrees]
    longitude : float
        Longitude of address, [degrees]

    Notes
    -----
    If a query has been retrieved before, this function will take under 1 ms;
    it takes several seconds otherwise.

    Examples
    --------
    >>> geocode('Fredericton, NB') # doctest: +SKIP
    (45.966425, -66.645813)
    """
    loc_tuple = None
    try:
        cache = geopy_cache()
        loc_tuple = cache.cached_address(address)
    except:
        # Handle bugs in the cache, i.e. if there is no space on disk to create
        # the database, by ignoring them
        pass
    if loc_tuple is not None:
        return loc_tuple
    else:
        geocoder = geopy_geolocator()
        if geocoder is None:
            return geopy_missing_msg
        location = geocoder.geocode(address)
        try:
            cache.cache_address(address, location.latitude, location.longitude)
        except:
            pass
        return (location.latitude, location.longitude)



folder = os.path.join(os.path.dirname(__file__), 'data')


def heating_degree_days(T, T_base=291.4833333333333, truncate=True):
    r'''Calculates the heating degree days for a period of time.

    .. math::
        \text{heating degree days} = max(T - T_{base}, 0)

    Parameters
    ----------
    T : float
        Measured temperature; sometimes an average over a length of time is used,
        other times the average of the lowest and highest temperature in a
        period are used, [K]
    T_base : float, optional
        Reference temperature for the degree day calculation, defaults
        to 65 °F (18.33 °C, 291.483 K), the value most used in the US, [K]
    truncate : bool
        If truncate is True, no negative values will be returned; if negative,
        the value is truncated to 0, [-]

    Returns
    -------
    heating_degree_days : float
        Degree above the base temperature multiplied by the length of time of
        the measurement, normally days [day*K]

    Notes
    -----
    Some common base temperatures are 18 °C (Canada), 15.5 °C (EU),
    17 °C (Denmark, Finland), 12 °C Switzerland. The base temperature
    should always be presented with the results.

    The time unit does not have to be days; it can be any time unit, and the
    calculation behaves the same.

    Examples
    --------
    >>> heating_degree_days(303.8)
    12.31666666666672

    >>> heating_degree_days(273)
    0.0

    >>> heating_degree_days(322, T_base=300)
    22

    References
    ----------
    .. [1] "Heating Degree Day." Wikipedia, January 24, 2018.
       https://en.wikipedia.org/w/index.php?title=Heating_degree_day&oldid=822187764.
    '''
    dd = T - T_base
    if truncate and dd < 0.0:
        dd = 0.0
    return dd


def cooling_degree_days(T, T_base=283.15, truncate=True):
    r'''Calculates the cooling degree days for a period of time.

    .. math::
        \text{cooling degree days} = max(T_{base} - T, 0)

    Parameters
    ----------
    T : float
        Measured temperature; sometimes an average over a length of time is used,
        other times the average of the lowest and highest temperature in a
        period are used, [K]
    T_base : float, optional
        Reference temperature for the degree day calculation, defaults
        to 10 °C, 283.15 K, a common value, [K]
    truncate : bool
        If truncate is True, no negative values will be returned; if negative,
        the value is truncated to 0, [-]

    Returns
    -------
    cooling_degree_days : float
        Degree below the base temperature multiplied by the length of time of
        the measurement, normally days [day*K]

    Notes
    -----
    The base temperature should always be presented with the results.

    The time unit does not have to be days; it can be time unit, and the
    calculation behaves the same.

    Examples
    --------
    >>> cooling_degree_days(250)
    33.14999999999998

    >>> cooling_degree_days(300)
    0.0

    >>> cooling_degree_days(250, T_base=300)
    50

    References
    ----------
    .. [1] "Heating Degree Day." Wikipedia, January 24, 2018.
       https://en.wikipedia.org/w/index.php?title=Heating_degree_day&oldid=822187764.
    '''
    dd = T_base - T
    if truncate and dd < 0.0:
        dd = 0.0
    return dd


def get_clean_isd_history(dest=os.path.join(folder, 'isd-history-cleaned.tsv'),
                          url="ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv"): # pragma: no cover
    """Basic method to update the isd-history file from the NOAA. This is useful
    as new weather stations are updated all the time.

    This function requires pandas to run. If fluids is installed for the
    superuser, this method must be called in an instance of Python running
    as the superuser (administrator).

    Retrieving the file from ftp typically takes several seconds.
    Pandas reads the file in ~30 ms and writes it in ~220 ms. Reading it with
    the code below takes ~220 ms but is necessary to prevent a pandas
    dependency.

    Parameters
    ----------
    dest : str, optional
        The file to store the data retrieved; leave as the default argument
        for it to be accessible by fluids.
    url : str, optional
        The location of the data file; this can be anywhere that can be read
        by pandas, including a local file as would be useful in an offline
        situation.
    """
    import pandas as pd
    df = pd.read_csv(url, dtype={'USAF': str, 'WBAN': str})
    df.to_csv(dest, sep='\t', index=False, header=False)


class IntegratedSurfaceDatabaseStation:
    """Class to hold data on a weather station in the Integrated Surface
    Database.

    License information for the database can be found at the following link:
    https://data.noaa.gov/dataset/global-surface-summary-of-the-day-gsod

    Note: Of the 28000 + stations in the database, approximately 3000 have WBAN
    identifiers; 26000 have unique names; 24000 have USAF identifiers; and
    there are only 25800 unique lat/lon pairs.

    To uniquely represent a weather station, a combination of identifiers
    must be used. (Name, USAF, WBAN) makes a good choice.

    Parameters
    ----------
    USAF : str or None if unassigned
        Air Force station ID. May contain a letter in the first position.
    WBAN : str or None if unassigned
        NCDC WBAN number
    NAME : str
        Name of the station; ex. 'CENTRAL COLORADO REGIONAL AP'
    CTRY : str or None if unspecified
        FIPS country ID
    ST : str or None if not in the US
        State for US stations
    ICAO : str or None if not an airport
        ICAO airport code
    LAT : float
        Latitude with a precision of one thousandths of a decimal degree,
        [degrees]
    LON : float
        Longitude with a precision of one thousandths of a decimal degree,
        [degrees]
    ELEV : float
        Elevation of weather station, [m]
    BEGIN : float
        Beginning Period Of Record (YYYYMMDD). There may be reporting gaps
        within the P.O.R.
    END : Ending Period Of Record (YYYYMMDD). There may be reporting gaps
        within the P.O.R.
    """

    __slots__ = ['USAF', 'WBAN', 'NAME', 'CTRY', 'ST', 'ICAO', 'LAT', 'LON',
                 'ELEV', 'BEGIN', 'END', 'raw_data', 'parsed_data']

    def __repr__(self):
        s = ('<Weather station registered in the Integrated Surface Database, '
            'name %s, country %s, USAF %s, WBAN %s, coords (%s, %s) '
            'Weather data from %s to %s>' )
        return s%(self.NAME, self.CTRY, self.USAF, self.WBAN, self.LAT, self.LON, str(self.BEGIN)[0:4], str(self.END)[0:4])

    def __init__(self, USAF, WBAN, NAME, CTRY, ST, ICAO, LAT, LON, ELEV, BEGIN,
                 END):
        try:
            self.USAF = int(USAF)
        except:
            self.USAF = USAF # Nones
        self.WBAN = WBAN
        self.NAME = NAME
        self.CTRY = CTRY
        self.ST = ST
        self.ICAO = ICAO
        self.LAT = LAT
        self.LON = LON
        self.ELEV = ELEV
        self.BEGIN = int(BEGIN)
        self.END = int(END)


class StationDataGSOD:
    # Holds data, caches and retrieves data
    def __init__(self, station, data_dir_override=None):
        self.data_dir_override = data_dir_override
        self.station = station

        self.begin = datetime.datetime.strptime(str(self.station.BEGIN), '%Y%m%d')
        self.end = datetime.datetime.strptime(str(self.station.END), '%Y%m%d')

        self.year_range = range(self.begin.year, self.end.year + 1)

#         Would be nice to create these later, when using a download_data method
        self.raw_text = {}
        self.raw_data = {}
        self.parsed_data = {}
        self.load_empty_vectors()
        self.download_data()
        self.parse_data()

    def load_empty_vectors(self):
        for year in self.year_range:
            days_in_year = 366 if isleap(year) else 365
            self.raw_data[year] = [None]*days_in_year
            self.parsed_data[year] = [None]*days_in_year
            self.raw_text[year] = None
#        days = [None]*days_in_year(y)

    def download_data(self):
        for year in self.year_range:
            if self.raw_text[year] is None:
                try:
                    year_data = get_station_year_text(self.station.USAF, self.station.WBAN, year, data_dir_override=self.data_dir_override)
                    self.raw_text[year] = year_data
                except:
                    pass

    def parse_data(self):
        for year, data in self.raw_text.items():
            if data is not None:
                days = self.parsed_data[year]
                for line in data.split('\n')[1:-1]:
                    parsed = gsod_day_parser(line)
                    doy = parsed.DATE.timetuple().tm_yday-1
                    days[doy] = parsed

    def coldest_month(self, older_year=None, newer_year=None, minimum_days=23):
        # Tested
        month_data = self.month_average_temperature(older_year=older_year,
                                                    newer_year=newer_year,
                                                    minimum_days=minimum_days)
        return month_data.index(min(month_data))

    def warmest_month(self, older_year=None, newer_year=None, minimum_days=23):
        # Tested
        month_data = self.month_average_temperature(older_year=older_year,
                                                    newer_year=newer_year,
                                                    minimum_days=minimum_days)
        return month_data.index(max(month_data))

    def month_average_temperature(self, older_year=None, newer_year=None,
                                  include_yearly=False, minimum_days=23):
        '''
        >> station = get_closest_station(38.8572, -77.0369)
        >> station_data = StationDataGSOD(station)
        >> station_data.month_average_temperature(1990, 2000, include_yearly=False)
        [276.1599380905833, 277.5375516246206, 281.1881231671554, 286.7367003367004, 291.8689638318671, 296.79545454545456, 299.51868686868687, 298.2097914630174, 294.4116161616162, 288.25883023786247, 282.3188552188553, 277.8282339524275]
        '''
        # Take years, make them inclusive; add minimum valid days.
        year_month_averages = {}
        year_month_counts = {}

        for year, data in self.parsed_data.items():
            if not (older_year <= year <= newer_year):
                continue # Ignore out-of-range years easily
            year_month_averages[year] = [0.0]*12
            year_month_counts[year] = [0]*12

            for i, day in enumerate(data):
                if day is None:
                    continue
                # Don't do these comparisons to make it fast
                if day.DATE.year < older_year or day.DATE.year > newer_year:
                    continue # Ignore out-of-range days as possible

                T = day.TEMP
                if T is None:
                    continue
                # Cache these lookups
                year_month_averages[year][day.DATE.month-1] += T
                year_month_counts[year][day.DATE.month-1] += 1

            for month in range(12):
                count = year_month_counts[year][month]
                if count < minimum_days:
                    ans = None
                else:
                    ans = year_month_averages[year][month]/count
                year_month_averages[year][month] = ans

        # Compute the average of the month
        actual_averages = [0.0]*12
        actual_averages_counts = [0]*12
        for year, average in year_month_averages.items():
            for month in range(12):
                if average is not None and average[month] is not None:
                    count = actual_averages_counts[month]
                    if count is None:
                        count = 1
                    else:
                        count += 1
                    actual_averages_counts[month] = count
                    month_average_sum = actual_averages[month]
                    if month_average_sum is None:
                        month_average_sum = average[month]
                    else:
                        month_average_sum += average[month]
                    actual_averages[month] = month_average_sum

        for month in range(12):
            actual_averages[month] = actual_averages[month]/actual_averages_counts[month]

        # Don't set anything as properties - too many variables used in calculating thems
        # Speed is not that important.
        if include_yearly:
            return actual_averages, year_month_averages
        else:
            return actual_averages

    # Copy and paste
    def month_average_windspeed(self, older_year=None, newer_year=None,
                                  include_yearly=False, minimum_days=23):
        # Take years, make them inclusive; add minimum valid days.
        year_month_averages = {}
        year_month_counts = {}

        for year, data in self.parsed_data.items():
            if not (older_year <= year <= newer_year):
                continue # Ignore out-of-range years easily
            year_month_averages[year] = [0.0]*12
            year_month_counts[year] = [0]*12

            for i, day in enumerate(data):
                if day is None:
                    continue
                # Don't do these comparisons to make it fast
                if day.DATE.year < older_year or day.DATE.year > newer_year:
                    continue # Ignore out-of-range days as possible

                wind_speed = day.WDSP
                if wind_speed is None:
                    continue
                # Cache these lookups
                year_month_averages[year][day.DATE.month-1] += wind_speed
                year_month_counts[year][day.DATE.month-1] += 1

            for month in range(12):
                count = year_month_counts[year][month]
                if count < minimum_days:
                    ans = None
                else:
                    ans = year_month_averages[year][month]/count
                year_month_averages[year][month] = ans

        # Compute the average of the month
        actual_averages = [0.0]*12
        actual_averages_counts = [0]*12
        for year, average in year_month_averages.items():
            for month in range(12):
                if average is not None and average[month] is not None:
                    count = actual_averages_counts[month]
                    if count is None:
                        count = 1
                    else:
                        count += 1
                    actual_averages_counts[month] = count
                    month_average_sum = actual_averages[month]
                    if month_average_sum is None:
                        month_average_sum = average[month]
                    else:
                        month_average_sum += average[month]
                    actual_averages[month] = month_average_sum

        for month in range(12):
            actual_averages[month] = actual_averages[month]/actual_averages_counts[month]

        # Don't set anything as properties - too many variables used in calculating thems
        # Speed is not that important.
        if include_yearly:
            return actual_averages, year_month_averages
        else:
            return actual_averages

    def percentile_extreme_condition(self, older_year=None, newer_year=None,
                                  include_yearly=False, minimum_days=23, attr='WDSP'):
        # Really need to normalize data with interpolation etc here.
        # Need to get the data, and process it and score interpolation regimes.
        # Or could just randomly drop data and try to fill it in.
        accepted_values = []
        for year in self.parsed_data.keys():
            if not (older_year <= year <= newer_year):
                continue # Ignore out-of-range years easily



stations = []
_latlongs = []
"""Read in the parsed data into
1) a list of latitudes and longitudes, temporary, which will get converted to
a numpy array for use in KDTree
2) a list of IntegratedSurfaceDatabaseStation objects; the query will return
the index of the nearest weather stations.
"""
with open(os.path.join(folder, 'isd-history-cleaned.tsv')) as f:
    for line in f:
        values = line.split('\t')
        for i in range(11):
            # First two are not values
            v = values[i]
            if v == '':
                values[i] = None # '' case
            else:
                try:
                    if i > 2:
                        values[i] = float(v)
                    if int(v) == 99999:
                        values[i] = None
                except:
                    continue
        lat, lon = values[6], values[7]
        if lat and lon:
            # Some stations have no lat-long; this isn't useful
            stations.append(IntegratedSurfaceDatabaseStation(*values))
            _latlongs.append((lat, lon))
_latlongs = np.array(_latlongs)
station_count = len(stations)


kd_tree = cKDTree(_latlongs) # _latlongs must be unchanged as data is not copied


def get_closest_station(latitude, longitude, minumum_recent_data=20140000,
                        match_max=100):
    """Query function to find the nearest weather station to a particular set of
    coordinates. Optionally allows for a recent date by which the station is
    required to be still active at.

    Parameters
    ----------
    latitude : float
        Latitude to search for nearby weather stations at, [degrees]
    longitude : float
        Longitude to search for nearby weather stations at, [degrees]
    minumum_recent_data : int, optional
        Date that the weather station is required to have more recent
        weather data than; format YYYYMMDD; set this to 0 to not restrict data
        by date.
    match_max : int, optional
        The maximum number of results in the KDTree to search for before
        applying the filtering criteria; an internal parameter which is
        increased automatically if the default value is insufficient [-]

    Returns
    -------
    station : IntegratedSurfaceDatabaseStation
        Instance of IntegratedSurfaceDatabaseStation which was nearest
        to the requested coordinates and with sufficiently recent data
        available [-]

    Notes
    -----
    Searching for 100 stations is a reasonable choice as it takes, ~70
    microseconds vs 50 microsecond to find only 1 station. The search does get
    slower as more points are requested. Bad data is returned from a KDTree
    search if more points are requested than are available.

    Examples
    --------
    >>> get_closest_station(51.02532675, -114.049868485806, 20150000)
    <Weather station registered in the Integrated Surface Database, name CALGARY INTL CS, country CA, USAF 713930, WBAN None, coords (51.1, -114.0) Weather data from 2004 to 2020>
    """
    # Both station strings may be important
    # Searching for 100 stations is fine, 70 microseconds vs 50 microsecond for 1
    # but there's little point for more points, it gets slower.
    # bad data is returned if k > station_count
    distances, indexes = kd_tree.query([latitude, longitude], k=min(match_max, station_count))
    for i in indexes:
        latlon = _latlongs[i]
        enddate = stations[i].END
        # Iterate for all indexes until one is found whose date is current
        if enddate > minumum_recent_data:
            return stations[i]
    if match_max < station_count:
        return get_closest_station(latitude, longitude, minumum_recent_data=minumum_recent_data, match_max=match_max*10)
    raise ValueError('Could not find a station with more recent data than '
                    'specified near the specified coordinates.')


# This should be aggressively cached
def get_station_year_text(WMO, WBAN, year, data_dir_override=None):
    """Basic method to download data from the GSOD database, given a station
    identifier and year.

    Parameters
    ----------
    WMO : int or None
         World Meteorological Organization (WMO) identifiers, [-]
    WBAN : int or None
        Weather Bureau Army Navy (WBAN) weather station identifier, [-]
    year : int
        Year data should be retrieved from, [year]
    data_dir_override : str, optional
        Directory to store the downloaded data (instead of the
         configuration directory), [-]

    Returns
    -------
    data : str
        Downloaded data file
    """
    if WMO is None:
        WMO = 999999
    if WBAN is None:
        WBAN = 99999
    station = str(int(WMO)) + '-' + str(WBAN)
    if data_dir_override is None:
        gsod_year_dir = os.path.join(data_dir, 'gsod', str(year))
    else:
        gsod_year_dir = os.path.join(data_dir_override, str(year))
    path = os.path.join(gsod_year_dir, station + '.op')
    if os.path.exists(path):
        with open(path) as f:
            data = f.read()
            if data and data != 'Exception':
                return data
            else:
                # Remove the bad file and try to redownload it
                try:
                    os.remove(path)
                except:
                    pass
                # raise ValueError(data)

    toget = ('ftp://ftp.ncdc.noaa.gov/pub/data/gsod/' + str(year) + '/'
             + station + '-' + str(year) +'.op.gz')
    try:
        data = urlopen(toget, timeout=5)
    except Exception as e:
        if not os.path.exists(gsod_year_dir):
            os.makedirs(gsod_year_dir)
        with open(path, 'w') as f:
            f.write('Exception')
        raise ValueError('Could not obtain desired data; check '
                        'if the year has data published for the '
                        'specified station and the station was specified '
                        f'in the correct form. The full error is {e}')

    data = data.read()
    data_thing = StringIO(data)

    f = gzip.GzipFile(fileobj=data_thing, mode="r")
    year_station_data = f.read()
    try:
        year_station_data = year_station_data.decode('utf-8')
    except:
        pass

    # Cache the data for future use
    if not os.path.exists(gsod_year_dir):
        os.makedirs(gsod_year_dir)
    open(path, 'w').write(year_station_data)


    return year_station_data



gsod_fields = ['DATE', # 15-18 int year; 19-22 int month/day
               'TEMP', # 25-30 Real Mean temperature for the day in degrees Fahrenheit to tenths. Missing = 9999.9
               'TEMP_COUNT', # 32-33 Int. Number of observations used in calculating mean temperature
               'DEWP', # 36-41 Real Mean dew point for the day in degrees Fahrenheit to tenths.  Missing = 9999.9
               'DEWP_COUNT', # 43-44 Int. Number of observations used in calculating mean dew point
               'SLP', # 47-52 Real Mean sea level pressure for the day in millibars to tenths.  Missing = 9999.9
               'SLP_COUNT', # 54-55 Int. Number of observations used in calculating mean sea level pressure
               'STP', # 58-63 Real Mean station pressure for the day in millibars to tenths. Missing = 9999.9
               'STP_COUNT', # 65-66 Int. Number of observations used in calculating mean station pressure
               'VISIB', # 69-73 Real Mean visibility for the day in miles to tenths. Missing = 999.9
               'VISIB_COUNT', # 75-76 Int. Number of observations used in calculating mean visibility
               'WDSP', # 79-83 Real Mean wind speed for the day in knots to tenths. Missing = 999.9
               'WDSP_COUNT', # 85-86 Int. Number of observations used in calculating mean wind speed
               'MXSPD', # 89-93 Real Maximum sustained wind speed reported for the day in knots to tenths. Missing = 999.9
               'GUST', # 96-100 Real Maximum wind gust reported for the day in knots to tenths. Missing = 999.9
               'MAX', # 103-108 Real Maximum temperature reported during the
                      # day in Fahrenheit to tenths--time of max temp report varies by country and
                      # region, so this will sometimes not be the max for the calendar day.
                      # Missing = 9999.9; FLAG of '*' is present on 109-109!
               'MIN', # 111-116 Real Minimum temperature reported during the day in Fahrenheit to tenths--time of min
                      # temp report varies by country and region, so this will sometimes not be
                      # the min for the calendar day. Missing = 9999.9 FLAG of '*' is present on 117-117!
               'PRCP', # 119-123 Real Total precipitation (rain and/or melted snow) reported during the day in inches
                       # and hundredths; will usually not end with the midnight observation--i.e.,
                       # may include latter part of previous day. .00 indicates no measurable
                       # precipitation (includes a trace).
                       # Missing = 99.99

               'SNDP', # 126-130 Real Snow depth in inches to tenths--last report for the day if reported more than
                       # once.  Missing = 999.9 Note: Most stations do not report '0' on days with no snow on the
                       # ground--therefore, '999.9' will often appear on these days.
               'FRSHTT' # 133-138 Int. Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day of:
                        # Fog ('F' - 1st digit).
                        # Rain or Drizzle ('R' - 2nd digit).
                        # Snow or Ice Pellets ('S' - 3rd digit).
                        # Hail ('H' - 4th digit).
                        # Thunder ('T' - 5th digit).
                        # Tornado or Funnel Cloud ('T' - 6th digit).
              ]
# Use TEMP and DEWP and STP to calculate wet bulb temperatures
# Values to be converted to floats always
gsod_float_fields = ('TEMP', 'DEWP', 'SLP', 'STP', 'VISIB', 'WDSP', 'MXSPD',
                     'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP')
# Values to be converted to ints always
gsod_int_fields = ('TEMP_COUNT', 'DEWP_COUNT', 'SLP_COUNT', 'STP_COUNT',
                   'VISIB_COUNT', 'WDSP_COUNT')

# Values which signify flags
gsod_flag_chars = '*ABCDEFGHI'
# Values which should be converted to None, as normally there is no value
gsod_bad_values = frozenset(['99.99', '999.9', '9999.9'])

gsod_indicator_names = ['fog', 'rain', 'snow_ice', 'hail', 'thunder',
                        'tornado']
five_ninths = 5.0/9.0

gsod_day = namedtuple('gsod_day', gsod_fields + gsod_indicator_names)


def gsod_day_parser(line, SI=True, to_datetime=True):
    """One line (one file) parser of data in the format of the GSOD database.
    Returns all parsed results as a namedtuple for reduced memory consumption.
    Will convert all data to base SI units unless the `SI` flag is set to False.
    As the values are rounded to one or two decimal places in the GSOD database
    in Imperial units, it may be useful to look at the values directly.

    The names columns of the columns in the GSOD database are retained and used
    as the attributes of the namedtuple results.

    The day, month, and year are normally converted to a datetime instance in
    resulting namedtuple; this behavior can be disabled by setting the
    `datetime` flag to False; it will be a string in the format YYYYMMDD if so.
    This may be useful because datetime conversion roughly doubles the speed of
    this function.

    Parameters
    ----------
    line : str
        Line in format of GSOD documentation, [-]
    SI : bool
        Whether or not the results get converted to base SI units, [-]
    to_datetime : bool
        Whether or not the date gets converted to a datetime instance or stays
        as a string, [-]

    Returns
    -------
    gsod_day_instance : gsod_day
        namedtuple with fields described in the source (all values in SI units,
        if `SI` is True, i.e. meters, m/s, Kelvin, Pascal; otherwise the
        original unit set is used), [-]
    """
    # Ignore STN--- and WBAN, 8-12 characters
    fields = line.strip().split()[2:]
    # For the case the field is blank, set it to None; strip it either way
    for i in range(len(fields)):
        field = fields[i].rstrip()
        if not field:
            field = None
        fields[i] = field

    obj = dict(zip(gsod_fields, fields))
    # Convert the date to a datetime object if specified
    if to_datetime and obj['DATE'] is not None:
        date = obj['DATE']
        obj['DATE'] = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:]))
        #obj['DATE'] = datetime.datetime.strptime(obj['DATE'], '%Y%m%d')

    # Parse float values as floats
    for field in gsod_float_fields:
        value = obj[field].rstrip(gsod_flag_chars)
        if value in gsod_bad_values:
            value = None
        else:
            value = float(value)
        obj[field] = value

    if SI:
        # All temperatures are in deg F
        for field in ('TEMP', 'DEWP', 'MAX', 'MIN'):
            value = obj[field]
            if value is not None:
                # F2K inline for efficiency unfortunately
                obj[field] = (value + 459.67)*five_ninths

        # Convert visibility, wind speed, pressures
        # to si units of meters, Pascal, and meters/second.
        if obj['VISIB'] is not None:
            obj['VISIB'] = obj['VISIB']*mile
        if obj['PRCP'] is not None:
            obj['PRCP'] = obj['PRCP']*inch
        if obj['SNDP'] is not None:
            obj['SNDP'] = obj['SNDP']*inch
        if obj['WDSP'] is not None:
            obj['WDSP'] = obj['WDSP']*knot
        if obj['MXSPD'] is not None:
            obj['MXSPD'] = obj['MXSPD']*knot
        if obj['GUST'] is not None:
            obj['GUST'] = obj['GUST']*knot
        if obj['SLP'] is not None:
            obj['SLP'] = obj['SLP']*100.0
        if obj['STP'] is not None:
            obj['STP'] = obj['STP']*100.0

    # Parse int values as ints
    for field in gsod_int_fields:
        value = obj[field]
        if value is not None:
            obj[field] = int(value)

    indicator_values = [flag == '1' for flag in obj['FRSHTT']]
    obj.update(zip(gsod_indicator_names, indicator_values))
    return gsod_day(**obj)