1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
|
from __future__ import annotations
import unicodedata
import numpy as np
from xarray import coding
from xarray.core.variable import Variable
# Special characters that are permitted in netCDF names except in the
# 0th position of the string
_specialchars = '_.@+- !"#$%&\\()*,:;<=>?[]^`{|}~'
# The following are reserved names in CDL and may not be used as names of
# variables, dimension, attributes
_reserved_names = {
"byte",
"char",
"short",
"ushort",
"int",
"uint",
"int64",
"uint64",
"float",
"real",
"double",
"bool",
"string",
}
# These data-types aren't supported by netCDF3, so they are automatically
# coerced instead as indicated by the "coerce_nc3_dtype" function
_nc3_dtype_coercions = {
"int64": "int32",
"uint64": "int32",
"uint32": "int32",
"uint16": "int16",
"uint8": "int8",
"bool": "int8",
}
# encode all strings as UTF-8
STRING_ENCODING = "utf-8"
COERCION_VALUE_ERROR = (
"could not safely cast array from {dtype} to {new_dtype}. While it is not "
"always the case, a common reason for this is that xarray has deemed it "
"safest to encode np.datetime64[ns] or np.timedelta64[ns] values with "
"int64 values representing units of 'nanoseconds'. This is either due to "
"the fact that the times are known to require nanosecond precision for an "
"accurate round trip, or that the times are unknown prior to writing due "
"to being contained in a chunked array. Ways to work around this are "
"either to use a backend that supports writing int64 values, or to "
"manually specify the encoding['units'] and encoding['dtype'] (e.g. "
"'seconds since 1970-01-01' and np.dtype('int32')) on the time "
"variable(s) such that the times can be serialized in a netCDF3 file "
"(note that depending on the situation, however, this latter option may "
"result in an inaccurate round trip)."
)
def coerce_nc3_dtype(arr):
"""Coerce an array to a data type that can be stored in a netCDF-3 file
This function performs the dtype conversions as specified by the
``_nc3_dtype_coercions`` mapping:
int64 -> int32
uint64 -> int32
uint32 -> int32
uint16 -> int16
uint8 -> int8
bool -> int8
Data is checked for equality, or equivalence (non-NaN values) using the
``(cast_array == original_array).all()``.
"""
dtype = str(arr.dtype)
if dtype in _nc3_dtype_coercions:
new_dtype = _nc3_dtype_coercions[dtype]
# TODO: raise a warning whenever casting the data-type instead?
cast_arr = arr.astype(new_dtype)
if not (cast_arr == arr).all():
raise ValueError(
COERCION_VALUE_ERROR.format(dtype=dtype, new_dtype=new_dtype)
)
arr = cast_arr
return arr
def encode_nc3_attr_value(value):
if isinstance(value, bytes):
pass
elif isinstance(value, str):
value = value.encode(STRING_ENCODING)
else:
value = coerce_nc3_dtype(np.atleast_1d(value))
if value.ndim > 1:
raise ValueError("netCDF attributes must be 1-dimensional")
return value
def encode_nc3_attrs(attrs):
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}
def _maybe_prepare_times(var):
# checks for integer-based time-like and
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
# this keeps backwards compatibility
data = var.data
if data.dtype.kind in "iu":
units = var.attrs.get("units", None)
if units is not None and coding.variables._is_time_like(units):
mask = data == np.iinfo(np.int64).min
if mask.any():
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
return data
def encode_nc3_variable(var, name=None):
for coder in [
coding.strings.EncodedStringCoder(allows_unicode=False),
coding.strings.CharacterArrayCoder(),
]:
var = coder.encode(var, name=name)
data = _maybe_prepare_times(var)
data = coerce_nc3_dtype(data)
attrs = encode_nc3_attrs(var.attrs)
return Variable(var.dims, data, attrs, var.encoding)
def _isalnumMUTF8(c):
"""Return True if the given UTF-8 encoded character is alphanumeric
or multibyte.
Input is not checked!
"""
return c.isalnum() or (len(c.encode("utf-8")) > 1)
def is_valid_nc3_name(s):
"""Test whether an object can be validly converted to a netCDF-3
dimension, variable or attribute name
Earlier versions of the netCDF C-library reference implementation
enforced a more restricted set of characters in creating new names,
but permitted reading names containing arbitrary bytes. This
specification extends the permitted characters in names to include
multi-byte UTF-8 encoded Unicode and additional printing characters
from the US-ASCII alphabet. The first character of a name must be
alphanumeric, a multi-byte UTF-8 character, or '_' (reserved for
special names with meaning to implementations, such as the
"_FillValue" attribute). Subsequent characters may also include
printing special characters, except for '/' which is not allowed in
names. Names that have trailing space characters are also not
permitted.
"""
if not isinstance(s, str):
return False
num_bytes = len(s.encode("utf-8"))
return (
(unicodedata.normalize("NFC", s) == s)
and (s not in _reserved_names)
and (num_bytes >= 0)
and ("/" not in s)
and (s[-1] != " ")
and (_isalnumMUTF8(s[0]) or (s[0] == "_"))
and all(_isalnumMUTF8(c) or c in _specialchars for c in s)
)
|