"""Sparse Dtype""" import re from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type import warnings import numpy as np from pandas._typing import Dtype, DtypeObj from pandas.errors import PerformanceWarning from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype if TYPE_CHECKING: from pandas.core.arrays.sparse.array import SparseArray # noqa: F401 @register_extension_dtype class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. This dtype implements the pandas ExtensionDtype interface. .. versionadded:: 0.24.0 Parameters ---------- dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this depends on `dtype`. =========== ========== dtype na_value =========== ========== float ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` timedelta64 ``pd.NaT`` =========== ========== The default value may be overridden by specifying a `fill_value`. Attributes ---------- None Methods ------- None """ # We include `_is_na_fill_value` in the metadata to avoid hash collisions # between SparseDtype(float, 0.0) and SparseDtype(float, nan). # Without is_na_fill_value in the comparison, those would be equal since # hash(nan) is (sometimes?) 0. _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if isinstance(dtype, type(self)): if fill_value is None: fill_value = dtype.fill_value dtype = dtype.subtype dtype = pandas_dtype(dtype) if is_string_dtype(dtype): dtype = np.dtype("object") if fill_value is None: fill_value = na_value_for_dtype(dtype) if not is_scalar(fill_value): raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead") self._dtype = dtype self._fill_value = fill_value def __hash__(self): # Python3 doesn't inherit __hash__ when a base class overrides # __eq__, so we explicitly do it here. return super().__hash__() def __eq__(self, other: Any) -> bool: # We have to override __eq__ to handle NA values in _metadata. # The base class does simple == checks, which fail for NA. if isinstance(other, str): try: other = self.construct_from_string(other) except TypeError: return False if isinstance(other, type(self)): subtype = self.subtype == other.subtype if self._is_na_fill_value: # this case is complicated by two things: # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) # i.e. we want to treat any floating-point NaN as equal, but # not a floating-point NaN and a datetime NaT. fill_value = ( other._is_na_fill_value and isinstance(self.fill_value, type(other.fill_value)) or isinstance(other.fill_value, type(self.fill_value)) ) else: fill_value = self.fill_value == other.fill_value return subtype and fill_value return False @property def fill_value(self): """ The fill value of the array. Converting the SparseArray to a dense ndarray will fill the array with this value. .. warning:: It's possible to end up with a SparseArray that has ``fill_value`` values in ``sp_values``. This can occur, for example, when setting ``SparseArray.fill_value`` directly. 
""" return self._fill_value @property def _is_na_fill_value(self): return isna(self.fill_value) @property def _is_numeric(self) -> bool: return not is_object_dtype(self.subtype) @property def _is_boolean(self) -> bool: return is_bool_dtype(self.subtype) @property def kind(self): """ The sparse kind. Either 'integer', or 'block'. """ return self.subtype.kind @property def type(self): return self.subtype.type @property def subtype(self): return self._dtype @property def name(self): return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]" def __repr__(self) -> str: return self.name @classmethod def construct_array_type(cls) -> Type["SparseArray"]: """ Return the array type associated with this dtype. Returns ------- type """ from pandas.core.arrays.sparse.array import SparseArray # noqa: F811 return SparseArray @classmethod def construct_from_string(cls, string: str) -> "SparseDtype": """ Construct a SparseDtype from a string form. Parameters ---------- string : str Can take the following forms. string dtype ================ ============================ 'int' SparseDtype[np.int64, 0] 'Sparse' SparseDtype[np.float64, nan] 'Sparse[int]' SparseDtype[np.int64, 0] 'Sparse[int, 0]' SparseDtype[np.int64, 0] ================ ============================ It is not possible to specify non-default fill values with a string. An argument like ``'Sparse[int, 1]'`` will raise a ``TypeError`` because the default fill value for integers is 0. Returns ------- SparseDtype """ if not isinstance(string, str): raise TypeError( f"'construct_from_string' expects a string, got {type(string)}" ) msg = f"Cannot construct a 'SparseDtype' from '{string}'" if string.startswith("Sparse"): try: sub_type, has_fill_value = cls._parse_subtype(string) except ValueError as err: raise TypeError(msg) from err else: result = SparseDtype(sub_type) msg = ( f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " "looks like the fill_value in the string is not " "the default for the dtype. Non-default fill_values " "are not supported. Use the 'SparseDtype()' " "constructor instead." ) if has_fill_value and str(result) != string: raise TypeError(msg) return result else: raise TypeError(msg) @staticmethod def _parse_subtype(dtype: str) -> Tuple[str, bool]: """ Parse a string to get the subtype Parameters ---------- dtype : str A string like * Sparse[subtype] * Sparse[subtype, fill_value] Returns ------- subtype : str Raises ------ ValueError When the subtype cannot be extracted. """ xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") m = xpr.match(dtype) has_fill_value = False if m: subtype = m.groupdict()["subtype"] has_fill_value = bool(m.groupdict()["fill_value"]) elif dtype == "Sparse": subtype = "float64" else: raise ValueError(f"Cannot parse {dtype}") return subtype, has_fill_value @classmethod def is_dtype(cls, dtype: object) -> bool: dtype = getattr(dtype, "dtype", dtype) if isinstance(dtype, str) and dtype.startswith("Sparse"): sub_type, _ = cls._parse_subtype(dtype) dtype = np.dtype(sub_type) elif isinstance(dtype, cls): return True return isinstance(dtype, np.dtype) or dtype == "Sparse" def update_dtype(self, dtype): """ Convert the SparseDtype to a new dtype. This takes care of converting the ``fill_value``. Parameters ---------- dtype : Union[str, numpy.dtype, SparseDtype] The new dtype to use. * For a SparseDtype, it is simply returned * For a NumPy dtype (or str), the current fill value is converted to the new dtype, and a SparseDtype with `dtype` and the new fill value is returned. 
        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            if is_extension_array_dtype(dtype):
                raise TypeError("sparse arrays of extension dtypes not supported")

            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        <class 'str'>
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        # TODO for now only handle SparseDtypes and numpy dtypes => extend
        # with other compatible extension dtypes
        if any(
            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
            for x in dtypes
        ):
            return None

        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all-NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=6,
            )

        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
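

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the pandas source): a minimal,
# hedged demonstration of how SparseDtype behaves, assuming this module is
# importable from an installed pandas. Examples like this would normally live
# in the docs or test suite rather than in the module itself.
if __name__ == "__main__":
    # Parsing a string form: "Sparse[int]" resolves to int64 with fill value 0.
    dtype = SparseDtype.construct_from_string("Sparse[int]")
    print(dtype)  # Sparse[int64, 0]

    # update_dtype converts the fill value along with the subtype (0 -> 0.0).
    print(dtype.update_dtype(np.dtype("float64")))  # Sparse[float64, 0.0]

    # NA fill values of the same type compare equal even though nan != nan.
    print(SparseDtype(float, np.nan) == SparseDtype(float, float("nan")))  # True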