Source code for deephaven.dtypes

#
# Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
#

""" This module defines the data types supported by the Deephaven engine.

Each data type is represented by a DType class which supports creating arrays of the same type and more.
"""
from __future__ import annotations

import datetime
from typing import Any, Sequence, Callable, Dict, Type, Union, Optional

import jpy
import numpy as np
import pandas as pd

from deephaven import DHError
from deephaven.constants import NULL_BYTE, NULL_SHORT, NULL_INT, NULL_LONG, NULL_FLOAT, NULL_DOUBLE, NULL_CHAR

# JPY handles to the Java classes this module calls into.
_JQstType = jpy.get_type("io.deephaven.qst.type.Type")
_JTableTools = jpy.get_type("io.deephaven.engine.util.TableTools")
_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")

# Registry of every DType created so far, keyed by its Java class name. DType.__init__ adds the entries;
# from_jtype() and from_np_dtype() read them.
_j_name_type_map: Dict[str, DType] = {}


def _qst_custom_type(cls_name: str):
    """Looks up the QST Type for a Java class name.

    Args:
        cls_name (str): the Java class name, resolved through TableTools.typeFromName

    Returns:
        the JPY wrapped QST Type, or None if the lookup fails for any reason
    """
    try:
        return _JQstType.find(_JTableTools.typeFromName(cls_name))
    except Exception:
        # Best effort: a failed lookup is reported as None. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate.
        return None


class DType:
    """ Represents one data type known to the Deephaven engine.

    Every instance registers itself in the module-level name-to-DType map under its Java class name. Calling a
    non-primitive instance constructs a new Java object of the wrapped type.
    """

    def __init__(self, j_name: str, j_type: Type = None, qst_type: jpy.JType = None, is_primitive: bool = False,
                 np_type: Any = np.object_):
        """
        Args:
             j_name (str): the full qualified name of the Java class
             j_type (Type): the mapped Python class created by JPY; looked up from j_name when not given
             qst_type (JType): the JPY wrapped QST Type instance; looked up from j_name when not given
             is_primitive (bool): whether this instance represents a primitive Java type
             np_type (Any): a numpy dtype instance (e.g. dtype("int64")) or numpy class (e.g. np.int16), default is
                np.object_
        """
        self.j_name = j_name
        # Fall back to a lookup by name whenever the caller did not supply a usable value.
        self.j_type = j_type or jpy.get_type(j_name)
        self.qst_type = qst_type or _qst_custom_type(j_name)
        self.is_primitive = is_primitive
        self.np_type = np_type

        _j_name_type_map[j_name] = self

    def __repr__(self):
        return self.j_name

    def __call__(self, *args, **kwargs):
        """ Creates an instance of the wrapped Java type; primitive types cannot be instantiated.

        Raises:
            DHError
        """
        if self.is_primitive:
            raise DHError(message=f"primitive type {self.j_name} is not callable.")

        try:
            j_instance = self.j_type(*args, **kwargs)
        except Exception as err:
            raise DHError(err, f"failed to create an instance of {self.j_name}") from err
        return j_instance
# Scalar types. Each DType(...) call below also registers the new instance in _j_name_type_map (see DType.__init__).
# A plain assignment such as `int8 = byte` binds a second name to the same DType instance.
bool_ = DType(j_name="java.lang.Boolean", qst_type=_JQstType.booleanType(), np_type=np.bool_)
"""Boolean type"""
byte = DType(j_name="byte", qst_type=_JQstType.byteType(), is_primitive=True, np_type=np.int8)
"""Signed byte integer type"""
int8 = byte
"""Signed byte integer type"""
short = DType(j_name="short", qst_type=_JQstType.shortType(), is_primitive=True, np_type=np.int16)
"""Signed short integer type"""
int16 = short
"""Signed short integer type"""
char = DType(j_name="char", qst_type=_JQstType.charType(), is_primitive=True, np_type=np.dtype('uint16'))
"""Character type"""
int32 = DType(j_name="int", qst_type=_JQstType.intType(), is_primitive=True, np_type=np.int32)
"""Signed 32bit integer type"""
long = DType(j_name="long", qst_type=_JQstType.longType(), is_primitive=True, np_type=np.int64)
"""Signed 64bit integer type"""
int64 = long
"""Signed 64bit integer type"""
float32 = DType(j_name="float", qst_type=_JQstType.floatType(), is_primitive=True, np_type=np.float32)
"""Single-precision floating-point number type"""
single = float32
"""Single-precision floating-point number type"""
float64 = DType(j_name="double", qst_type=_JQstType.doubleType(), is_primitive=True, np_type=np.float64)
"""Double-precision floating-point number type"""
double = float64
"""Double-precision floating-point number type"""
string = DType(j_name="java.lang.String", qst_type=_JQstType.stringType(), np_type=np.str_)
"""String type"""

# Object types. np_type is left at the DType default (np.object_) except for Instant.
Character = DType(j_name="java.lang.Character")
"""Character type"""
BigInteger = DType(j_name="java.math.BigInteger")
"""Java BigInteger type"""
BigDecimal = DType(j_name="java.math.BigDecimal")
"""Java BigDecimal type"""
StringSet = DType(j_name="io.deephaven.stringset.StringSet")
"""Deephaven StringSet type"""
Instant = DType(j_name="java.time.Instant", np_type=np.dtype("datetime64[ns]"))
"""Instant date time type"""
LocalDate = DType(j_name="java.time.LocalDate")
"""Local date type"""
LocalTime = DType(j_name="java.time.LocalTime")
"""Local time type"""
ZonedDateTime = DType(j_name="java.time.ZonedDateTime")
"""Zoned date time type"""
Duration = DType(j_name="java.time.Duration")
"""Time period type, which is a unit of time in terms of clock time (24-hour days, hours, minutes, seconds, and nanoseconds)."""
Period = DType(j_name="java.time.Period")
"""Time period type, which is a unit of time in terms of calendar time (days, weeks, months, years, etc.)."""
TimeZone = DType(j_name="java.time.ZoneId")
"""Time zone type."""
BusinessCalendar = DType(j_name='io.deephaven.time.calendar.BusinessCalendar')
"""Business calendar type"""
PyObject = DType(j_name="org.jpy.PyObject")
"""Python object type"""
JObject = DType(j_name="java.lang.Object")
"""Java Object type"""

# Array types, named with the JVM array descriptors ('[Z', '[B', '[Ljava.lang.String;', ...).
bool_array = DType(j_name='[Z')
"""boolean array type"""
byte_array = DType(j_name='[B')
"""Byte array type"""
int8_array = byte_array
"""Byte array type"""
short_array = DType(j_name='[S')
"""Short array type"""
int16_array = short_array
"""Short array type"""
char_array = DType(j_name='[C')
"""char array type"""
int32_array = DType(j_name='[I')
"""32bit integer array type"""
long_array = DType(j_name='[J')
"""64bit integer array type"""
int64_array = long_array
"""64bit integer array type"""
single_array = DType(j_name='[F')
"""Single-precision floating-point array type"""
float32_array = single_array
"""Single-precision floating-point array type"""
double_array = DType(j_name='[D')
"""Double-precision floating-point array type"""
float64_array = double_array
"""Double-precision floating-point array type"""
string_array = DType(j_name='[Ljava.lang.String;')
"""Java String array type"""
boolean_array = DType(j_name='[Ljava.lang.Boolean;')
"""Java Boolean array type"""
instant_array = DType(j_name='[Ljava.time.Instant;')
"""Java Instant array type"""
zdt_array = DType(j_name='[Ljava.time.ZonedDateTime;')
"""Zoned date time array type"""

# DType -> Deephaven NULL constant. Read by null_remap() and _scalar(). Note that bool_ is mapped to NULL_BYTE.
_PRIMITIVE_DTYPE_NULL_MAP = {
    bool_: NULL_BYTE,
    byte: NULL_BYTE,
    char: NULL_CHAR,
    int16: NULL_SHORT,
    int32: NULL_INT,
    int64: NULL_LONG,
    float32: NULL_FLOAT,
    float64: NULL_DOUBLE,
}

# Component DType -> DType of the matching Java array. bool_ maps to the boxed Boolean[] type (boolean_array), not
# bool_array. NOTE(review): no reader of this map is visible in this part of the file - confirm its callers.
_BUILDABLE_ARRAY_DTYPE_MAP = {
    bool_: boolean_array,
    byte: int8_array,
    char: char_array,
    int16: int16_array,
    int32: int32_array,
    int64: int64_array,
    float32: float32_array,
    float64: float64_array,
    string: string_array,
    Instant: instant_array,
}

# JPY class of a Java array -> numpy dtype.
# NOTE(review): no reader of this map is visible in this part of the file - confirm its callers.
_J_ARRAY_NP_TYPE_MAP = {
    boolean_array.j_type: np.dtype("?"),
    byte_array.j_type: np.dtype("b"),
    char_array.j_type: np.dtype("uint16"),
    short_array.j_type: np.dtype("h"),
    int32_array.j_type: np.dtype("i"),
    long_array.j_type: np.dtype("l"),
    float32_array.j_type: np.dtype("f"),
    double_array.j_type: np.dtype("d"),
    string_array.j_type: np.dtype("U"),
    instant_array.j_type: np.dtype("datetime64[ns]"),
}
def null_remap(dtype: DType) -> Callable[[Any], Any]:
    """ Builds a function that substitutes the Deephaven NULL constant of the given DType for None.

    Args:
        dtype (DType): the DType instance, must have an entry in the primitive NULL value map

    Returns:
        a Callable that returns the NULL constant when given None, and any other value unchanged

    Raises:
        TypeError
    """
    replacement = _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)
    if replacement is None:
        raise TypeError("null_remap() must be called with a primitive DType")

    def _remap(value: Any) -> Any:
        if value is None:
            return replacement
        return value

    return _remap
def _instant_array(data: Sequence) -> jpy.JType:
    """Converts a sequence of either datetime64[ns], datetime.datetime, pandas.Timestamp, datetime strings, or
    integers in nanoseconds, to a Java array of Instant values.

    Args:
        data (Sequence): the values to convert

    Returns:
        a Java array of Instant
    """
    if len(data) == 0:
        return jpy.array(Instant.j_type, [])

    if isinstance(data, np.ndarray) and data.dtype.kind == 'U':
        return _JPrimitiveArrayConversionUtility.translateArrayStringToInstant(data)

    # Use an identity test for None: `d == None` dispatches to the element's own __eq__, and for pandas.NA the
    # comparison result cannot be used as a boolean (it raises TypeError).
    if all(d is None or isinstance(d, str) for d in data):
        jdata = jpy.array('java.lang.String', data)
        return _JPrimitiveArrayConversionUtility.translateArrayStringToInstant(jdata)

    # try to convert to numpy array of datetime64 if not already, so that we can call translateArrayLongToInstant on
    # it to reduce the number of round trips to the JVM
    if not isinstance(data, np.ndarray):
        try:
            # Pandas drops unrecognized time zones, so it may handle time zones incorrectly when parsing strings
            if not any(isinstance(i, str) for i in data):
                data = np.array([pd.Timestamp(dt).to_numpy() for dt in data], dtype=np.datetime64)
        except Exception:
            # Deliberate best effort: when the bulk conversion fails, `data` is left as is and the element-wise
            # conversion at the end of the function handles it.
            pass

    # Pandas drops unrecognized time zones, so it may handle time zones incorrectly, so do not handle 'U' dtype
    if isinstance(data, np.ndarray) and data.dtype.kind in ('M', 'i'):
        if data.dtype.kind == 'M':
            # normalize to nanosecond resolution before reinterpreting the values as longs
            nanos = data.astype('datetime64[ns]').astype('int64')
        else:
            nanos = data.astype('int64')
        longs = jpy.array('long', nanos)
        return _JPrimitiveArrayConversionUtility.translateArrayLongToInstant(longs)

    if not isinstance(data, instant_array.j_type):
        from deephaven.time import to_j_instant
        data = [to_j_instant(d) for d in data]

    return jpy.array(Instant.j_type, data)
def array(dtype: DType, seq: Optional[Sequence], remap: Callable[[Any], Any] = None) -> Optional[jpy.JType]:
    """ Builds a Java array of the given component type from the values of a sequence.

    Note: this method does unsafe casting, meaning precision and values might be lost with down cast

    Args:
        dtype (DType): the component type of the array
        seq (Sequence): a sequence of compatible data, e.g. list, tuple, numpy array, Pandas series, etc.
        remap (optional): a callable that takes one value and maps it to another, for handling the translation of
            special DH values such as NULL_INT, NAN_INT between Python and the DH engine

    Returns:
        a Java array, or None when seq is None

    Raises:
        ValueError: seq is a numpy array with more than one dimension
        TypeError: dtype is not a DType
        DHError: the conversion itself failed
    """
    if seq is None:
        return None

    if isinstance(seq, np.ndarray) and seq.ndim > 1:
        raise ValueError("array() does not support multi-dimensional arrays")

    if not isinstance(dtype, DType):
        raise TypeError(f"array() expects a DType for the first argument but given a {type(dtype).__name__}")

    try:
        if dtype == char and isinstance(seq, str):
            # a Python string becomes a char array by mapping each character to its integer code point with the
            # builtin ord
            remap = ord

        if remap:
            if not callable(remap):
                raise ValueError("Not a callable")
            seq = list(map(remap, seq))

        if dtype == Instant:
            return _instant_array(seq)

        if isinstance(seq, np.ndarray) and dtype == bool_:
            # numpy booleans are converted to a Java byte array first and then translated on the Java side
            j_bytes = array(byte, seq.astype(dtype=np.int8))
            seq = _JPrimitiveArrayConversionUtility.translateArrayByteToBoolean(j_bytes)

        return jpy.array(dtype.j_type, seq)
    except Exception as err:
        raise DHError(err, f"failed to create a Java {dtype.j_name} array.") from err
def from_jtype(j_class: Any) -> Optional[DType]:
    """ Returns the DType registered for the Java class, creating a new DType for it when none is registered yet.

    Args:
        j_class (Any): the Java class; a falsy value yields None

    Returns:
        a DType, or None when j_class is falsy
    """
    if not j_class:
        return None

    j_name = j_class.getName()
    known = _j_name_type_map.get(j_name)
    if known:
        return known

    # Not registered yet: constructing the DType also adds it to the registry.
    return DType(j_name=j_name, j_type=j_class, np_type=np.object_)
def from_np_dtype(np_dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]) -> DType:
    """ Finds the DType that corresponds to a numpy dtype or to a Pandas extension dtype.

    Args:
        np_dtype (Union[np.dtype, ExtensionDtype]): the numpy dtype or Pandas extension dtype

    Returns:
        the matching DType, or PyObject when there is no match
    """
    if isinstance(np_dtype, pd.api.extensions.ExtensionDtype):
        # Pandas nullable numeric types such as pd.Float64Dtype/Int32Dtype/BooleanDtype expose the numpy dtype they
        # are backed by; extension dtypes without one are either the Pandas string dtype or unsupported.
        if not hasattr(np_dtype, "numpy_dtype"):
            return string if isinstance(np_dtype, pd.StringDtype) else PyObject
        np_dtype = np_dtype.numpy_dtype

    kind = np_dtype.kind
    if kind == 'U' or kind == 'S':
        return string
    if kind == 'M':
        return Instant

    # The first registered DType with an equal, non-object numpy type wins.
    matches = (
        candidate for candidate in _j_name_type_map.values()
        if np.dtype(candidate.np_type) == np_dtype and candidate.np_type != np.object_
    )
    return next(matches, PyObject)
# numpy dtype.char codes that _scalar() converts to a Python int / float
_NUMPY_INT_TYPE_CODES = {"b", "h", "H", "i", "l"}
_NUMPY_FLOATING_TYPE_CODES = {"f", "d"}


def _is_py_null(x: Any) -> bool:
    """Checks if the value is a Python null value, i.e. None or NaN, or Pandas.NA."""
    if x is None:
        return True

    try:
        return bool(pd.isna(x))
    except (TypeError, ValueError):
        # pd.isna() returned something with no single truth value (e.g. an array) or could not handle the input
        return False


def _scalar(x: Any, dtype: DType) -> Any:
    """Converts a Python value to a Java scalar value. It converts the numpy primitive types, string to
    their Python equivalents so that JPY can handle them. For datetime values, it converts them to Java Instant.
    Otherwise, it returns the value as is.

    Args:
        x (Any): the value to convert
        dtype (DType): the target data type, used to pick the Deephaven NULL constant for null values

    Returns:
        the converted value, or x itself when no conversion applies or the conversion fails
    """

    # NULL_BOOL will appear in Java as a byte value which causes a cast error. We just let JPY converts it to Java null
    # and the engine has casting logic to handle it.
    if (null_value := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)) and _is_py_null(x) and dtype not in (bool_, char):
        return null_value

    try:
        if hasattr(x, "dtype"):
            type_code = x.dtype.char
            if type_code == 'H':  # np.uint16 maps to Java char
                return Character(int(x))
            elif type_code in _NUMPY_INT_TYPE_CODES:
                return int(x)
            elif type_code in _NUMPY_FLOATING_TYPE_CODES:
                return float(x)
            elif type_code == '?':
                return bool(x)
            elif type_code == 'U':
                return str(x)
            elif type_code == 'O':
                return x
            elif type_code == 'M':
                from deephaven.time import to_j_instant
                return to_j_instant(x)
        elif isinstance(x, (datetime.datetime, pd.Timestamp)):
            from deephaven.time import to_j_instant
            return to_j_instant(x)
        return x
    except Exception:
        # Best effort: a value that cannot be converted is handed back unchanged. Only Exception is caught so that
        # KeyboardInterrupt and SystemExit still propagate.
        return x