Ultimate cheat sheet
Necessary data
The tables named below are used throughout this page and should be created first.
from deephaven import empty_table, time_table
import random
import string
letters_upp = string.ascii_uppercase
def get_random_string(length):
    letters = string.ascii_letters
    result_str = "".join(random.choice(letters) for i in range(length))
    return result_str

def rand_choice(array):
    return random.choice(array)
# Create static tables
static_source_1 = empty_table(26300).update(
[
"X = ii",
"Double1 = randomDouble(1, 100)",
"Int1 = randomInt(1, 100)",
"Timestamp = parseInstant(`2016-01-01T01:00 ET`) + i * HOUR",
"Date = Timestamp.toString()",
"String1 = (String)rand_choice(letters_upp)",
"String2 = (String)get_random_string(randomInt(1, 4))",
]
)
static_source_2 = empty_table(26300).update(
[
"X = ii",
"Double2 = randomDouble(1, 80)",
"Int2 = randomInt(1, 100)",
"Timestamp = parseInstant(`2019-01-01T01:00 ET`) + i * HOUR",
"String3 = (String)rand_choice(letters_upp)",
"String4 = (String)get_random_string(randomInt(1, 4))",
]
)
# Create a ticking table
ticking_source = time_table("PT1S").update(
[
"X = ii",
"Double3 = randomDouble(1, 100)",
"Int3 = randomInt(1, 100)",
"Date = Timestamp.toString()",
"String5 = (java.lang.String)rand_choice(letters_upp)",
"String6 = (java.lang.String)get_random_string(randomInt(1, 4))",
]
)
Create tables
Empty tables
from deephaven import empty_table
result = empty_table(5)
# Empty tables are often followed by an update to populate columns
result1 = result.update(formulas=["X = 5"])
- result
- result1
New tables
Columns are created using the following methods:
- bool_col
- byte_col
- char_col
- datetime_col
- double_col
- float_col
- int_col
- jobj_col
- long_col
- pyobj_col
- short_col
- string_col
from deephaven import new_table
from deephaven.column import string_col, int_col
result = new_table(
[
int_col("IntegerColumn", [1, 2, 3]),
string_col("Strings", ["These", "are", "Strings"]),
]
)
- result
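The remaining column methods follow the same pattern. A minimal sketch mixing in a few more types (bool_col, double_col, and datetime_col; the names and values here are purely illustrative):
from deephaven import new_table
from deephaven.column import bool_col, double_col, datetime_col
from deephaven.time import to_j_instant
# one row per index; all column value lists must have the same length
mixed_types = new_table(
    [
        bool_col("Booleans", [True, False, True]),
        double_col("Doubles", [0.1, 0.2, 0.3]),
        datetime_col("Timestamps", [to_j_instant("2017-01-03T01:00:00 ET")] * 3),
    ]
)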
Time tables
The following code makes a time_table that updates every second.
from deephaven import time_table
result = time_table("PT1S")
Input tables
Use an input_table when you would like to add or modify data in table cells directly.
from deephaven import input_table
from deephaven import dtypes as dht
my_column_defs = {"StringCol": dht.string, "DoubleCol": dht.double}
my_input_table = input_table(col_defs=my_column_defs, key_cols="StringCol")
- my_input_table
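Rows can also be appended programmatically. A minimal sketch, assuming the appended table's schema matches my_column_defs exactly:
from deephaven import new_table
from deephaven.column import string_col, double_col
# hypothetical rows to append; the schema must match the input table's column definitions
new_rows = new_table(
    [
        string_col("StringCol", ["Key1"]),
        double_col("DoubleCol", [1.5]),
    ]
)
my_input_table.add(new_rows)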
Ring tables
A ring_table contains the n most recent rows from a source table. If the source table is non-static, rows older than the most recent n disappear from the ring table as the source updates.
from deephaven import time_table, ring_table
tt = time_table("PT1S")
rt = ring_table(tt, 5)
- tt
- rt
Tree tables
A tree_table has collapsible subsections that can be expanded in the UI for more detail.
from deephaven.constants import NULL_INT
from deephaven import empty_table
source = empty_table(100).update_view(
["ID = i", "Parent = i == 0 ? NULL_INT : (int)(i / 4)"]
)
result = source.tree(id_col="ID", parent_col="Parent")
- source
- result
Rollup tables
A rollup table "rolls" several child specifications into one parent specification:
from deephaven import read_csv, agg
insurance = read_csv(
"https://media.githubusercontent.com/media/deephaven/examples/main/Insurance/csv/insurance.csv"
)
agg_list = [agg.avg(cols=["bmi", "expenses"])]
by_list = ["region", "age"]
test_rollup = insurance.rollup(aggs=None, by=by_list, include_constituents=True)
insurance_rollup = insurance.rollup(
aggs=agg_list, by=by_list, include_constituents=True
)
- insurance
- test_rollup
- insurance_rollup
Merge tables
To merge tables, the schemas must be identical: same column names, same column data types. This applies to both static and updating tables.
from deephaven import merge
copy1 = static_source_1
merge_same_static = merge([static_source_1, copy1])
copy_updating_1 = ticking_source
copy_updating_2 = ticking_source
merge_same_dynamic = merge([copy_updating_1, copy_updating_2])
# one can merge static and ticking tables
static_source_1v2 = static_source_1.view(["Date", "Timestamp", "SourceRowNum = X"])
ticking_source_v2 = ticking_source.view(["Date", "Timestamp", "SourceRowNum = X"])
merge_static_and_dynamic = merge([static_source_1v2, ticking_source_v2])
View table metadata
meta_table shows the column names, data types, partitions, and groups for a table. It is useful for making sure schemas match before merging.
static_1_meta = static_source_1.meta_table
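For example, to check whether two tables can be merged, compare their metadata side by side. A minimal sketch:
static_2_meta = static_source_2.meta_table
# reduce each metadata table to the columns that matter for a merge check
static_1_schema = static_1_meta.view(["Name", "DataType"])
static_2_schema = static_2_meta.view(["Name", "DataType"])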
Filter
Most queries benefit from starting with filters: less data generally means better performance.
For SQL developers: in Deephaven, joins are not a primary operation for filtering. Use where, where_in, and where_not_in.
Backticks (`) in query strings denote a string. Single quotes (') denote a literal value that gets parsed by the engine.
Date & Time examples
from deephaven.time import to_j_instant
todays_data_1 = ticking_source.where(
    ["Date = calendarDate()"]
)  # Date for this is a String
todays_data_2 = ticking_source.where(
    ["Date = today('Australia/Sydney')"]  # note single quotes denote a literal
)  # Sydney time zone
single_string_date = static_source_1.where(["Date = `2017-01-03`"]) # HEAVILY USED!
less_than_date = static_source_1.where(["Date < `2017-01-03`"]) # use >=, etc. as well
one_month_string = static_source_1.where(
["Date.startsWith(`2017-02`)"]
) # note backticks used to denote a string within a string
single_db_date = static_source_2.where(
["formatDate(Timestamp, timeZone(`ET`)) = `2020-03-01`"]
)
after_db_datetime = static_source_1.where(["Timestamp > '2017-01-05T10:30:00 ET'"])
just_biz_time = static_source_1.where(["isBusinessTime(Timestamp)"]) # HEAVILY USED!
just_august_datetime = static_source_2.where(
["monthOfYear(Timestamp, timeZone(`ET`)) = 8"]
)
just_10am_hour = static_source_1.where(
["hourOfDay(Timestamp, timeZone(`ET`), false) = 10"]
)
just_tues = static_source_2.where(
["dayOfWeek(Timestamp, timeZone(`ET`)).getValue() = 2"]
) # Tuesdays
time1 = to_j_instant("2017-08-21T09:45:00 ET")
time2 = to_j_instant("2017-08-21T10:10:00 ET")
trades = static_source_1.where(["inRange(Timestamp, time1, time2)"])
# Experienced Python users might prefer f-strings for building query strings instead
time1 = "2017-08-21T09:45:00 ET"
time2 = "2017-08-21T12:10:00 ET"
trades = static_source_1.where([f"inRange(Timestamp, '{time1}', '{time2}')"])
String examples
one_string_match = static_source_1.where(["String1 = `A`"]) # match filter
string_set_match = static_source_1.where(["String1 in `A`, `M`, `G`"])
case_insensitive = static_source_1.where(["String1 icase in `a`, `m`, `g`"])
not_in_example = static_source_1.where(["String1 not in `A`, `M`, `G`"]) # see "not"
contains_example = static_source_1.where(["String2.contains(`P`)"])
not_contains_example = static_source_1.where(["!String2.contains(`P`)"])
starts_with_example = static_source_1.where(["String2.startsWith(`A`)"])
ends_with_example = static_source_1.where(["String2.endsWith(`M`)"])
Number examples
Using i and ii is not a good idea in non-static use cases, as calculations based on these variables aren't stable.
equals_example = static_source_2.where(["round(Double2) = 44"])
less_than_example = static_source_2.where(["Double2 < 8.42"])
some_manipulation = static_source_2.where(["(Double2 - Int2) / Int2 > 0.05"])
mod_example_1 = static_source_2.where(["i % 10 = 0"]) # every 10th row
mod_example_2 = static_source_2.where(["String4.length() % 2 = 0"])
# even character-count strings
Multiple filters
conjunctive_comma = static_source_1.where(["Date = `2017-08-23`", "String1 = `A`"])
# HEAVILY USED!
conjunctive_ampersand = static_source_1.where(["Date = `2017-08-23` && String1 = `A`"])
disjunctive_same_col = static_source_1.where(
["Date = `2017-08-23`|| Date = `2017-08-25`"]
)
disjunctive_diff_col = static_source_1.where(["Date = `2017-08-23` || String1 = `A`"])
range_lame_way = static_source_1.where(["Date >= `2017-08-21`", "Date <= `2017-08-23`"])
in_range_best = static_source_1.where(["inRange(Date, `2017-08-21`, `2017-08-23`)"])
# HEAVILY USED!
in_range_best_string = static_source_1.where(
"inRange(String2.substring(0,1), `A`, `D`)"
)
# starts with letters A - D
where
For SQL developers: in Deephaven, filter your data before joining using where operations. Deephaven is optimized for filtering rather than matching.
from deephaven import new_table
from deephaven.column import string_col, int_col, double_col
from deephaven.constants import NULL_INT
source = new_table(
[
string_col("Letter", ["A", "C", "F", "B", "E", "D", "A"]),
int_col("Number", [NULL_INT, 2, 1, NULL_INT, 4, 5, 3]),
string_col(
"Color", ["red", "blue", "orange", "purple", "yellow", "pink", "blue"]
),
int_col("Code", [12, 14, 11, NULL_INT, 16, 14, NULL_INT]),
]
)
result_single_filter = source.where(filters=["Color = `blue`"])
result_or = source.where_one_of(
filters=["Color = `blue`", "Number > 2"]
) # OR operation - result will have _either_ criteria
result_and = source.where(
filters=["Color = `blue`", "Number > 2"]
) # AND operation - result will have _both_ criteria
- source
- result_single_filter
- result_or
- result_and
where_in/where_not_in
# run these two queries together
string_set_driver = static_source_1.rename_columns(cols=["String4 = String2"]).head(10)
where_in_example_1 = static_source_2.where_in(
string_set_driver, ["String4"]
) # HEAVILY USED!
where_in_example_2 = static_source_2.where_in(static_source_1, ["String4 = String2"])
# String4 in static_source_2 within the set of String2 values of static_source_1
where_not_in_example = static_source_2.where_not_in(
static_source_1, ["String4 = String2"]
)
# see the "not"
Nulls and NaNs
null_example = static_source_1.where(["isNull(Int1)"]) # all data types supported
not_null_example = static_source_1.where(["!isNull(Int1)"])
nan_example = static_source_1.where(["isNaN(Double1)"])
not_nan_example = static_source_1.where(["!isNaN(Double1)"])
Head and Tail
Used to reduce the number of rows:
first_10k_rows = static_source_2.head(10_000)
ticking_last_rows = ticking_source.tail(100)
first_15_percent = ticking_source.head_pct(0.15)  # the first 15% of rows
last_15_percent = static_source_2.tail_pct(0.15) # the last 15% of rows
first_10_by_key = static_source_1.head_by(
10, ["String1"]
) # first 10 rows for each String1-key
last_10_by_key = static_source_1.tail_by(
10, ["String1"]
) # last 10 rows for each String1-key
Sort
Single-direction sorting:
# works for all data types
sort_time_v1 = static_source_1.sort(order_by=["Timestamp"]) # sort ascending
sort_doubles = static_source_1.sort("Double1")
sort_numbers = static_source_1.sort_descending(["Int1"]) # sort descending
sort_strings = static_source_1.sort_descending(["String2"])
Sort on multiple columns or in multiple directions:
from deephaven import SortDirection
sort_multi_desc = static_source_1.sort_descending(
    order_by=["String1", "Timestamp"]
)  # multiple columns, descending
sort_multi = static_source_1.sort(["Int1", "String1"])
sort_multiple_col = [SortDirection.ASCENDING, SortDirection.DESCENDING]
multi_sort = static_source_1.sort(
["Int1", "Double1"], sort_multiple_col
) # Int ascending then Double descending
Reverse the order of rows in a table. Reversing is faster than sorting and is often used in UIs to show newly appended rows at the top of a table.
reverse_table = static_source_1.reverse()
reverse_updating = ticking_source.reverse()
Select and create new columns
Option 1: Choose and add new columns - Calculate and write to memory
Use select and update when data is expensive to calculate or accessed frequently. Results are saved in RAM for faster access, but they take more memory.
select_columns = static_source_2.select(["Timestamp", "Int2", "Double2", "String3"])
# constrain to only those 4 columns, write to memory
select_add_col = static_source_2.select(
    ["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"]
)
# constrain to those columns and add a calculated column
select_and_update_col = static_source_2.select(
    ["Timestamp", "Int2", "Double2", "String3"]
).update("Difference = Double2 - Int2")
# logically equivalent to the previous example
Option 2: Choose and add new columns - Reference a formula and calculate on the fly
Use view and update_view when the formula is quick or only a portion of the data is used at a time. This minimizes the RAM used.
view_columns = static_source_2.view(["Timestamp", "Int2", "Double2", "String3"])
# similar to select(), though uses on-demand formula
view_add_col = static_source_2.view(
["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"]
)
# view set and add a column, though with an on-demand formula
view_and_update_view_col = static_source_2.view(
["Timestamp", "Int2", "Double2", "String3"]
).update_view(["Difference = Double2 - Int2"])
# logically equivalent to previous example
Option 3: Add new columns - Reference a formula and calculate on the fly
Use lazy_update when there are relatively few unique values. On-demand formula results are stored in a cache and re-used.
lazy_update_example = static_source_2.lazy_update(
["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"]
)
Getting the row number
get_row_num = static_source_2.update_view(["RowNum_int = i", "RowNum_long = ii"])
Do math
# use .update(), not .update_view(), with random numbers so values are computed once
simple_math = static_source_2.update(
[
"RandomNumber = Math.random()",
"RandomInt100 = new Random().nextInt(100)",
"Arithmetic = Int2 * Double2",
"SigNum = Math.signum(RandomNumber - 0.5)",
"Signed = SigNum * Arithmetic",
"AbsDlrVolume = abs(Signed)",
"Power = Math.pow(i, 2)",
"Exp = Double2 * 1E2",
"Log = Double2 * log(2.0)",
"Round = round(Double2)",
"Floor = floor(Double2)",
"Mod = RandomInt100 % 5",
"CastInt = (int)AbsDlrVolume",
"CastLong = (long)AbsDlrVolume",
]
)
Handle arrays
array_examples = static_source_2.update_view(
[
"RowNumber = i",
"PrevRowReference = Double2_[i-1]",
"MultipleRowRef = Double2_[i-2] - Double2_[i-1]",
"SubArray = Double2_.subVector(i-2, i+1)",
"SumSubArray = sum(Double2_.subVector(i-2, i+1))",
"ArraySize = SubArray.size()",
"ArrayMin = min(SubArray)",
"ArrayMax = max(SubArray)",
"ArrayMedian = median(SubArray)",
"ArrayAvg = avg(SubArray)",
"ArrayStd = std(SubArray)",
"ArrayVar = var(SubArray)",
"ArrayLast = last(SubArray)",
"ArrayIndex = SubArray[1]",
"InArray = in(45.71, SubArray)",
]
)
Create a slice or sub-vector
slice_and_rolling = static_source_2.update(
formulas=[
"SubVector = Int2_.subVector(i-2,i+1)",
"RollingMean = avg(SubVector)",
"RollingMedian =median(SubVector)",
"RollingSum = sum(SubVector)",
]
)
Manipulate time and calendars
time_stuff = static_source_2.view(
[
"Timestamp",
"CurrentTime = now()",
"CurrentDateDefault = calendarDate()",
"CurrentDateLon = today(timeZone(`Europe/London`))",
"IsAfter = CurrentTime > Timestamp",
"IsBizDay = isBusinessDay(CurrentDateLon)",
"StringDT = formatDateTime(Timestamp, timeZone(`ET`))",
"StringDate = formatDate(Timestamp, timeZone(`ET`))",
"StringToTime = parseInstant(StringDate + `T12:00 ET`)",
"AddTime = Timestamp + 'PT05:11:04.332'",
"PlusHour = Timestamp + HOUR",
"LessTenMins = Timestamp - 10 * MINUTE",
"DiffYear = diffYears365(Timestamp, CurrentTime)",
"DiffDay = diffDays(Timestamp, CurrentTime)",
"DiffNanos = PlusHour - Timestamp",
"DayWeek = dayOfWeek(Timestamp, timeZone(`ET`))",
"HourDay = hourOfDay(Timestamp, timeZone(`ET`), false)",
"DateAtMidnight = atMidnight(Timestamp, timeZone(`ET`))",
"IsBizDayString = isBusinessDay(StringDate)",
"IsBizDayDatetime = isBusinessDay(Timestamp)",
"IsBizTime = isBusinessDay(Timestamp)",
"FracBizDayDone = fractionBusinessDayComplete(CurrentTime)",
"FracBizDayOpen = fractionBusinessDayRemaining(CurrentTime)",
"NextBizDayString = plusBusinessDays(StringDate, 1).toString()",
"NextBizDayDatetime = plusBusinessDays(Timestamp, 1)",
"PlusFiveBizDayCurrent = futureBusinessDate(5)",
"PlusFBizDayString = plusBusinessDays(StringDate, 5).toString()",
"PlusFBizDayDatetime = plusBusinessDays(Timestamp, 5)",
"PrevBizDayCurrent = pastBusinessDate(1)",
"PrevBizDayString = minusBusinessDays(StringDate, 1).toString()",
"PrevBizDayDatetime = minusBusinessDays(Timestamp, 1)",
"MinusFiveBizDayCurrent = pastBusinessDate(5)",
"MinusFiveBizDayString = minusBusinessDays(StringDate, 5).toString()",
"MinusFiveBizDayDatetime = minusBusinessDays(Timestamp, 5)",
"BizDayArray = businessDates(Timestamp, CurrentTime)",
"NonBizDayArray = nonBusinessDates(Timestamp, CurrentTime)",
"DiffBizDaysCount = numberBusinessDates(Timestamp, CurrentTime)",
"DiffBizDaysExact = diffBusinessDays(Timestamp, CurrentTime)",
"DiffBizDaysString = numberBusinessDates(MinusFiveBizDayString, StringDate)",
"StandardBizDayNanos = standardBusinessNanos()",
"DiffBizSecs = diffBusinessNanos(CurrentTime, CurrentTime + 5 * DAY) / 1E9",
"LastBizOfMonth = isLastBusinessDayOfMonth(StringDate)",
"LastBizOfWeek = isLastBusinessDayOfWeek(CurrentTime)",
]
)
Bin data
Binning is a great pre-step for aggregation, supporting down-sampling and other profiling of data.
bins = ticking_source.update_view(
[
"PriceBin = upperBin(Double3, 5)",
"SizeBin = lowerBin(Int3, 10)",
"TimeBin = upperBin(Timestamp, 200)",
]
).reverse()
agg_bin = bins.view(["TimeBin", "Total = Int3 * Double3"]).sum_by(["TimeBin"])
Manipulate strings
You can use any of the standard Java String methods in your queries, as the following examples show:
string_stuff = static_source_2.view(
[
"StringDate = formatDate(Timestamp, timeZone(`ET`))",
"String4",
"Double2",
"NewString = `new_string_example_`",
"ConcatV1 = NewString + String4",
"ConcatV2 = NewString + `Note_backticks!!`",
"ConcatV3 = NewString.concat(String4)",
"ConcatV4 = NewString.concat(`Note_backticks!!`)",
"StartBool = String4.startsWith(`M`)",
"NoEndBool = !String4.endsWith(`G`)",
"ContainedBool = String4.contains(`AA`)",
"NoContainBool = !String4.contains(`AA`)",
"FirstChar = String4.substring(0,1)",
"LengthString = String4.length()",
"CharIndexPos = ConcatV1.charAt(19)",
"SubstringEx = ConcatV1.substring(11,20)",
"FindIt = NewString.indexOf(`_`)",
"FindItMiddle = NewString.indexOf(`_`, FindIt + 1)",
"FindLastOf = NewString.lastIndexOf(`_`)",
"SplitToArrays = NewString.split(`_`)",
"SplitWithMax = NewString.split(`_`, 2)",
"SplitIndexPos = NewString.split(`_`)[1]",
"LowerCase = String4.toLowerCase()",
"UpperCase = NewString.toUpperCase()",
"DoubleToStringv1 = Double2 + ``",
"DoubleToStringv2 = String.valueOf(Double2)",
"DoubleToStringv3 = Double.toString(Double2)",
"StringToDoublev1 = Double.valueOf(DoubleToStringv1)",
]
)
from deephaven import new_table
from deephaven.column import string_col
source = new_table(
[
string_col("Letter", ["A", "a", "B", "b", "C", "c", "AA", "Aa", "bB", "C"]),
]
)
one_str_match = source.where(filters=["Letter = `A`"]) # match filter
str_set_match = source.where(filters=["Letter in `A`, `B`, `C`"])
case_insensitive = source.where(filters=["Letter icase in `a`, `B`, `c`"])
not_in_example = source.where(filters=["Letter not in `A`, `B`, `C`"]) # see "not"
contains_example = source.where(filters=["Letter.contains(`A`)"])
not_contains_example = source.where(filters=["!Letter.contains(`AA`)"])
starts_with_example = source.where(filters=["Letter.startsWith(`A`)"])
ends_with_example = source.where(filters=["Letter.endsWith(`C`)"])
- source
- one_str_match
- str_set_match
- case_insensitive
- not_in_example
- contains_example
- not_contains_example
- starts_with_example
- ends_with_example
Use Ternaries (If-Thens)
if_then_example_1 = static_source_2.update_view(
["SimpleTernary = Double2 < 50 ? `smaller` : `bigger`"]
)
if_then_example_2 = static_source_2.update_view(
["TwoLayers = Int2 <= 20 ? `small` : Int2 < 50 ? `medium` : `large`"]
)
Create and use custom variables
from deephaven import empty_table
a = 7
result = empty_table(5).update(formulas=["Y = a"])
- result
Create and use a custom function
See our guides for more details.
def mult(a, b):
    return a * b
py_func_example = static_source_2.update(
["M_object = mult(Double2,Int2)", "M_double = (double)mult(Double2,Int2)"]
)
Type casting
(type) casting is used to cast from one numeric primitive type handled by Java to another.
- byte
- short
- int
- long
- float
- double
This is useful when working with operations that require more or less precision than the pre-cast data type. See casting.
Cast numeric types
from deephaven import new_table
from deephaven.column import double_col
numbers_max = new_table(
[
double_col(
"MaxNumbers",
[
(2 - 1 / (2**52)) * (2**1023),
(2 - 1 / (2**23)) * (2**127),
(2**63) - 1,
(2**31) - 1,
(2**15) - 1,
(2**7) - 1,
],
)
]
).view(
formulas=[
"DoubleMax = (double)MaxNumbers",
"FloatMax = (float)MaxNumbers",
"LongMax = (long)MaxNumbers",
"IntMax = (int)MaxNumbers",
"ShortMax = (short)MaxNumbers",
"ByteMax = (byte)MaxNumbers",
]
)
numbers_min = new_table(
[
double_col(
"MinNumbers",
[
1 / (2**1074),
1 / (2**149),
-(2**63) + 513,
-(2**31) + 2,
-1 * (2**15) + 1,
-(2**7) + 1,
],
)
]
).view(
formulas=[
"DoubleMin = (double)MinNumbers",
"FloatMin = (float)MinNumbers",
"LongMin = (long)MinNumbers",
"IntMin = (int)MinNumbers",
"ShortMin = (short)MinNumbers ",
"ByteMin = (byte)MinNumbers ",
]
)
numbers_min_meta = numbers_min.meta_table.view(formulas=["Name", "DataType"])
numbers_max_meta = numbers_max.meta_table.view(formulas=["Name", "DataType"])
- numbers_max
- numbers_min
- numbers_min_meta
- numbers_max_meta
Casting strings
from deephaven import empty_table
from deephaven import agg
colors = ["Red", "Blue", "Green"]
formulas = [
"X = 0.1 * i",
"Y1 = Math.pow(X, 2)",
"Y2 = Math.sin(X)",
"Y3 = Math.cos(X)",
]
grouping_cols = ["Letter = (i % 2 == 0) ? `A` : `B`", "Color = (String)colors[i % 3]"]
source = empty_table(40).update(formulas + grouping_cols)
myagg = [
agg.formula(
formula="avg(k)",
formula_param="k",
cols=[f"AvgY{idx} = Y{idx}" for idx in range(1, 4)],
)
]
result = source.agg_by(aggs=myagg, by=["Letter", "Color"])
- source
- result
Manipulate columns
unique_values = static_source_2.select_distinct("String4") # show unique set
# works on all data types - careful with doubles, longs
unique_values_combo = static_source_2.select_distinct(["Timestamp", "String4"])
# unique combinations of Time with String4
rename_stuff = static_source_2.rename_columns(["Symbol = String4", "Price = Double2"])
drop_column = static_source_2.drop_columns(["X"]) # same for drop one or many
drop_columns = static_source_2.drop_columns(["X", "Int2", "String3"])
put_cols_at_start = static_source_2.move_columns_up(
["String4", "Int2"]
) # Makes 1st col(s)
put_cols_wherever = static_source_2.move_columns(1, ["String4", "Int2"])
# makes String4 the 2nd and Int2 the 3rd column
Group and aggregate
See How to group and ungroup data for more details.
Simple grouping
group_to_arrays = static_source_1.group_by(["String2"])
# one row per key (i.e. String2), all other columns are arrays
group_multiple_keys = static_source_1.group_by(["String1", "Date"])
# one row for each key-combination (i.e. String1-Date pairs)
- group_to_arrays
- group_multiple_keys
Ungrouping
Expands each row so that each value in any array inside that row becomes a new row.
# Run these lines together
agg_by_key = static_source_1.group_by(["Date"])
# one row per Date, other fields are arrays from static_source_1
ungroup_that_output = agg_by_key.ungroup() # no arguments usually
# each array value becomes its own row
# in this case, turns agg_by_key back into static_source_1
Aggregations
count_of_entire_table = static_source_1.count_by(
    "Count"
)  # a single argument returns the total count; you name the output column
count_of_group = static_source_1.count_by("N", ["String1"])  # N is a naming choice
first_of_group = static_source_1.first_by(["String1"])
last_of_group = static_source_1.last_by(["String1"])
sum_of_group = static_source_1.view(["String1", "Double1"]).sum_by(["String1"])
# non-key field(s) must be numerical
avg_of_group = static_source_1.view(["String1", "Double1", "Int1"]).avg_by(["String1"])
std_of_group = static_source_1.view(["String1", "Double1", "Int1"]).std_by(["String1"])
var_of_group = static_source_1.view(["String1", "Double1", "Int1"]).var_by(["String1"])
median_of_group = static_source_1.view(["String1", "Double1", "Int1"]).median_by(
["String1"]
)
min_of_group = static_source_1.view(["String1", "Double1", "Int1"]).min_by(["String1"])
max_of_group = static_source_1.view(["String1", "Double1", "Int1"]).max_by(["String1"])
# combine aggregations in a single method (using the same key-grouping)
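For example, a minimal sketch using agg_by to compute several aggregations over the same String1 key in one pass:
from deephaven import agg
combined_aggs = static_source_1.agg_by(
    aggs=[
        agg.count_("N"),
        agg.avg(cols=["AvgDouble = Double1"]),
        agg.max_(cols=["MaxInt = Int1"]),
    ],
    by=["String1"],
)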
Join data
See Choose the right join method for more details.
For SQL developers: in Deephaven, joins are normally used to enrich a data set, not to filter. Use where to filter your data instead of using a join.
Joins that get used a lot
Natural Join
- Returns all the rows of the left table, along with up to one matching row from the right table.
- If there is no match in the right table for a given row, nulls will appear for that row in the columns from the right table.
- If there are multiple matches in the right table for a given row, the query will fail.
result = left_table.natural_join(right_table, columns_to_match, columns_to_add)
The right table of the join needs to have only one match based on the key(s).
# run these together
last_by_key_table = (
static_source_1.view(["String1", "Int1", "Timestamp"])
.last_by(["String1"])
.rename_columns(["Renamed = Int1", "RenamedTime = Timestamp"])
)
# creates 3-column table of String1 + last record of the other 2 cols
join_with_last_price = static_source_1.natural_join(last_by_key_table, ["String1"])
# arguments are (rightTableOfJoin, "JoinKey(s)")
# will have same number of rows as static_source_1 (left table)
# brings all non-key columns from last_price_table to static_source_1
# HEAVILY USED!
specify_cols_from_r = static_source_1.natural_join(
last_by_key_table, ["String1"], ["Renamed"]
)
# nearly identical to the join_with_last_price example
# args are (rightTableOfJoin, "JoinKey(s)", "Cols from R table")
rename_cols_on_join = static_source_1.natural_join(
last_by_key_table, ["String1"], ["RenamedAgain = Renamed, RenamedTime"]
)
# nearly identical to specify_cols_from_r example
# can rename column on the join
# sometimes this is necessary if there would be
# two columns with the same name
# Note the placement of quotes
keys_of_diff_names = static_source_2.view(["String3", "Double2"]).natural_join(
last_by_key_table, ["String3 = String1"]
)
# note that String3 in the left table is mapped as the key to String1 in the right
# this result has many null records, as there are many String3 values
# without a matching String1
Multiple Keys
# run these together
last_price_two_keys = static_source_1.view(["Date", "String1", "Int1"]).last_by(
["Date", "String1"]
)
# creates a 3-column table with the last Int1 for each Date-String1 pair
nat_join_two_keys = static_source_1.natural_join(
    last_price_two_keys, ["Date", "String1"], ["LastInt = Int1"]
)
# arguments are (rightTableOfJoin, ["JoinKey1", "JoinKey2"], ["Col(s) to add"])
# the joined column is renamed to avoid clashing with the existing Int1
AJ (As-Of Join)
As-of joins are time-series joins. They can also be used with other ordered numerics as the join key(s). It is often wise to make sure the right table is sorted by the key. aj is designed to find "the exact match" of the key or "the record just before". For timestamp aj keys, this means "that time or the record just before".
result = left_table.aj(right_table, columns_to_match, columns_to_add)
Reverse as-of joins (raj) find "the exact match" of the key or "the record just after". For timestamp reverse-aj keys, this means "that time or the record just after".
result = left_table.raj(right_table, columns_to_match, columns_to_add)
from deephaven import time_table
ticking_source_2 = time_table("PT10.5S").update(
[
"X = ii",
"Double4 = randomDouble(1, 100)",
"Int4 = randomInt(1, 100)",
"Date = Timestamp.toString()",
"String7 = (java.lang.String)rand_choice(letters_upp)",
"String8 = (java.lang.String)get_random_string(randomInt(1, 4))",
]
)
aj_join = ticking_source_2.aj(
table=ticking_source,
on=["Timestamp"],
joins=["String5", "String6", "JoinTime = Timestamp"],
)
raj_join = ticking_source_2.raj(
table=ticking_source,
on=["Timestamp"],
joins=["String5", "String6", "JoinTime = Timestamp"],
)
raj_join_two_keys = ticking_source_2.raj(
table=ticking_source,
on=["String7 = String5", "Timestamp"],
joins=["String6", "JoinTime = Timestamp"],
)
Less common joins
join returns all rows that match between the left and right tables, potentially with duplicates. Similar to a SQL inner join.
- Returns only matching rows.
- Multiple matches will have duplicate values, which can result in a long table.
exact_join returns all rows of left_table.
- If there are no matching keys, the query will fail.
- Multiple matches will fail.
# exact joins require precisely one match in the right table for each key
last_string_1aug23 = static_source_1.where(["Date = `2017-08-23`"]).last_by(["String1"])
last_string_1aug24 = (
static_source_1.where(["Date = `2017-08-24`"])
.last_by(["String1"])
.where_in(last_string_1aug23, ["String1"])
)
# filters for String1 in last_string_1aug23
exact_join_ex = last_string_1aug24.exact_join(
last_string_1aug23, ["String1"], ["RenamedDouble = Double1"]
)
# join will find all matches from left in the right table
# if a match is not found, that row is omitted from the result
# if the Right-table has multiple matches, then multiple rows are included in result
last_ss_1 = static_source_1.last_by(["String1"]).view(["String1", "Double1"])
last_ss_2 = static_source_2.last_by(["String3"]).view(
["String1 = String3", "RenamedDouble = Double2"]
)
first_ss_2 = static_source_2.first_by(["String3"]).view(
["String1 = String3", "RenamedDouble = Double2"]
)
from deephaven import merge
merge_first_and_last_ss_2 = merge([first_ss_2, last_ss_2])
# the table has two rows per String1
join_example = last_ss_1.join(merge_first_and_last_ss_2, ["String1"], ["RenamedDouble"])
# note that the resultant set has two rows per String1
Use columns as arrays and cells as variables
get_a_column = static_source_1.j_object.getColumnSource("String1")
print(get_a_column)
get_a_cell = get_a_column.get(0)
print(get_a_cell)
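To pull column data into Python as a NumPy array rather than working with the Java column source, deephaven.numpy offers a conversion. A minimal sketch:
import deephaven.numpy as dhnp
# returns a 2-D NumPy array with one column per requested table column
string1_array = dhnp.to_numpy(static_source_1.view(["String1"]))
print(string1_array[:5])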
Read and write files
It is very easy to import files and to export any table to a CSV via the UI.
Importing CSVs and Parquet files from a server-side directory should be done via scripts like those below - these are NOT working examples. The code provides the syntax, but the file paths are placeholders.
# CSV
from deephaven import read_csv
# import CSV from a URL
csv_imported = read_csv(
"https://media.githubusercontent.com/media/deephaven/examples/main/Iris/csv/iris.csv"
)
# import headerless CSV
csv_headerless = read_csv(
"https://media.githubusercontent.com/media/deephaven/examples/main/DeNiro/csv/deniro_headerless.csv",
headless=True,
)
# import headerless CSV then add headings
import deephaven.dtypes as dht
header = {"Year": dht.int64, "Rotten Tom Score": dht.int64, "Title": dht.string}
csv_manual_header = read_csv(
"https://media.githubusercontent.com/media/deephaven/examples/main/DeNiro/csv/deniro_headerless.csv",
header=header,
headless=True,
)
# !!! WRITE CSV !!!
# export / write CSV to local /data/ directory
from deephaven import write_csv
write_csv(csv_manual_header, "/data/outputFile.csv")
# import the local CSV you just exported
csv_local_read = read_csv("/data/outputFile.csv")
# full syntax
# csv_full_monty = read_csv(path: str,
# header: Dict[str, DataType]=None,
# headless: bool=False,
# delimiter: str=",",
# quote: str="\"",
# ignore_surrounding_spaces: bool = True,
# trim: bool = False,
# charset: str = "utf-8")
# PARQUET
# write Parquet
from deephaven.parquet import write
table_sample = static_source_1.head(100)
write(table_sample, "/data/output_generic.parquet")
write(table_sample, "/data/output_gzip.parquet", compression_codec_name="GZIP")
# other compression codecs are available
# read Parquet
from deephaven.parquet import read
parquet_read = read("/data/output_generic.parquet")
parquet_ok_if_zipped = read("/data/output_gzip.parquet")
Do Cum-Sum and Rolling Average
def has_others(array, target):
    for x in array:
        if x != target:
            return True
    return False
###
make_arrays = static_source_2.sort("String3").update_view(
[
"String3Array10 = String3_.subVector(i-5, i-1)",
"Int2Array10 = Int2_.subVector(i-5, i-1)",
"Double2Array10 = Double2_.subVector(i-5, i-1)",
]
)
###
cum_and_roll_10 = make_arrays.update(
[
"ArrayHasOtherString3s = \
has_others.call(String3Array10.toArray(), String3)",
"CumSum10 = ArrayHasOtherString3s = false ? \
(int)sum(Double2Array10) : NULL_INT",
"RollAvg10 = ArrayHasOtherString3s = false ? \
avg(Int2Array10) : NULL_DOUBLE",
]
)
Another example of creating a rolling sum
from deephaven.updateby import rolling_sum_tick
rolling_sum_op = rolling_sum_tick(
cols=["RollingSumInt1 = Int1"], rev_ticks=20, fwd_ticks=0
)
sums = trades.update_by(ops=[rolling_sum_op], by=["String1"])
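update_by also provides cumulative operations. A minimal sketch of a cumulative sum computed per String1 key:
from deephaven.updateby import cum_sum
# running total of Int1 within each String1 group
cumulative_sums = static_source_1.update_by(
    ops=[cum_sum(cols=["CumInt1 = Int1"])], by=["String1"]
)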
Set up an EMA
from deephaven.experimental.ema import ByEmaSimple
ema30sec = ByEmaSimple("BD_SKIP", "BD_SKIP", "TIME", 30, "SECONDS", type="LEVEL")
ema_data = ticking_source.view(
formulas=["EMA_data = ema30sec.update(Timestamp, Int3)"]
).last_by()
ema_data_grouped = (
ticking_source.view(
formulas=["String5", "EMA_data = ema30sec.update(Timestamp, Int3, String5)"]
)
.last_by(by=["String5"])
.sort(order_by=["String5"])
)
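ByEmaSimple lives in an experimental package. In recent versions, update_by offers a time-decay EMA that covers similar ground; a minimal sketch, assuming a 30-second decay constant:
from deephaven.updateby import ema_time
# EMA of Int3 with a 30-second decay, computed per String5 key
ema_via_update_by = ticking_source.update_by(
    ops=[ema_time(ts_col="Timestamp", decay_time="PT30S", cols=["EmaInt3 = Int3"])],
    by=["String5"],
)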
Use pandas
import deephaven.pandas as dhpd
pandas_df = dhpd.to_pandas(static_source_1)
# do Pandas stuff to prove it is a Pandas Dataframe
print(type(pandas_df), pandas_df.dtypes, pandas_df.describe())
# Pandas to dh table
dh_table_again = dhpd.to_table(pandas_df)
Format tables
Datetime formatting
time_formatting = (
static_source_1.view(
[
"Time1 = Timestamp",
"Time2 = Timestamp",
"Time3 = Timestamp",
"Time4 = Timestamp",
"Time5 = Timestamp",
"Time6 = Timestamp",
]
)
.format_columns(
[
"Time1=Date(`yyyy_MM_dd'T'HH-mm-ss.S z`)",
"Time2 = Date(`yyyy-MM-dd'T'HH:mm t`)",
"Time3 = Date(`HH:mm:ss`)",
"Time4 = Date(`HH:mm:ss.SSSSSSSSS`)",
"Time5 = Date(`EEE dd MMM yy HH:MM:ss`)",
"Time6 = Date(`yyyy-MM-dd`)",
]
)
.update_view(["Time7_string = formatDate(Time6, timeZone(`ET`))"])
)
Number formatting
number_formatting = static_source_1.view(
[
"Double1",
"BigNum1 = Double1 * 1000",
"BigNum2 = Double1 * 1000",
"BigNum3 = Double1 * 1000",
"Price1 = Double1",
"Price2 = Double1",
"Price3 = Double1",
"SmallNum1 = Double1 / 1000",
"SmallNum2 = Double1 / 1000",
"SmallNum3 = Double1 / 1000",
"TinyNum = Double1 / 1_000_000_000",
]
).format_columns(
[
"BigNum1 = Decimal(`###,#00`)",
"BigNum2 = Decimal(`##0.00`)",
"BigNum3 = Decimal(`0.####E0`)",
"Price1 = Decimal(`###,###.##`)",
"Price2 = Decimal(`$###,##0.00`)",
"SmallNum1 = Decimal(`##0.000000`)",
"SmallNum2 = Decimal(`##0.00%`)",
"SmallNum3 = Decimal(`##0%`)",
"TinyNum = Decimal(`0.00E0`)",
]
)
Color formatting
# set a background color for an entire table
background_format = static_source_1.format_columns("* = bgfga(colorRGB(0,0,128))")
# set a color column by column
just_colors = static_source_1.format_columns(
[
"X = `#90F060`",
"Double1 = LIGHTBLUE",
"Int1 = colorRGB(247,204,0)",
"Timestamp = colorRGB(247,204,204)",
"Date = bgfg(colorRGB(57,43,128),colorRGB(243,247,122))",
"String1 = bgfga(MEDIUMVIOLETRED)",
"String2 = colorHSL(0, 24, 36)",
]
)
# column formatting
column_colors = (
static_source_1.update_view(["RowMod10 = i % 10"])
.format_column_where("String1", "Int1 > 35", "DEEP_RED")
.format_column_where("Double1", "Double1 > Double1_[i-1]", "LIMEGREEN")
.format_columns(
[
"Int1 = (X % 5 !=0) ? ORANGE : BLACK",
"Date = (String1 = `A`) ? colorRGB(253, 31, 203) : colorRGB(171, 254, 42)",
"RowMod10 = heatmap(RowMod10, 1.0, 10.0, BRIGHT_GREEN, BRIGHT_RED)",
]
)
)
# row formatting
row_colors = static_source_1.format_row_where("Int1 > 45", "PALE_BLUE")
# one can format_columns() for numbers and colors together
number_and_color = static_source_1.format_columns(
[
"Double1 = Decimal(`##0.00`)",
"Double1 = (Double1 > Double1_[i-1]) ? FUCHSIA : GREY",
]
)
You will find an itemization of stock colors in the docs.
Plot programmatically
Substantial documentation about plotting exists. The following examples show the basics; styling and labeling are omitted.
Time series plots
from deephaven.plot.figure import Figure
figure = Figure()
time_series = figure.plot_xy(
series_name="Time_series", t=static_source_1.head(100), x="Timestamp", y="Int1"
).show()
time_series_2axis = (
figure.plot_xy(
series_name="Int1",
t=static_source_1.where(["X < 100", "Int1 < 70 "]),
x="Timestamp",
y="Int1",
)
.x_twin()
.plot_xy(
series_name="Double1",
t=static_source_1.where(["X < 100", "Double1 > 30"]),
x="Timestamp",
y="Double1",
)
.show()
)
time_series_3axis = (
figure.plot_xy(
series_name="Int1", t=static_source_1.where(["X < 25"]), x="Timestamp", y="Int1"
)
.x_twin()
.plot_xy(
series_name="Double1",
t=static_source_1.where(["X < 25"]),
x="Timestamp",
y="Double1",
)
.x_twin()
.plot_xy(
series_name="DoubleN",
t=static_source_1.where(["X < 25"]).update(["DoubleN = pow(Double1,2)"]),
x="Timestamp",
y="DoubleN",
)
.show()
)
Bar chart
from deephaven.plot import Figure, PlotStyle
figure = Figure()
bar_chart = (
figure.axes(plot_style=PlotStyle.BAR)
.plot_xy(
series_name="Int1", t=static_source_1.where("X < 41"), x="Timestamp", y="Int1"
)
.show()
)
Plot by some key
from deephaven.plot import Figure
table_3_keys = (
static_source_1.head(2000)
.where("String1 in `A`, `B`, `C`")
.sort("Int1")
.update(
["New_Row = i", "Y = Int1 * (String1 = `A` ? 2 : String1 = `B` ? 0.5 : 1.0)"]
)
)
p = (
Figure()
.plot_xy(
series_name="partitioned", t=table_3_keys, by=["String1"], x="New_Row", y="Y"
)
.show()
)
Histogram
from deephaven.plot import Figure
figure = Figure()
histogram = (
figure.plot_xy_hist(
series_name="Histogram Values",
t=static_source_1.where("X < 41"),
x="Int1",
nbins=10,
)
.chart_title(title="Histogram of Values")
.show()
)
Area graph
from deephaven.plot import PlotStyle, Figure
figure = Figure()
area_graph = (
figure.axes(plot_style=PlotStyle.AREA)
.plot_xy(
series_name="Int_col",
t=static_source_1.where("X < 41"),
x="Timestamp",
y="Int1",
)
.show()
)
Stacked Area
from deephaven.plot import PlotStyle, Figure
figure = Figure()
stacked_area_graph = (
    figure.axes(plot_style=PlotStyle.STACKED_AREA)
.plot_xy(
series_name="Series_1",
t=static_source_1.where(["X < 1000", "String1 = `A`"]),
x="Timestamp",
y="Double1",
)
.plot_xy(
series_name="Series_2",
t=static_source_1.where(["X < 1000", "String1 = `B`"]),
x="Timestamp",
y="Double1",
)
.show()
)
Scatter
from deephaven.plot.figure import Figure
from deephaven.plot import PlotStyle
from deephaven.plot import Color, Colors
from deephaven.plot import font_family_names, Font, FontStyle, Shape
figure = Figure()
plot_xy_scatter = (
figure.plot_xy(
series_name="Double1", t=static_source_1.head(50), x="Timestamp", y="Double1"
)
.axes(plot_style=PlotStyle.SCATTER)
.point(shape=Shape.SQUARE, size=10, label="Big Point", color=Colors.RED)
.x_twin()
.axes(plot_style=PlotStyle.SCATTER)
.plot_xy(series_name="ii", t=static_source_1.head(50), x="Timestamp", y="X")
    .point(shape=Shape.DIAMOND, size=16, label="Big Diamond", color=Colors.BLUE)
.show()
)
Use layout hints
The layout_hints method creates a new table with the layout specified by the parameters.
from deephaven import new_table
from deephaven.column import string_col, int_col
source = new_table(
[
string_col("A", ["A", "a"]),
string_col("B", ["B", "b"]),
string_col("C", ["C", "c"]),
string_col("D", ["D", "d"]),
string_col("E", ["E", "e"]),
string_col("Y", ["Y", "y"]),
int_col("Even", [2, 4]),
int_col("Odd", [1, 3]),
]
)
result = source.layout_hints(
freeze=["Even"],
front=["Odd"],
back=["B"],
hide=["C"],
column_groups=[
{"name": "Vowels", "children": ["A", "E"]},
],
)
- source
- result
Other useful methods
Wait for table updates
Use await_update to instruct Deephaven to wait for updates to a specified table before continuing.
from deephaven import time_table
source = time_table("PT1S")
print(source.await_update(0))
print(source.await_update(1000))
result = source.update(formulas=["renamedTimestamp = Timestamp"])
- source
- result
Convert dynamic tables to static tables
Use snapshot to create a static, in-memory copy of a source table.
from deephaven import time_table
source = time_table("PT1S")
# Some time later...
result = source.snapshot()
- source
- result
Reduce ticking frequency
Use snapshot_when to reduce the ticking frequency.
from deephaven import time_table
import random
source = time_table("PT0.5S").update(
formulas=["X = (int) random.randint(0, 100)", "Y = sqrt(X)"]
)
trigger = time_table("PT5S").rename_columns(cols=["TriggerTimestamp = Timestamp"])
result = source.snapshot_when(trigger_table=trigger)
Capture the history of ticking tables
Use snapshot_when to capture the history of ticking tables.
from deephaven import time_table
import random
source = (
time_table("PT0.01S")
.update(
formulas=[
"X = i%2 == 0 ? `A` : `B`",
"Y = (int) random.randint(0, 100)",
"Z = sqrt(Y)",
]
)
.last_by(by=["X"])
)
trigger = time_table("PT1S").rename_columns(cols=["TriggerTimestamp = Timestamp"])
result = source.snapshot_when(trigger_table=trigger, history=True)
- source
- trigger
- result
Use DynamicTableWriter
See our guide How to write data to an in-memory, real-time table.
from deephaven import DynamicTableWriter
import deephaven.dtypes as dht
from deephaven.plot import Figure
import numpy as np, threading, time
table_writer = DynamicTableWriter(
{
"X": dht.double,
"SawToothWave": dht.double,
"SquareWave": dht.double,
"TriangleWave": dht.double,
}
)
waveforms = table_writer.table
def create_waveforms():
    for i in range(200):
        start = time.time()
        x = 0.1 * i
        y_sawtooth = (x % 1 - 0.5) * 2
        y_square = 1.0 if x % 2 < 1 else -1.0
        y_triangle = (x % 1 - 0.5) * 2 if x % 2 >= 1 else -(x % 1 - 0.5) * 2
        table_writer.write_row(x, y_sawtooth, y_square, y_triangle)
        end = time.time()
        time.sleep(0.2 - (end - start))
thread = threading.Thread(target=create_waveforms)
thread.start()
figure = Figure()
new_fig = (
figure.plot_xy(series_name="Sawtooth Wave", t=waveforms, x="X", y="SawToothWave")
.plot_xy(series_name="Square Wave", t=waveforms, x="X", y="SquareWave")
.plot_xy(series_name="Triangle Wave", t=waveforms, x="X", y="TriangleWave")
)
new_plot = new_fig.show()