Table operations cheat sheet
Create tables
Empty tables
from deephaven import empty_table
# Creates a table with 5 rows and no columns.
result = empty_table(5)
# Empty tables are often followed with a formula
result1 = result.update(formulas=["X = 5"])
- result
- result1
New tables
Columns are created using the following methods:
from deephaven import new_table
from deephaven.column import string_col, int_col
# Build an in-memory table column-by-column; all columns must have the same length.
result = new_table([
int_col("IntegerColumn", [1, 2, 3]),
string_col("Strings", ["These", "are", "Strings"])
])
- result
Time tables
The following code makes a time_table
that updates every second.
from deephaven import time_table
# Ticking table that appends one new row per second ("HH:MM:SS" period).
result = time_table("00:00:01")
Filter
You should filter your data before performing other operations to optimize performance. Less data generally means better, faster queries.
where
For SQL developers: In Deephaven, filter your data before joining using where
operations. Deephaven is optimized for filtering rather than matching.
from deephaven import new_table
from deephaven.column import string_col, int_col, double_col
from deephaven.constants import NULL_INT
source = new_table([
string_col("Letter", ["A", "C", "F", "B", "E", "D", "A"]),
int_col("Number", [NULL_INT, 2, 1, NULL_INT, 4, 5, 3]),
string_col("Color", ["red", "blue", "orange", "purple", "yellow", "pink", "blue"]),
int_col("Code", [12, 14, 11, NULL_INT, 16, 14, NULL_INT]),
])
# NOTE: string literals inside filter formulas use backticks, not quotes.
result_single_filter = source.where(filters=["Color = `blue`"])
result_or = source.where_one_of(filters=["Color = `blue`", "Number > 2"]) # OR operation - result will have _either_ criteria
result_and = source.where(filters=["Color = `blue`", "Number > 2"]) # AND operation - result will have _both_ criteria
- source
- result_single_filter
- result_or
- result_and
To filter results based on a second table (a "filter table"):
# A second table whose values drive the filter on `source`.
filterTable = new_table([
string_col("Colors", ["blue", "red", "purple", "white"]),
int_col("Codes", [10, 12, 14, 16])
])
# returns a new table containing only the source rows whose Color appears in filterTable's Colors column
where_in_colors = source.where_in(filter_table=filterTable, cols=["Color = Colors"])
where_in_colors_and_codes = source.where_in(filter_table=filterTable, cols=["Color = Colors", "Code = Codes"]) # AND operation - result will have both criteria
where_not_in_colors = source.where_not_in(filter_table=filterTable, cols=["Color = Colors"])
- filterTable
- where_in_colors
- where_in_colors_and_codes
- where_not_in_colors
head and tail
Used to reduce the number of rows:
# head/tail trim a table to a fixed number (or percentage) of rows.
# NOTE(review): the original example assigned source.tail(5) to `tail` and then
# immediately overwrote it with source.tail(2); the dead assignment is removed
# so the comment and the displayed `tail` table agree.
tail = source.tail(2)  # returns last 2 rows
tail_pct = source.tail_pct(0.25)  # returns last 25% of rows
head_pct = source.head_pct(0.75)  # returns first 75% of rows
head = source.head(2)  # returns first 2 rows
- tail
- tail_pct
- head_pct
- head
Join data
See our guide Choose the right join for more details.
For SQL developers: in Deephaven, joins are normally used to enrich a data set, not filter. Use where
to filter your data instead of using a join.
Joins for close matches (time)
aj (As-Of Join)
As-of joins aj
find "the exact match" of the key or "the record just before". For timestamp aj-keys, this means "that time or the record just before".
result = left_table.aj(right_table, columnsToMatch, columnsToAdd)
from deephaven import new_table
from deephaven.column import string_col, int_col, double_col, datetime_col
from deephaven.time import to_datetime
trades = new_table([
string_col("Ticker", ["AAPL", "AAPL", "AAPL", "IBM", "IBM"]),
datetime_col("TradeTime", [to_datetime("2021-04-05T09:10:00 NY"), to_datetime("2021-04-05T09:31:00 NY"), to_datetime("2021-04-05T16:00:00 NY"), to_datetime("2021-04-05T16:00:00 NY"), to_datetime("2021-04-05T16:30:00 NY")]),
double_col("Price", [2.5, 3.7, 3.0, 100.50, 110]),
int_col("Size", [52, 14, 73, 11, 6])
])
quotes = new_table([
string_col("Ticker", ["AAPL", "AAPL", "IBM", "IBM", "IBM"]),
datetime_col("QuoteTime", [to_datetime("2021-04-05T09:11:00 NY"), to_datetime("2021-04-05T09:30:00 NY"), to_datetime("2021-04-05T16:00:00 NY"), to_datetime("2021-04-05T16:30:00 NY"), to_datetime("2021-04-05T17:00:00 NY")]),
double_col("Bid", [2.5, 3.4, 97, 102, 108]),
int_col("BidSize", [10, 20, 5, 13, 23]),
double_col("Ask", [2.5, 3.4, 105, 110, 111]),
int_col("AskSize", [83, 33, 47, 15, 5]),
])
# For each trade row, join the quote with the same Ticker whose QuoteTime is
# at or immediately before TradeTime (exact match or "just before").
result = trades.aj(table=quotes, on=["Ticker", "TradeTime = QuoteTime"])
- trades
- quotes
- result
raj (Reverse As-Of Join)
Reverse As-of joins raj
find "the exact match" of the key or "the record just after". For timestamp reverse aj-keys, this means "that time or the record just after".
result = left_table.raj(right_table, columnsToMatch, columnsToAdd)
# For each trade, join the quote at or immediately after TradeTime;
# `joins` selects columns to add, renaming Ask to Offer.
result = trades.raj(table=quotes, on=["Ticker", "TradeTime = QuoteTime"], joins=["Bid", "Offer = Ask"])
- result
Joins with exact match
natural_join (Natural Join)
- Returns all the rows of the left table, along with up to one matching row from the right table.
- If there is no match in the right table for a given row, nulls will appear for that row in the columns from the right table.
- If there are multiple matches in the right table for a given row, the query will fail.
leftTable.natural_join(rightTable, columnsToMatch, columnsToAdd)
The right table of the join needs to have only one match based on the key(s).
from deephaven import new_table
from deephaven.column import string_col, int_col
from deephaven.constants import NULL_INT
left = new_table([
string_col("LastName", ["Rafferty", "Jones", "Steiner", "Robins", "Smith", "Rogers", "DelaCruz"]),
int_col("DeptID", [31, 33, 33, 34, 34, 36, NULL_INT]),
string_col("Telephone", ["(303) 555-0162", "(303) 555-0149", "(303) 555-0184", "(303) 555-0125", "", "", "(303) 555-0160"])
])
right = new_table([
int_col("DeptID", [31, 33, 34, 35]),
string_col("DeptName", ["Sales", "Engineering", "Clerical", "Marketing"]),
string_col("Telephone", ["(303) 555-0136", "(303) 555-0162", "(303) 555-0175", "(303) 555-0171"])
])
# Both tables have a Telephone column, so the right one is renamed in `joins`
# to avoid a name collision in the result.
result = left.natural_join(table=right, on=["DeptID"], joins=["DeptName", "DeptTelephone = Telephone"])
- left
- right
- result
join
Similar to SQL inner join, join
returns all rows that match between the left and right tables, potentially with duplicates.
- Returns only matching rows.
- Multiple matches will have duplicate values, which can result in a long table.
exact_join
- Returns all the rows of leftTable.
- If any left-table row has no matching key in the right table, the operation will fail.
- Multiple matches will fail.
Merge tables
Create a new table made of all of table 1, followed by all of table 2, etc. All tables must have the same column names (schema) when merged.
from deephaven import merge, new_table
from deephaven.column import int_col, string_col
source1 = new_table([string_col("Letter", ["A", "B", "D"]), int_col("Number", [1, 2, 3])])
source2 = new_table([string_col("Letter", ["C", "D", "E"]), int_col("Number", [14, 15, 16])])
source3 = new_table([string_col("Letter", ["E", "F", "A"]), int_col("Number", [22, 25, 27])])
# merge takes a list of tables; rows are stacked in list order.
table_array = [source1, source2, source3]
result = merge(table_array)
- source1
- source2
- source3
- result
View table metadata
Useful to make sure schema matches before merging. Shows the column names, data types, partitions, and groups for the table.
# meta_table is accessed as a property (no parentheses); it yields a table
# describing source's column names, data types, partitions, and groups.
seeMetadata = source.meta_table
- seeMetadata
Sort
Single direction sorting:
Sort on multiple column or directions:
Reverse the order of rows in a table:
from deephaven import new_table, SortDirection
from deephaven.column import string_col, int_col
from deephaven.constants import NULL_INT
source = new_table([
string_col("Letter", ["A", "C", "F", "B", "E", "D", "A"]),
int_col("Number", [NULL_INT, 2, 1, NULL_INT, 4, 5, 3]),
string_col("Color", ["red", "blue", "orange", "purple", "yellow", "pink", "blue"]),
int_col("Code", [12, 14, 11, NULL_INT, 16, 14, NULL_INT]),
])
# Directions pair positionally with order_by below:
# Letter ascending, Number descending.
sort_columns = [
SortDirection.ASCENDING,
SortDirection.DESCENDING
]
sorted_table = source.sort(order_by=['Letter','Number'], order=sort_columns)
- source
- sorted_table
Select and create new columns
Option 1: Choose and add new columns - calculate and write to memory
Use select
and update
when data is expensive to calculate or accessed frequently. Results are saved in RAM for faster access, but takes more memory.
from deephaven import new_table
from deephaven.column import string_col, int_col
from deephaven.constants import NULL_INT
source = new_table([
string_col("Letter", ["A", "C", "F", "B", "E", "D", "A"]),
int_col("Number", [NULL_INT, 2, 1, NULL_INT, 4, 5, 3]),
string_col("Color", ["red", "blue", "orange", "purple", "yellow", "pink", "blue"]),
int_col("Code", [12, 14, 11, NULL_INT, 16, 14, NULL_INT]),
])
# select() and update() evaluate formulas eagerly and store results in memory.
select_columns = source.select(formulas=["Letter", "Number"])
# constrain to only those 2 columns, write to memory
select_add_col = source.select(formulas=["Letter", "Number", "New = Number - 5"])
# constrain and add a new calculated column
select_and_update_col = source.select(formulas=["Letter", "Number"]).update(formulas=["New = Number - 5"])
# add a new calculated column - logically equivalent to previous example
- source
- select_columns
- select_add_col
- select_and_update_col
Option 2: Choose and add new columns - reference a formula and calculate on the fly
Use view
and update_view
when formula is quick to calculate or only a portion of the data is used at a time. Minimizes RAM used.
# view() and update_view() store formulas, not results; values are computed on access.
view_columns = source.view(formulas=["Letter", "Number"])
# similar to select(), but uses on-demand formula
view_add_col = source.update_view(formulas=["Letter", "Number", "New = Number - 5"])
# view set and add a column with an on-demand formula
view_and_updateview_col = source.view(formulas=["Letter", "Number"]).update_view(formulas=["New = Number - 5"])
# logically equivalent to previous example
- view_columns
- view_add_col
- view_and_updateview_col
Option 3: Add new columns - reference a formula and calculate on the fly
Use lazy_update
when there are a small number of unique values. On-demand formula results are stored in cache and re-used.
# lazy_update caches each distinct formula result and re-uses it,
# so it shines when the key space has few unique values.
lazy_update_ex = source.lazy_update(formulas=["Letter", "Number", "New = Number - 5"])
- lazy_update_ex
Manipulate columns
# Column-level housekeeping: dedupe, rename, drop, and reorder columns.
unique_values = source.select_distinct(formulas=["Letter"]) # show unique set
# works on all data types - be careful with doubles, longs
rename_stuff = source.rename_columns(cols=["NewLetter = Letter", "NewNumber = Number"])
drop_cols = source.drop_columns(cols=["Number"]) # drop one or many
put_cols_at_start = source.move_columns_up(cols=["Number"]) # make Number the first column(s)
put_cols_wherever = source.move_columns(idx=1, cols=["Number"]) # make Number the second column
- unique_values
- rename_stuff
- drop_cols
- put_cols_at_start
- put_cols_wherever
Group
See How to group and ungroup data for more details.
# group_by collapses each key group into one row; non-key columns become arrays.
group_to_arrays1 = source.group_by(by=["Letter"]) # one row per key; all other columns are arrays
multiple_keys = source.group_by(by=["Letter", "Number"]) # one row for each key-combination
- group_to_arrays1
- multiple_keys
Ungroup
Expands each row so that each value in any array inside that row becomes itself a new row.
# ungroup is the inverse of group_by: array cells are expanded back into rows.
agg_by_key = source.group_by(by=["Letter"])
# one row per Letter; other fields are arrays from source
ungroup_that_output = agg_by_key.ungroup() # no arguments usually
# each array value becomes its own row
# in this case turns grouped table back into source
- agg_by_key
- ungroup_that_output
Aggregate
# IMPORTANT: Any columns not in the parentheses of the whateverBy("Col1", "Col2") statement
# need to be an appropriate type for that aggregation method
# i.e., sums need to have all non-key columns be numbers.
first_by_key = source.first_by(by=["Number"])
first_by_two_keys = source.first_by(by=["Number", "Letter"]) # all below work with multi
count_of_entire_table = source.count_by(col="Letter") # single argument returns total count
count_of_group = source.count_by(col="Number", by=["Letter"])
first_of_group = source.first_by(by=["Letter"])
last_of_group = source.last_by(by=["Letter"])
# The view() in the examples below drops non-numeric columns (Color, Code)
# before the numeric aggregation runs.
sum_of_group = source.view(formulas=["Letter", "Number"]).sum_by(by=["Letter"])
# non-key field must be numerical
avg_of_group = source.view(formulas=["Letter", "Number"]).avg_by(by=["Letter"])
std_of_group = source.view(formulas=["Letter", "Number"]).std_by(by=["Letter"])
# See the Deephaven aggregation guides for more details on each method:
var_of_group = source.view(formulas=["Letter", "Number"]).var_by(by=["Letter"])
median_of_group = source.view(formulas=["Letter", "Number"]).median_by(by=["Letter"])
min_of_group = source.view(formulas=["Letter", "Number"]).min_by(by=["Letter"])
max_of_group = source.view(formulas=["Letter", "Number"]).max_by(by=["Letter"])
- first_by_key
- first_by_two_keys
- count_of_entire_table
- count_of_group
- first_of_group
- last_of_group
- sum_of_group
- avg_of_group
- std_of_group
- var_of_group
- median_of_group
- min_of_group
- max_of_group
Other useful methods
Copy and paste these working examples into the console.
Reduce ticking frequency
Uses snapshot
to reduce the ticking frequency.
from deephaven import time_table
import random
# source ticks every 0.5 s; result only updates when trigger ticks (every 5 s),
# so downstream consumers see a 10x lower update rate.
source = time_table("00:00:00.5").update(formulas=["X = (int) random.randint(0, 100)", "Y = sqrt(X)"])
trigger = time_table("00:00:05").rename_columns(cols=["TriggerTimestamp = Timestamp"])
result = source.snapshot_when(trigger_table=trigger)
Capture the history of ticking tables
Uses snapshot_when
to capture the history of ticking tables.
from deephaven import time_table
import random
# last_by keeps only the latest row per X; history=True below appends a full
# copy of source on every trigger tick, building up a history over time.
source = time_table("00:00:00.01").update(formulas=["X = i%2 == 0 ? `A` : `B`", "Y = (int) random.randint(0, 100)", "Z = sqrt(Y)"]).last_by(by=["X"])
trigger = time_table("00:00:01").rename_columns(cols=["TriggerTimestamp = Timestamp"])
result = source.snapshot_when(trigger_table=trigger, history=True)
- source
- trigger
- result
Use DynamicTableWriter and NumPy
See our guide How to write data to an in-memory, real-time table.
from deephaven import DynamicTableWriter
import deephaven.dtypes as dht
from deephaven.plot import Figure
import numpy as np, threading, time
# DynamicTableWriter takes a dict of column name -> Deephaven dtype.
table_writer = DynamicTableWriter({
"X": dht.double, "SawToothWave": dht.double, "SquareWave": dht.double, "TriangleWave": dht.double
})
# The live, in-memory table that receives every write_row() call.
waveforms = table_writer.table
def create_waveforms():
    """Write 200 rows of sawtooth/square/triangle samples, roughly 5 rows/sec.

    Writes to the module-level ``table_writer``; each row's sleep is adjusted
    for the time spent computing/writing so the cadence stays near 0.2 s.
    """
    for i in range(200):
        start = time.time()
        x = 0.1 * i
        y_sawtooth = (x % 1 - 0.5) * 2
        y_square = 1. if x % 2 < 1 else -1.
        y_triangle = (x % 1 - 0.5) * 2 if x % 2 >= 1 else -(x % 1 - 0.5) * 2
        table_writer.write_row(x, y_sawtooth, y_square, y_triangle)
        elapsed = time.time() - start
        # Clamp to zero: a slow iteration (> 0.2 s) would otherwise pass a
        # negative value to time.sleep(), which raises ValueError.
        time.sleep(max(0.0, 0.2 - elapsed))
# Run the writer on a background thread so the console stays responsive
# while rows stream into the live table.
thread = threading.Thread(target=create_waveforms)
thread.start()
figure = Figure()
# Chain three series onto one figure; t= binds each series to the live table.
new_fig = figure.\
plot_xy(series_name="Sawtooth Wave", t=waveforms, x="X", y="SawToothWave").\
plot_xy(series_name="Square Wave", t=waveforms, x="X", y="SquareWave").\
plot_xy(series_name="Triangle Wave", t=waveforms, x="X", y="TriangleWave")
new_plot = new_fig.show()