Ultimate cheat sheet
Necessary data
The named tables created below are used throughout this page and should be run first.
Create static tables
from deephaven import empty_table
from deephaven.time import to_datetime
import random
import string
letters_upp = string.ascii_uppercase
def get_random_string(length):
    letters = string.ascii_letters
    result_str = ''.join(random.choice(letters) for i in range(length))
    return result_str
def rand_choice(array):
    return random.choice(array)
static_source_1 = empty_table(26300)\
.update(['X = ii', 'Double1 = randomDouble(1, 100)', 'Int1 = randomInt(1, 100)', 'Timestamp = convertDateTime(`2016-01-01T01:00 NY`) + i * HOUR', 'Date = Timestamp.toDateString()', 'String1 = (String)rand_choice(letters_upp)', 'String2 = (String)get_random_string(randomInt(1, 4))'])
static_source_2 = empty_table(26300)\
.update(['X = ii', 'Double2 = randomDouble(1, 80)', 'Int2 = randomInt(1, 100)', \
'Timestamp = convertDateTime(`2019-01-01T01:00 NY`) + i * HOUR', 'String3 = (String)rand_choice(letters_upp)', 'String4 = (String)get_random_string(randomInt(1, 4))'])
Create ticking tables
from deephaven import time_table
ticking_source = time_table("00:00:01")\
.update(['X = ii', 'Double3 = randomDouble(1, 100)', 'Int3 = randomInt(1, 100)', \
'Date = Timestamp.toDateString()', 'String5 = (java.lang.String)rand_choice(letters_upp)', 'String6 = (java.lang.String)get_random_string(randomInt(1, 4))'])
Merge tables
To merge tables, the schema must be identical: same column names, same column data types. This applies to both static and updating tables.
from deephaven import merge
copy1 = static_source_1
merge_same_static = merge([static_source_1, copy1])
copy_updating_1 = ticking_source
copy_updating_2 = ticking_source
merge_same_dynamic = merge([copy_updating_1, copy_updating_2])
# one can merge static and ticking tables
static_source_1v2 = static_source_1.view(["Date", "Timestamp", "SourceRowNum = X"])
ticking_source_v2 = ticking_source.view(["Date", "Timestamp", "SourceRowNum = X"])
merge_static_and_dynamic = merge([static_source_1v2, ticking_source_v2])
View table metadata
meta_table shows the column names, data types, partitions, and groups for the table.
static_1_meta = static_source_1.meta_table
Filter
Most queries benefit from starting with filters: less data generally means better performance.
For SQL developers: in Deephaven, joins are not a primary operation for filtering. Use where, where_in, and where_not_in.
Backticks (`) are used for strings, and single quotes (') are used for timestamps and characters.
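For example, this filter (a minimal sketch using static_source_1 from above) combines both quoting styles:
both_quote_styles = static_source_1.where(["String1 = `A`", "Timestamp > '2017-01-05T10:30:00 NY'"])
# backticks for the string value, single quotes for the timestamp literal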
Date & Time examples
from deephaven.time import to_datetime
todays_data_1 = ticking_source.where(["Date = currentDay()"]) # Date for this is a String
todays_data_2 = ticking_source.where(["Date = currentDate(TZ_SYD)"]) # Sydney time zone
single_string_date = static_source_1.where(["Date = `2017-01-03`"]) # HEAVILY USED!
less_than_date = static_source_1.where(["Date < `2017-01-03`"]) # use >=, etc. as well
one_month_string = static_source_1.where(["Date.startsWith(`2017-02`)"])
single_db_date = static_source_2.where(["formatDate(Timestamp, TZ_NY) = `2020-03-01`"])
after_db_datetime = static_source_1.where(["Timestamp > '2017-01-05T10:30:00 NY'"])
just_biz_time = static_source_1.where(["isBusinessTime(Timestamp)"]) # HEAVILY USED!
just_august_datetime = static_source_2.where(["monthOfYear(Timestamp, TZ_NY) = 8"])
just_10am_hour = static_source_1.where(["hourOfDay(Timestamp, TZ_NY) = 10"])
just_tues = static_source_2.where(["dayOfWeek(Timestamp).getValue() = 2"]) # Tuesdays
time1 = to_datetime('2017-08-21T09:45:00 NY')
time2 = to_datetime('2017-08-21T10:10:00 NY')
trades = static_source_1.where(["inRange(Timestamp, time1, time2)"])
# Experienced Python users might prefer to use Python3 methods for casting instead
time1 = '2017-08-21T09:45:00 NY'
time2 = '2017-08-21T12:10:00 NY'
trades = static_source_1.where([f"inRange(Timestamp, '{time1}', '{time2}')"])
String examples
one_string_match = static_source_1.where(["String1 = `A`"]) # match filter
string_set_match = static_source_1.where(["String1 in `A`, `M`, `G`"])
case_insensitive = static_source_1.where(["String1 icase in `a`, `m`, `g`"])
not_in_example = static_source_1.where(["String1 not in `A`, `M`, `G`"]) # see "not"
contains_example = static_source_1.where(["String2.contains(`P`)"])
not_contains_example = static_source_1.where(["!String2.contains(`P`)"])
starts_with_example = static_source_1.where(["String2.startsWith(`A`)"])
ends_with_example = static_source_1.where(["String2.endsWith(`M`)"])
Number examples
equals_example = static_source_2.where(["round(Int2) = 44"])
less_than_example = static_source_2.where(["Double2 < 8.42"])
some_manipulation = static_source_2.where(["(Double2 - Int2) / Int2 > 0.05"])
mod_example_1 = static_source_2.where(["i % 10 = 0"]) # every 10th row
mod_example_2 = static_source_2.where(["String4.length() % 2 = 0"])
# even character-count strings
Multiple filters
conjunctive_comma = static_source_1.where(["Date = `2017-08-23`", "String1 = `A`"])
# HEAVILY USED!
conjunctive_ampersand = static_source_1.where(["Date = `2017-08-23` && String1 = `A`"])
disjunctive_same_col = static_source_1.where(["Date = `2017-08-23` || Date = `2017-08-25`"])
disjunctive_diff_col = static_source_1.where(["Date = `2017-08-23` || String1 = `A`"])
range_lame_way = static_source_1.where(["Date >= `2017-08-21`", "Date <= `2017-08-23`"])
in_range_best = static_source_1.where(["inRange(Date, `2017-08-21`, `2017-08-23`)"])
# HEAVILY USED!
in_range_best_string = static_source_1.where("inRange(String2.substring(0,1), `A`, `D`)")
# starts with letters A - D
where_in/where_not_in
# run these two queries together
string_set_driver = static_source_1.rename_columns(cols=["String4 = String2"]).head(10)
where_in_example_1 = static_source_2.where_in(string_set_driver, ["String4"]) # HEAVILY USED!
where_in_example_2 = static_source_2.where_in(static_source_1, ["String4 = String2"])
# String4 in static_source_2 within the set of String2 values from static_source_1
where_not_in_example = static_source_2.where_not_in(static_source_1, ["String4 = String2"])
# see the "not"
Nulls and NaNs
null_example = static_source_1.where(["isNull(Int1)"]) # all data types supported
not_null_example = static_source_1.where(["!isNull(Int1)"])
nan_example = static_source_1.where(["isNaN(Double1)"]) # NaN applies to floating-point columns
not_nan_example = static_source_1.where(["!isNaN(Double1)"])
Head and Tail
first_10k_rows = static_source_2.head(10_000)
ticking_last_rows = ticking_source.tail(100)
first_15_percent = ticking_source.reverse().head_pct(0.15) # the most recent 15% of rows
last_15_percent = static_source_2.tail_pct(0.15) # the last 15% of rows
first_10_by_key = static_source_1.head_by(10, ["String1"]) # first 10 rows for each String1 key
last_20_by_key = static_source_1.tail_by(20, ["String1"]) # last 20 rows for each String1 key
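If your version supports it, slice (assumed available here) returns a contiguous block of rows by position; a minimal sketch:
middle_rows = static_source_2.slice(100, 200) # rows 100 (inclusive) through 200 (exclusive)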
Sort
# works for all data types
sort_time_v1 = static_source_1.sort(order_by=['Timestamp']) # sort ascending
sort_doubles = static_source_1.sort("Double1") # a single string argument also works
sort_numbers = static_source_1.sort_descending(['Int1']) # sort descending
sort_strings = static_source_1.sort_descending(['String2'])
from deephaven import SortDirection
sort_multi_desc = static_source_1.sort_descending(order_by=['String1', 'Timestamp']) # multiple columns
sort_multi = static_source_1.sort(["Int1", "String1"])
sort_multiple_col = [
SortDirection.ASCENDING,
SortDirection.DESCENDING
]
multi_sort = static_source_1.sort(['Int1', 'Double1'], sort_multiple_col) # Int ascending then Double descending
Reversing a table is faster than sorting it, and is often used in UIs to show newly appended rows at the top of the table.
reverse_table = static_source_1.reverse()
reverse_updating = ticking_source.reverse()
Select and create new columns
Option 1: Choose and add new columns - Calculate and write to memory
Use select and update when it is expensive to perform a calculation, or when the result will be accessed frequently.
select_columns = static_source_2.select(["Timestamp", "Int2", "Double2", "String3"])
# constrain to only those 4 columns, write to memory
select_add_col = static_source_2.select(["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"])
# constrain and add a new calculated column
select_and_update_col = static_source_2.select(["Timestamp", "Int2", "Double2", "String3"])\
.update("Difference = Double2 - Int2")
# adds a new calculated column - logically equivalent to the previous example
Option 2: Choose and add new columns - Reference a formula and calculate on the fly
Use view and update_view when the formula is quick to compute or only a portion of the data is used at a time. This minimizes the RAM used.
view_columns = static_source_2.view(["Timestamp", "Int2", "Double2", "String3"])
# similar to select(), though uses on-demand formula
view_add_col = static_source_2.view(["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"])
# view set and add a column, though with an on-demand formula
view_and_update_view_col = static_source_2\
.view(["Timestamp", "Int2", "Double2", "String3"])\
.update_view(["Difference = Double2 - Int2"])
# logically equivalent to previous example
Option 3: Add new columns - Reference a formula and calculate on the fly
Use lazy_update when there is a smallish number of unique input values. On-demand formula results are stored in a cache and re-used.
lazy_update_example = static_source_2.lazy_update(["Timestamp", "Int2", "Double2", "String3", "Difference = Double2 - Int2"])
Get the row number
get_row_num = static_source_2.update_view(["RowNum_int = i", "RowNum_long = ii"])
# note: i and ii are reliable only on static and append-only tables
Do math
# use update() (rather than update_view) when working with random numbers, so values are computed once and stored
simple_math = static_source_2.update(["RandomNumber = Math.random()", \
"RandomInt100 = new Random().nextInt(100)", \
"Arithmetic = Int2 * Double2", \
"SigNum = Math.signum(RandomNumber - 0.5)", \
"Signed = SigNum * Arithmetic", \
"AbsDlrVolume = abs(Signed)", \
"Power = Math.pow(i, 2)", \
"Exp = Double2 * 1E2", \
"Log = Double2 * log(2.0)", \
"Round = round(Double2)", \
"Floor = floor(Double2)", \
"Mod = RandomInt100 % 5", \
"CastInt = (int)AbsDlrVolume", \
"CastLong = (long)AbsDlrVolume"])
Handle arrays
array_examples = static_source_2.update_view(["RowNumber = i", \
"PrevRowReference = Double2_[i-1]", \
"MultipleRowRef = Double2_[i-2] - Double2_[i-1]", \
"SubArray = Double2_.subVector(i-2, i+1)", \
"SumSubArray = sum(Double2_.subVector(i-2, i+1))", \
"ArraySize = SubArray.size()", \
"ArrayMin = min(SubArray)", \
"ArrayMax = max(SubArray)", \
"ArrayMedian = median(SubArray)", \
"ArrayAvg = avg(SubArray)", \
"ArrayStd = std(SubArray)", \
"ArrayVar = var(SubArray)", \
"ArrayLast = last(SubArray)", \
"ArrayIndex = SubArray[1]", \
"InArray = in(45.71, SubArray)"])
Create a slice or sub-vector
slice_and_rolling = static_source_2.update(formulas=["SubVector = Int2_.subVector(i-2,i+1)",\
"RollingMean = avg(SubVector)", \
"RollingMedian =median(SubVector)", \
"RollingSum = sum(SubVector)"])
Manipulate time and calendars
time_stuff = static_source_2.view(["Timestamp", \
"CurrentTime = currentTime()", \
"CurrentDateDefault = currentDay()", \
"CurrentDateLon = currentDate(TZ_LON)", \
"IsAfter = CurrentTime > Timestamp", \
"IsBizDay = isBusinessDay(CurrentDateLon)", \
"StringDT = format(Timestamp, TZ_NY)", \
"StringDate = formatDate(Timestamp, TZ_NY)", \
"StringToTime = convertDateTime(StringDate + `T12:00 NY`)", \
"AddTime = Timestamp + '05:11:04.332'", \
"PlusHour = Timestamp + HOUR", \
"LessTenMins = Timestamp - 10 * MINUTE", \
"DiffYear = diffYear(Timestamp, currentTime())", \
"DiffDay = diffDay(Timestamp, CurrentTime)", \
"DiffNanos = PlusHour - Timestamp", \
"DayWeek = dayOfWeek(Timestamp, TZ_NY)", \
"HourDay = hourOfDay(Timestamp, TZ_NY)", \
"DateAtMidnight = dateAtMidnight(Timestamp, TZ_NY)", \
"IsBizDayString = isBusinessDay(StringDate)", \
"IsBizDayDatetime = isBusinessDay(Timestamp)", \
"IsBizTime = isBusinessDay(Timestamp)", \
"FracBizDayDone = fractionOfBusinessDayComplete(currentTime())", \
"FracBizDayOpen = fractionOfBusinessDayRemaining(currentTime())", \
"NextNonBizDay = nextNonBusinessDay()", \
"NextBizDayCurrent = nextBusinessDay()", \
"NextBizDayString = nextBusinessDay(StringDate)", \
"NextBizDayDatetime = nextBusinessDay(Timestamp)", \
"PlusFiveBizDayCurrent = nextBusinessDay(5)", \
"PlusFBizDayString = nextBusinessDay(StringDate, 5)", \
"PlusFBizDayDatetime = nextBusinessDay(Timestamp, 5)", \
"PrevBizDayCurrent = previousBusinessDay()", \
"PrevBizDayString = previousBusinessDay(StringDate)", \
"PrevBizDayDatetime = previousBusinessDay(Timestamp)", \
"MinusFiveBizDayCurrent = previousBusinessDay(5)", \
"MinusFiveBizDayString = previousBusinessDay(StringDate, 5)", \
"MinusFiveBizDayDatetime = previousBusinessDay(Timestamp, 5)", \
"BizDayArray = businessDaysInRange(Timestamp, currentTime())", \
"NonBizDayArray = nonBusinessDaysInRange(Timestamp, currentTime())", \
"DiffBizDaysCount = numberOfBusinessDays(Timestamp, currentTime())", \
"DiffBizDaysExact = diffBusinessDay(Timestamp, currentTime())", \
"DiffBizDaysString = numberOfBusinessDays(MinusFiveBizDayString, StringDate)", \
"StandardBizDayNanos = standardBusinessDayLengthNanos()", \
"DiffBizSecs = diffBusinessNanos(CurrentTime, CurrentTime + 5 * DAY) / 1E9", \
"LastBizOfMonth = isLastBusinessDayOfMonth(StringDate)", \
"LastBizOfWeek = isLastBusinessDayOfWeek(currentTime())"])
Bin data
Binning is a common pre-step for aggregation, supporting down-sampling and other data profiling.
bins = ticking_source.update_view(["PriceBin = upperBin(Double3, 5)", \
"SizeBin = lowerBin(Int3, 10)", \
"TimeBin = upperBin(Timestamp, '00:02:00')"]).reverse()
agg_bin = bins.view(["TimeBin", "Total = Int3 * Double3"]).sum_by(["TimeBin"])
Manipulate strings
string_stuff = static_source_2.view(["StringDate = formatDate(Timestamp, TZ_NY)", \
"String4","Double2", \
"NewString = `new_string_example_`", \
"ConcatV1 = NewString + String4", \
"ConcatV2 = NewString + `Note_backticks!!`", \
"ConcatV3 = NewString.concat(String4)", \
"ConcatV4 = NewString.concat(`Note_backticks!!`)", \
"StartBool = String4.startsWith(`M`)", \
"NoEndBool = !String4.endsWith(`G`)", \
"ContainedBool = String4.contains(`AA`)", \
"NoContainBool = !String4.contains(`AA`)", \
"FirstChar = String4.substring(0,1)", \
"LengthString = String4.length()", \
"CharIndexPos = ConcatV1.charAt(19)", \
"SubstringEx = ConcatV1.substring(11,20)", \
"FindIt = NewString.indexOf(`_`)", \
"FindItMiddle = NewString.indexOf(`_`, FindIt + 1)", \
"FindLastOf = NewString.lastIndexOf(`_`)", \
"SplitToArrays = NewString.split(`_`)", \
"SplitWithMax = NewString.split(`_`, 2)", \
"SplitIndexPos = NewString.split(`_`)[1]", \
"LowerCase = String4.toLowerCase()", \
"UpperCase = NewString.toUpperCase()", \
"DoubleToStringv1 = Double2 + ``", \
"DoubleToStringv2 = String.valueOf(Double2)", \
"DoubleToStringv3 = Double.toString(Double2)", \
"StringToDoublev1 = Double.valueOf(DoubleToStringv1)"])
Use Ternaries (If-Thens)
if_then_example_1 = static_source_2\
.update_view(["SimpleTernary = Double2 < 50 ? `smaller` : `bigger`"])
if_then_example_2 = static_source_2\
.update_view(["TwoLayers = Int2 <= 20 ? `small` : Int2 < 50 ? `medium` : `large`"])
Create and use a custom function
def mult(a, b):
    return a * b
py_func_example = static_source_2.update(["M_object = mult(Double2,Int2)", "M_double = (double)mult(Double2,Int2)"])
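Recent versions can infer the result type from Python type hints, which may remove the need for the explicit cast; a sketch, assuming type-hint inference is available in your version:
def mult_typed(a: float, b: float) -> float:
    return a * b
py_func_typed = static_source_2.update(["M_typed = mult_typed(Double2, Int2)"])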
Manipulate columns
unique_values = static_source_2.select_distinct("String4") # show unique set
# works on all data types - careful with doubles, longs
unique_values_combo = static_source_2.select_distinct(["Timestamp", "String4"])
# unique combinations of Time with String4
rename_stuff = static_source_2.rename_columns(["Symbol = String4", "Price = Double2"])
drop_column = static_source_2.drop_columns(["X"]) # same for drop one or many
drop_columns = static_source_2.drop_columns(["X", "Int2", "String3"])
put_cols_at_start = static_source_2.move_columns_up(["String4", "Int2"]) # makes these the first column(s)
put_cols_wherever = static_source_2.move_columns(1, ["String4", "Int2"])
# makes String4 the 2nd and Int2 the 3rd column
Group and aggregate
Simple grouping
group_to_arrays = static_source_1.group_by(["String2"])
# one row per key (i.e. String2), all other columns are arrays
group_multiple_keys = static_source_1.group_by(["String1", "Date"])
# one row for each key-combination (i.e. String1-Date pairs)
Ungrouping
# Run these lines together
agg_by_key = static_source_1.group_by(["Date"])
# one row per Date, other fields are arrays from static_source_1
ungroup_that_output = agg_by_key.ungroup() # no arguments usually
# each array value becomes its own row
# in this case, turns agg_by_key back into static_source_1
Aggregations
count_of_entire_table = static_source_1.count_by("Count") # single argument returns total row count; you name the output column
count_of_group = static_source_1.count_by("N", ["String1"]) # N is naming choice
first_of_group = static_source_1.first_by(["String1"])
last_of_group = static_source_1.last_by(["String1"])
sum_of_group = static_source_1.view(["String1", "Double1"]).sum_by(["String1"])
# non-key field(s) must be numerical
avg_of_group = static_source_1.view(["String1", "Double1", "Int1"]).avg_by(["String1"])
std_of_group = static_source_1.view(["String1", "Double1", "Int1"]).std_by(["String1"])
var_of_group = static_source_1.view(["String1", "Double1", "Int1"]).var_by(["String1"])
median_of_group = static_source_1.view(["String1", "Double1", "Int1"]).median_by(["String1"])
min_of_group = static_source_1.view(["String1", "Double1", "Int1"]).min_by(["String1"])
max_of_group = static_source_1.view(["String1", "Double1", "Int1"]).max_by(["String1"])
# combine multiple aggregations in a single operation (using the same key grouping)
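A minimal sketch of combining them, assuming the deephaven.agg module is available in your version:
from deephaven import agg
combined_agg = static_source_1.agg_by([agg.avg(["AvgDouble = Double1"]), \
agg.max_(["MaxInt = Int1"]), \
agg.count_("N")], by=["String1"])
# one row per String1; all three aggregates use the same key grouping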
Join
For SQL developers: Joins in Deephaven are used to extend result sets by joining data from other tables, not as much for filtering.
Joins that get used a lot
Natural Join
The right table of the join must have no more than one matching row for each key.
# run these together
last_by_key_table = static_source_1.view(["String1", "Int1", "Timestamp"])\
.last_by(["String1"])\
.rename_columns(["Renamed = Int1", "RenamedTime = Timestamp"])
# creates 3-column table of String1 + last record of the other 2 cols
join_with_last_price = static_source_1.natural_join(last_by_key_table, ["String1"])
# arguments are (rightTableOfJoin, "JoinKey(s)")
# will have same number of rows as static_source_1 (left table)
# brings all non-key columns from last_by_key_table to static_source_1
# HEAVILY USED!
specify_cols_from_r = static_source_1.natural_join(last_by_key_table, ["String1"], ["Renamed"])
# nearly identical to the join_with_last_price example
# args are (rightTableOfJoin, "JoinKey(s)", "Cols from R table")
rename_cols_on_join = static_source_1\
.natural_join(last_by_key_table, ["String1"], ["RenamedAgain = Renamed, RenamedTime"])
# nearly identical to specify_cols_from_r example
# can rename column on the join
# sometimes this is necessary…
# if there would be 2 cols of same name
# Note the placement of quotes
keys_of_diff_names = static_source_2.view(["String3", "Double2"])\
.natural_join(last_by_key_table, ["String3 = String1"])
# note that String3 in the L table is mapped as the key to String1 in R
# this result has many null records, as there are many String3
# without matching String1
Multiple Keys
# run these together
last_price_two_keys = static_source_1.view(["Date", "String1", "Int1"])\
.last_by(["Date", "String1"])
# creates 3-column table with the last Int1 for each Date-String1 pair
nat_join_two_keys = static_source_1\
.natural_join(last_price_two_keys, ["Date, String1", "Int1"])
# arguments are (rightTableOfJoin, "JoinKey1, JoinKey2", "Col(s)")
# Note the placement of quotes
AJ (As-Of Join)
As-of joins are time-series joins. They can also be used with other ordered numerics as the join key(s). It is often wise to make sure the right table is sorted by the key. aj is designed to find "the exact match" of the key or "the record just before"; for timestamp aj keys, this means "that time or the record just before". The reverse as-of join, raj, finds "the exact match" of the key or "the record just after"; for timestamp raj keys, this means "that time or the record just after".
from deephaven import time_table
ticking_source_2 = time_table("00:00:10.500")\
.update(['X = ii', 'Double4 = randomDouble(1, 100)', 'Int4 = randomInt(1, 100)', \
'Date = Timestamp.toDateString()', 'String7 = (java.lang.String)rand_choice(letters_upp)', 'String8 = (java.lang.String)get_random_string(randomInt(1, 4))'])
aj_join = ticking_source_2.aj(table=ticking_source, on=["Timestamp"], joins=["String5", "String6", "JoinTime = Timestamp"])
raj_join = ticking_source_2.raj(table=ticking_source, on=["Timestamp"], joins=["String5", "String6", "JoinTime = Timestamp"])
raj_join_keyed = ticking_source_2.raj(table=ticking_source, on=["String7 = String5", "Timestamp"], joins=["String6", "JoinTime = Timestamp"])
Less common joins
# exact joins require precisely one match in the right table for each key
last_string_1aug23 = static_source_1.where(["Date = `2017-08-23`"]).last_by(["String1"])
last_string_1aug24 = static_source_1.where(["Date = `2017-08-24`"]).last_by(["String1"])\
.where_in(last_string_1aug23, ["String1"])
# filters for String1 in last_string_1aug23
exact_join_ex = last_string_1aug24.exact_join(last_string_1aug23, ["String1"], ["RenamedDouble = Double1"])
# join will find all matches from left in the right table
# if a match is not found, that row is omitted from the result
# if the Right-table has multiple matches, then multiple rows are included in result
last_ss_1 = static_source_1.last_by(["String1"]).view(["String1", "Double1"])
last_ss_2 = static_source_2.last_by(["String3"])\
.view(["String1 = String3", "RenamedDouble = Double2"])
first_ss_2 = static_source_2.first_by(["String3"])\
.view(["String1 = String3", "RenamedDouble = Double2"])
from deephaven import merge
merge_first_and_last_ss_2 = merge([first_ss_2, last_ss_2])
# the table has two rows per String1
join_example = last_ss_1.join(merge_first_and_last_ss_2, ["String1"], ["RenamedDouble"])
# note that the resultant set has two rows per String1
Use columns as arrays and cells as variables
get_a_column = static_source_1.j_object.getColumnSource("String1")
print(get_a_column)
get_a_cell = get_a_column.get(0)
print(get_a_cell)
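To pull a whole column into Python as an array instead, deephaven.numpy may be convenient; a sketch, assuming that module is available in your version:
from deephaven.numpy import to_numpy
string1_array = to_numpy(static_source_1.view(["String1"])) # 2-D numpy array with one column
print(string1_array[0][0])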
Read and write files
It is very easy to import files and to export any table to a CSV via the UI.
CSV and Parquet files in a server-side directory should be imported via the scripts below. Note these are NOT working examples: the code shows the syntax, but the file paths are placeholders.
# CSV
from deephaven import read_csv
# import CSV from a URL
csv_imported = read_csv("https://media.githubusercontent.com/media/deephaven/examples/main/Iris/csv/iris.csv")
# import headerless CSV
csv_headerless = read_csv("https://media.githubusercontent.com/media/deephaven/examples/main/DeNiro/csv/deniro_headerless.csv", headless=True)
# import headerless CSV then add headings
import deephaven.dtypes as dht
header = {"Year": dht.int_, "Rotten Tom Score": dht.int_, "Title": dht.string}
csv_manual_header = read_csv("https://media.githubusercontent.com/media/deephaven/examples/main/DeNiro/csv/deniro_headerless.csv", header=header, headless=True)
# !!! WRITE CSV !!!
# export / write CSV to local /data/ directory
from deephaven import write_csv
write_csv(csv_manual_header, "/data/outputFile.csv")
# import the local CSV you just exported
csv_local_read = read_csv("/data/outputFile.csv")
# full syntax
# csv_full_monty = read_csv(path: str,
# header: Dict[str, DataType]=None,
# headless: bool=False,
# delimiter: str=",",
# quote: str="\"",
# ignore_surrounding_spaces: bool = True,
# trim: bool = False,
# charset: str = "utf-8")
# PARQUET
# write Parquet
from deephaven.parquet import write
table_sample = static_source_1.head(100)
write(table_sample, "/data/output_generic.parquet")
write(table_sample, "/data/output_gzip.parquet", compression_codec_name="GZIP")
# other compression codecs are available
# read Parquet
from deephaven.parquet import read
parquet_read = read("/data/output_generic.parquet")
parquet_ok_if_zipped = read("/data/output_gzip.parquet")
Do Cum-Sum and Rolling Average
def has_others(array, target):
    for x in array:
        if x != target:
            return True
    return False
###
make_arrays = static_source_2.sort("String3")\
.update_view(["String3Array10 = String3_.subVector(i-5, i-1)", \
"Int2Array10 = Int2_.subVector(i-5, i-1)", \
"Double2Array10 = Double2_.subVector(i-5, i-1)"])
###
cum_and_roll_10 = make_arrays.update(["ArrayHasOtherString3s = \
has_others.call(String3Array10.toArray(), String3)", \
"CumSum10 = ArrayHasOtherString3s = false ? \
(int)sum(Double2Array10) : NULL_INT", \
"RollAvg10 = ArrayHasOtherString3s = false ? \
avg(Int2Array10) : NULL_DOUBLE"])
Another example of creating a rolling sum
from deephaven.updateby import rolling_sum_tick
rolling_sum_op = rolling_sum_tick(cols=["RollingSumInt1 = Int1"], rev_ticks=20, fwd_ticks=0)
sums = trades.update_by(ops=[rolling_sum_op], by=["String1"])
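update_by also supports cumulative operations directly; a minimal sketch, assuming cum_sum is available in deephaven.updateby:
from deephaven.updateby import cum_sum
cum_sums = static_source_1.update_by(ops=[cum_sum(cols=["CumInt1 = Int1"])], by=["String1"])
# running total of Int1 within each String1 group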
Set up an EMA
from deephaven.experimental.ema import ByEmaSimple
ema30sec = ByEmaSimple('BD_SKIP','BD_SKIP','TIME',30,'SECONDS', type='LEVEL')
ema_data = ticking_source.view(formulas=["EMA_data = ema30sec.update(Timestamp, Int3)"]).last_by()
ema_data_grouped = ticking_source\
.view(formulas=["String5", "EMA_data = ema30sec.update(Timestamp, Int3, String5)"])\
.last_by(by=["String5"]).sort(order_by=["String5"])
Use Pandas
import deephaven.pandas as dhpd
pandas_df = dhpd.to_pandas(static_source_1)
# do Pandas stuff to prove it is a pandas DataFrame
print(type(pandas_df), pandas_df.dtypes, pandas_df.describe())
# convert the pandas DataFrame back to a Deephaven table
dh_table_again = dhpd.to_table(pandas_df)
Format tables
Datetime formatting
time_formatting = static_source_1.view(["Time1 = Timestamp", "Time2 = Timestamp", "Time3 = Timestamp", "Time4 = Timestamp", "Time5 = Timestamp", "Time6 = Timestamp"])\
.format_columns(["Time1=Date(`yyyy_MM_dd'T'HH-mm-ss.S z`)", \
"Time2 = Date(`yyyy-MM-dd'T'HH:mm t`)", \
"Time3 = Date(`HH:mm:ss`)", \
"Time4 = Date(`HH:mm:ss.SSSSSSSSS`)", \
"Time5 = Date(`EEE dd MMM yy HH:MM:ss`)", \
"Time6 = Date(`yyyy-MM-dd`)"])\
.update_view(["Time7_string = formatDate(Time6, TZ_NY)"])
Number formatting
number_formatting = static_source_1\
.view(["Double1", \
"BigNum1 = Double1 * 1000", "BigNum2 = Double1 * 1000", \
"BigNum3 = Double1 * 1000", "Price1 = Double1", "Price2 = Double1", \
"Price3 = Double1", \
"SmallNum1 = Double1 / 1000", "SmallNum2 = Double1 / 1000", \
"SmallNum3 = Double1 / 1000", "TinyNum = Double1 / 1_000_000_000"])\
.format_columns(["BigNum1 = Decimal(`###,#00`)", \
"BigNum2 = Decimal(`##0.00`)", \
"BigNum3 = Decimal(`0.####E0`)", \
"Price1 = Decimal(`###,###.##`)", \
"Price2 = Decimal(`$###,##0.00`)", \
"SmallNum1 = Decimal(`##0.000000`)", \
"SmallNum2 = Decimal(`##0.00%`)", \
"SmallNum3 = Decimal(`##0%`)", \
"TinyNum = Decimal(`0.00E0`)"])
Color formatting
# set a background color for an entire table
background_format = static_source_1\
.format_columns("* = bgfga(colorRGB(0,0,128))")
# set a color column by column
just_colors = static_source_1\
.format_columns(["X = `#90F060`", \
"Double1 = LIGHTBLUE", \
"Int1 = colorRGB(247,204,0)", \
"Timestamp = colorRGB(247,204,204)", \
"Date = bgfg(colorRGB(57,43,128),colorRGB(243,247,122))", \
"String1 = bgfga(MEDIUMVIOLETRED)", \
"String2 = colorHSL(0, 24, 36)"])
# column formatting
column_colors = static_source_1\
.update_view(["RowMod10 = i % 10"])\
.format_column_where("String1", "Int1 > 35", "DEEP_RED")\
.format_column_where("Double1", "Double1 > Double1_[i-1]", "LIMEGREEN")\
.format_columns(["Int1 = (X % 5 !=0) ? ORANGE : BLACK", \
"Date = (String1 = `A`) ? colorRGB(253, 31, 203) : colorRGB(171, 254, 42)", \
"RowMod10 = heatmap(RowMod10, 1.0, 10.0, BRIGHT_GREEN, BRIGHT_RED)"])
# row formatting
row_colors = static_source_1\
.format_row_where("Int1 > 45", "PALE_BLUE")
# one can format_columns() for numbers and colors together
number_and_color = static_source_1\
.format_columns(["Double1 = Decimal(`##0.00`)", \
"Double1 = (Double1 > Double1_[i-1]) ? FUCHSIA : GREY"])
You will find an itemization of stock colors in the docs.
Plot programmatically
Substantial documentation about plotting exists. The following examples show the basics; styling and labeling options are omitted.
Time series plots
from deephaven.plot.figure import Figure
figure = Figure()
time_series = figure.plot_xy(series_name="Time_series", t=static_source_1.head(100), x = "Timestamp", y="Int1")\
.show()
time_series_2axis = figure.plot_xy(series_name="Int1", t=static_source_1.where(["X < 100", "Int1 < 70 "]), x="Timestamp", y="Int1")\
.x_twin()\
.plot_xy(series_name="Double1", t=static_source_1.where(["X < 100", "Double1 > 30"]), x="Timestamp", y="Double1")\
.show()
time_series_3axis = figure.plot_xy(series_name="Int1", t=static_source_1.where(["X < 25"]), x="Timestamp", y="Int1")\
.x_twin()\
.plot_xy(series_name="Double1", t=static_source_1.where(["X < 25"]), x="Timestamp", y="Double1")\
.x_twin()\
.plot_xy(series_name="DoubleN", t=static_source_1.where(["X < 25"]).update(["DoubleN = pow(Double1,2)"]), x="Timestamp", y="DoubleN")\
.show()
Bar chart
from deephaven.plot import Figure, PlotStyle
figure = Figure()
bar_chart = figure.axes(plot_style=PlotStyle.BAR)\
.plot_xy(series_name="Int1", t=static_source_1.where('X < 41'), x="Timestamp", y="Int1").show()
Plot by some key
from deephaven.plot import Figure
table_3_keys = static_source_1.head(2000).where("String1 in `A`, `B`, `C`")\
.sort("Int1").update(["New_Row = i", "Y = Int1 * (String1 = `A` ? 2 : String1 = `B` ? 0.5 : 1.0)"])
p = Figure().plot_xy(series_name="partitioned", t=table_3_keys, by=["String1"], x="New_Row", y="Y").show()
Histogram
from deephaven.plot import Figure
figure = Figure()
histogram = figure.plot_xy_hist(series_name="Histogram Values", t=static_source_1.where('X < 41'), x="Int1", nbins=10)\
.chart_title(title="Histogram of Values")\
.show()
Area graph
from deephaven.plot import PlotStyle, Figure
figure = Figure()
area_graph = figure.axes(plot_style=PlotStyle.AREA)\
.plot_xy(series_name="Int_col", t=static_source_1.where('X < 41'), x="Timestamp", y="Int1").show()
Stacked Area
from deephaven.plot import PlotStyle, Figure
figure = Figure()
stacked_area_graph = figure.axes(plot_style=PlotStyle.STACKED_AREA)\
.plot_xy(series_name="Series_1", t=static_source_1.where(["X < 1000", "String1 = `A`"]), x="Timestamp", y="Double1")\
.plot_xy(series_name="Series_2", t=static_source_1.where(["X < 1000", "String1 = `B`"]), x="Timestamp", y="Double1")\
.show()
Scatter
from deephaven.plot.figure import Figure
from deephaven.plot import PlotStyle
from deephaven.plot import Color, Colors
from deephaven.plot import font_family_names, Font, FontStyle, Shape
figure = Figure()
plot_xy_scatter = figure\
.plot_xy(series_name="Double1", t=static_source_1.head(50), x="Timestamp", y="Double1")\
.axes(plot_style=PlotStyle.SCATTER)\
.point(shape=Shape.SQUARE, size=10, label="Big Point", color=Colors.RED)\
.x_twin()\
.axes(plot_style=PlotStyle.SCATTER)\
.plot_xy(series_name="ii", t=static_source_1.head(50), x="Timestamp", y="X")\
.point(shape=Shape.DIAMOND, size=16, label="Big Triangle",color=Colors.BLUE )\
.show()