Ultimate Table Operations Cheat Sheet
Access Data
Tip: The import and named tables in these lines are used throughout this page and should be run first.
from deephaven import *
staticSource1 = db.t("LearnDeephaven", "StockTrades") # historical data
staticSource2 = db.t("LearnDeephaven", "EODTrades") # historical EOD data
Sourcing Tables
# Note: the tableName left of the "= sign" is a variable named whatever you want
allAvailable = db.getCatalog() # full catalog of namespaces and tables
tickingSource = db.i("DbInternal", "ProcessEventLog")
# access a real-time updating table!
# intraday data
# note: db.i instead of db.t
# intraday data w/o ticking
snapshotEx = db.i("DbInternal", "ProcessEventLog", False)
Merging Tables
copy1 = staticSource1.where() # 3.16 mm records
copy2 = staticSource1.where() # same 3.16 mm records in a table of a different name
mergeBothCopies = ttools.merge(copy1, copy2)
# to merge tables, the schema must be identical
# same column names, same column data types
# merging updating (ticking) tables follows the same standard
copyUpdating1 = tickingSource.where()
copyUpdating2 = tickingSource.where()
mergeBothCopies = ttools.merge(copyUpdating1, copyUpdating2)
# same schema for each of the tables to be merged
# one can merge static and ticking tables
staticSource1v2 = staticSource1.where().view("Date", "Timestamp", "SourceRowNum = i")
tickingSourcev2 = tickingSource.where().view("Date", "Timestamp", "SourceRowNum = i")
mergeStaticAndUpdating = ttools.merge(staticSource1v2, tickingSourcev2)
# again, same schema for each of the tables to be merged
# in the UI right-click on the table heading
# hit Reverse/Before-sorting from the menu - updates at top then
Viewing metadata of table
seeMetadata = staticSource1.getMeta() # column names, data types, partitions, groups
Printing information
print(db.getNamespaces()) # just the namespaces, not also the tables
print (db.getIntradayPartitions("DbInternal", "ProcessEventLog")) # see partitions
from deephaven.Calendars import calendarNames
print(calendarNames()) # list of business calendars shipped with Deephaven
# describes the packages available in the main deephaven module
help('deephaven')
Filtering
Most queries benefit by starting with filters. Less data generally means better performance.
For SQL developers: In Deephaven, joins are not a primary operation for filtering. Use where(), whereIn(), and whereNotIn().
Note
Backticks (`) are used for strings, and single quotes (') are used for timestamps and characters.
Date & Time examples
Filtering first by partitioning values (which is most often the set of Dates) is both a best practice, and necessary for some downstream use cases.
todaysData1 = tickingSource.where("Date = currentDay()") # Date for this is a String
todaysData2 = tickingSource.where("Date = currentDateNy()") # popular for NY
todaysData3 = tickingSource.where("Date = currentDate(TZ_SYD)") # sydney timezone
# VERY IMPORTANT: Filtering first by partitioning values
# (which is most often the set of Dates) is both a best practice,
# and necessary for some downstream use cases.
singleStringDate = staticSource1.where("Date = `2017-08-23`") # HEAVILY USED!
lessThanDate = staticSource1.where("Date < `2017-08-23`") # use >=, etc. as well
oneMonthString = staticSource1.where("Date.startsWith(`2017-08`)")
singleDbDate = staticSource2.where("formatDate(EODTimestamp, TZ_NY) = `2017-03-01`")
afterDbDatetime = staticSource1.where("Timestamp > '2017-08-25T10:30:00 NY'")
justBizTime = staticSource1.where("isBusinessTime(Timestamp)") # HEAVILY USED!
justAugustDatetime = staticSource2.where("monthOfYear(EODTimestamp, TZ_NY) = 8") #
just10amHour = staticSource1.where("hourOfDay(Timestamp, TZ_NY) = 10")
justTues = staticSource2.where("dayOfWeek(EODTimestamp).getValue() = 2") # Tuesdays
from deephaven.DBTimeUtils import convertDateTime
time1 = convertDateTime('2017-08-21T09:45:00 NY')
time2 = convertDateTime('2017-08-21T10:10:00 NY')
trades = staticSource1.where("inRange(ExchangeTimestamp, time1, time2)")
# Experienced Python users might prefer to use Python3 methods for casting instead
time1 = '2017-08-21T09:45:00 NY'
time2 = '2017-08-21T10:10:00 NY'
trades = staticSource1.where(f"inRange(Timestamp, '{time1}', '{time2}')")
String Examples
oneStringMatch = staticSource1.where("USym = `AAPL`") # match filter
stringSetMatch = staticSource1.where("USym in `AAPL`, `MSFT`, `GOOG`")
caseInsensitive = staticSource1.where("USym icase in `aapl`, `msft`, `goog`")
notInExample = staticSource1.where("USym not in `AAPL`, `MSFT`, `GOOG`") # see "not"
containsExample = staticSource1.where("USym.contains(`I`)")
notContainsExample = staticSource1.where("!USym.contains(`I`)")
startsWithExample = staticSource1.where("USym.startsWith(`A`)")
endsWithExample = staticSource1.where("USym.endsWith(`M`)")
Number examples
equalsExample = staticSource2.where("round(Open) = 44")
lessThanExample = staticSource2.where("High < 8.42")
someManipulation = staticSource2.where("(Close - Open) / Open > 0.05")
modExample1 = staticSource2.where("i % 10 = 0") # every 10th row
modExample2 = staticSource2.where("Ticker.length() % 2 = 0")
# even char-count Tickers
Multiple Filters
conjunctiveComma = staticSource1.where("Date = `2017-08-23`", "USym = `AAPL`")
# HEAVILY USED!
conjunctiveAmpA = staticSource1.where("Date = `2017-08-23` && USym = `AAPL`")
disjunctiveSameCol = staticSource1.where("Date = `2017-08-23`\
|| Date = `2017-08-25`") # thisDate or thatDate
disjunctiveDiffCol = staticSource1.where("Date = `2017-08-23` || USym = `AAPL`")
# thisDate or thatUSym
rangeLameWay = staticSource1.where("Date >= `2017-08-21`", "Date <= `2017-08-23`")
inRangeBest = staticSource1.where("inRange(Date, `2017-08-21`, `2017-08-23`)")
# HEAVILY USED!
inRangeBestString = staticSource1.where("inRange(USym.substring(0,1), `A`, `D`)")
# starts with letters A - D
WhereIn / WhereNotIn
# run these two queries together
usymSetDriver = staticSource1.renameColumns("Ticker = USym")
whereInExample = staticSource2.whereIn(usymSetDriver, "Ticker") # HEAVILY USED!
# filters staticSource2 USyms to be the same as …
# the set of USyms in staticSource1 to be…
# the same as in usymSetDriver
# this can be dynamic to accommodate changing usymSetDriver data
whereInExample = staticSource2.whereIn(staticSource1, "Ticker = USym")
# Ticker in staticSource2 within list of USym of staticSource1
whereNotInExample = staticSource2.whereNotIn(staticSource1, "Ticker = USym")
# see the "not"
Nulls and NaNs
nullExample = staticSource1.where("isNull(ExchangeId)") # all data types supported
notNullExample = staticSource1.where("!isNull(ExchangeId)")
nanExample = staticSource1.where("isNaN(Last)")
notNanExample = staticSource1.where("!isNaN(Last)")
neitherNanNull = staticSource1.where("isNormal(Last)") # normal = not null nor NaN
eitherNanNull = staticSource1.where("!isNormal(Last)") # not normal
Head and Tail
first10k = staticSource2.head(10_000)
last10k = staticSource2.tail(10_000)
# note this would deliver a changing set if intraday (updating) data
first15Perc = staticSource2.headPct(0.15) # the first 15% of rows
last15Perc = staticSource2.tailPct(0.15) # the last 15% of rows
first20byKey = staticSource1.headBy(20, "USym") # first 20 rows for each USym
last20byKey = staticSource1.tailBy(20, "USym") # last 20 rows for each USym
Sort
oneSortTime = staticSource1.sort("Timestamp") # works for all data types, actually
oneSortNumber = staticSource1.sort("Last") # sorting a number
oneSortString = staticSource1.sort("Exchange") # sorting a string
sortDescExample = staticSource1.sortDescending("Timestamp") # highest to lowest
multiSortClassic = staticSource1.sort("Exchange","Timestamp") # Exchange then Time
SortPair = jpy.get_type("com.illumon.iris.db.tables.SortPair")
sortPairs = staticSource1.sort(SortPair.ascending("Exchange"),\
SortPair.descending("Timestamp"))
# Exchange ascending then Timestamp descending
reverseTable = staticSource1.reverse() # HEAVILY USED! Very cheap to support GUIs
reverseUpdating = tickingSource.reverse() # updates then come to the top of table
Select And Create New Columns
Option 1: Choose and add new columns -- Calculate and write to memory
Use select() and update() when the result is expensive to calculate or is accessed frequently.
selectColumns = staticSource2.select("EODTimestamp", "Ticker", "Open", "Close")
# constrain to only those 4 columns, write to memory
selectAddCol = staticSource2.select("EODTimestamp", "Ticker", "Open", "Close",\
"Change = Close - Open")
# constrain and add a new column calculating
selectAndUpdateCol = staticSource2.select("EODTimestamp", "Ticker", "Open",\
"Close").update("Change = Close - Open")
# add a new column calculating - logically equivalent to previous example
Option 2: Choose and add new columns -- Reference a formula and calc on the fly
Use view() and updateView() when the formula is quick or only a portion of the data is used at a time. This minimizes the RAM used.
viewColumns = staticSource2.view("EODTimestamp", "Ticker", "Open", "Close")
# similar to select(), though uses on-demand formula
viewAddCol = staticSource2.updateView("EODTimestamp", "Ticker", "Open", "Close",\
"Change = Close - Open")
# view set and add a column, though with an on-demand formula
viewAndUpdateViewCol = staticSource2.view("EODTimestamp", "Ticker", "Open",\
"Close").updateView("Change = Close - Open")
# logically equivalent to previous example
Option 3: Add new columns -- Reference a formula and calc on the fly
Use lazyUpdate() when there are a small-ish number of unique values; on-demand formula results are stored in cache and re-used.
lazyUpdateEx = staticSource2.lazyUpdate("EODTimestamp", "Ticker", "Open", "Close",\
"Change = Close - Open")
# similar to update(), though uses on-demand formula
Using the updateView (or equivalently update, select, view) methods
Getting the row number
getTheRowNumber = staticSource2.updateView("RowNum = i")
Doing Math
simpleMath = staticSource2.updateView("RandomNumber = Math.random()",\
"RandomInt100 = new Random().nextInt(100)",\
"Arithmetic = Close * Volume",\
"SigNum = signum(RandomNumber - 0.5)",\
"Signed = SigNum * Arithmetic",\
"AbsDlrVolume = abs(Signed)",\
"Power = Math.pow(i, 2)",\
"Exp = Close * 1E2",\
"Log = Close * log(2.0)",\
"Round = round(Close)",\
"Floor = floor(Close)",\
"Mod = RandomInt100 % 5",\
"CastInt = (int)AbsDlrVolume",\
"CastLong = (long)AbsDlrVolume")
Handling Arrays
arrayExamples = staticSource2.updateView("RowNumber = i",\
"PrevRowReference = Close_[i-1]",\
"MultipleRowRef = Close_[i-2] - Close_[i-1]",\
"SubArray = Close_.subArray(i-2, i+1)",\
"SumSubArray = sum(Close_.subArray(i-2, i+1))",\
"ArraySize = SubArray.size()",\
"ArrayMin = min(SubArray)",\
"ArrayMax = max(SubArray)",\
"ArrayMedian = median(SubArray)",\
"ArrayAvg = avg(SubArray)",\
"ArrayStd = std(SubArray)",\
"ArrayVar = var(SubArray)",\
"ArrayLast = last(SubArray)",\
"ArrayIndex = SubArray[1]",\
"InArray = in(45.71, SubArray)")
Calculate Percentile in a series
viewAndSortPrices = staticSource2.view("Date = formatDate(EODTimestamp, TZ_NY)",\
"Ticker", "ClosingPrice = Close")\
.sort("Ticker", "ClosingPrice")
closingPriceArraysByTicker = viewAndSortPrices.by("Ticker")
# join two tables above using natural join
priceAndArray = viewAndSortPrices.naturalJoin(closingPriceArraysByTicker, "Ticker",\
"PriceArray = ClosingPrice").sort("Ticker", "Date")
#use binSearch to drive the percentage calculation
percRank = priceAndArray.updateView("ArraySize = PriceArray.size()",\
"BinRank = binSearchIndex(PriceArray, ClosingPrice, BS_LOWEST)",\
"PercRank = BinRank / ArraySize",\
"Perc25 = percentile(PriceArray, 0.25)",\
"Median = median(PriceArray)",\
"Perc75 = percentile(PriceArray, 0.75)")
Manipulate Time And Calendars
# time and business-calendar manipulation examples, one derived column each
# FIX: IsBizTime now calls isBusinessTime() — it previously duplicated the
# isBusinessDay(EODTimestamp) check on the line above it
timeStuff = staticSource2.updateView("CurrentTime = currentTime()",\
"CurrentDateDefault = currentDay()",\
"CurrentDateNy = currentDateNy()",\
"CurrentDateLon = currentDate(TZ_LON)",\
"LastBizDateNy = lastBusinessDateNy()",\
"IsAfter = CurrentTime > EODTimestamp",\
"IsBizDay = isBusinessDay(CurrentDateLon)",\
"StringDT = format(EODTimestamp, TZ_NY)",\
"StringDate = formatDate(EODTimestamp, TZ_NY)",\
"StringToTime = convertDateTime(StringDate + `T12:00 NY`)",\
"AddTime = EODTimestamp + '05:11:04.332'",\
"PlusHour = EODTimestamp + HOUR",\
"LessTenMins = EODTimestamp - 10 * MINUTE",\
"DiffYear = diffYear(EODTimestamp, currentTime())",\
"DiffDay = diffDay(EODTimestamp, CurrentTime)",\
"DiffNanos = PlusHour - EODTimestamp",\
"DayWeek = dayOfWeek(EODTimestamp, TZ_NY)",\
"HourDay = hourOfDay(EODTimestamp, TZ_NY)",\
"DateAtMidnight = dateAtMidnight(EODTimestamp, TZ_NY)",\
"IsBizDayString = isBusinessDay(StringDate)",\
"IsBizDayDatetime = isBusinessDay(EODTimestamp)",\
"IsBizTime = isBusinessTime(EODTimestamp)",\
"FracBizDayDone = fractionOfBusinessDayComplete(currentTime())",\
"FracBizDayOpen = fractionOfBusinessDayRemaining(currentTime())",\
"NextNonBizDay = nextNonBusinessDay()",\
"NextBizDayCurrent = nextBusinessDay()",\
"NextBizDayString = nextBusinessDay(StringDate)",\
"NextBizDayDatetime = nextBusinessDay(EODTimestamp)",\
"PlusFiveBizDayCurrent = nextBusinessDay(5)",\
"PlusFBizDayString = nextBusinessDay(StringDate, 5)",\
"PlusFBizDayDatetime = nextBusinessDay(EODTimestamp, 5)",\
"PrevBizDayCurrent = previousBusinessDay()",\
"PrevBizDayString = previousBusinessDay(StringDate)",\
"PrevBizDayDatetime = previousBusinessDay(EODTimestamp)",\
"MinusFiveBizDayCurrent = previousBusinessDay(5)",\
"MinusFiveBizDayString = previousBusinessDay(StringDate, 5)",\
"MinusFiveBizDayDatetime = previousBusinessDay(EODTimestamp, 5)",\
"BizDayArray = businessDaysInRange(EODTimestamp, currentTime())",\
"NonBizDayArray = nonBusinessDaysInRange(EODTimestamp, currentTime())",\
"DiffBizDaysCount = numberOfBusinessDays(EODTimestamp, currentTime())",\
"DiffBizDaysExact = diffBusinessDay(EODTimestamp, currentTime())",\
"DiffBizDaysString = numberOfBusinessDays(MinusFiveBizDayString, StringDate)",\
"StandardBizDayNanos = standardBusinessDayLengthNanos()",\
"DiffBizSecs = diffBusinessNanos(CurrentTime, CurrentTime + 5 * DAY) / 1E9",\
"LastBizOfMonth = isLastBusinessDayOfMonth(StringDate)",\
"LastBizOfWeek = isLastBusinessDayOfWeek(currentTime())")
Bin Data
bins = staticSource1.updateView("PriceBin = upperBin(Last, 20)",
"SizeBin = lowerBin(Size, 100)",\
"TimeBin = upperBin(ExchangeTimestamp, '00:15:00')")
aggBin = bins.view("TimeBin", "Dollars = Last * Size").sumBy("TimeBin")
# binning is a great pre-step for aggregating to support
# the down-sampling or other profiling of data
Manipulate Strings
stringStuff = staticSource2.view("StringDate = formatDate(EODTimestamp, TZ_NY)",\
"Ticker","Close",\
"NewString = `new_string_example_`",\
"ConcatV1 = NewString + Ticker",\
"ConcatV2 = NewString + `Note_backticks!!`",\
"ConcatV3 = NewString.concat(Ticker)",\
"ConcatV4 = NewString.concat(`Note_backticks!!`)",\
"StartBool = Ticker.startsWith(`M`)",\
"NoEndBool = !Ticker.endsWith(`OG`)",\
"ContainedBool = Ticker.contains(`AA`)",\
"NoContainBool = !Ticker.contains(`AA`)",\
"FirstChar = Ticker.substring(0,1)",\
"LengthString = Ticker.length()",\
"CharIndexPos = ConcatV1.charAt(19)",\
"SubstringEx = ConcatV1.substring(11,20)",\
"FindIt = NewString.indexOf(`_`)",\
"FindItMiddle = NewString.indexOf(`_`, FindIt + 1)",\
"FindLastOf = NewString.lastIndexOf(`_`)",\
"SplitToArrays = NewString.split(`_`)",\
"SplitWithMax = NewString.split(`_`, 2)",\
"SplitIndexPos = NewString.split(`_`)[1]",\
"LowerCase = Ticker.toLowerCase()",\
"UpperCase = NewString.toUpperCase()",\
"DoubleToStringv1 = Close + ``",\
"DoubleToStringv2 = String.valueOf(Close)",\
"DoubleToStringv3 = Double.toString(Close)",\
"StringToDoublev1 = Double.valueOf(DoubleToStringv1)")
Use Ternaries; If-Thens
ifThenexample = staticSource2\
.updateView("SimpleTernary = Close < 100 ? `smaller` : `bigger`",\
"TwoLayers = Close <= 10 ? `small` : Close < 100 ? `medium` : `large`",\
"Conj = Close < 100 && Ticker.contains(`A`) ? `special` : `boring`",\
"Disj = Close < 100 || Ticker.contains(`A`) ? `fave` : `other`",\
"PrevClose = Ticker_[i-1] = Ticker ? Close_[i-1] : NULL_DOUBLE")
Create And Use A Custom Function
def mult(a, b):
    """Return the product of a and b; invoked from query strings via mult.call()."""
    product = a * b
    return product
pyFuncExample = staticSource2.update("M=(double)mult.call(Close,Volume)")
Manipulate Columns
uniqueValues = staticSource2.selectDistinct("Ticker") # show unique set
# works on all data types - careful with doubles, longs
uniqueValuesCombo = staticSource2.selectDistinct("EODTimestamp", "Ticker")
# unique combinations of EODTime with Ticker
renameStuff = staticSource2.renameColumns("USym = Ticker", "ClosingPrice = Close")
dropColumn = staticSource2.dropColumns("Volume") # same for drop one or many
dropColumns = staticSource2.dropColumns("ImportDate", "Open", "Volume")
putColsAtStart = staticSource2.moveUpColumns("Ticker", "Volume") # Makes 1st col(s)
putColsWherever = staticSource2.moveColumns(1, "Ticker", "Volume")
# makes Ticker the 2nd and Volume the 3rd column
colDescription = staticSource1.withColumnDescription("USym", "Underlying Symbol")
# when user hovers on column header the description is visible
Group And Aggregate
Simple Grouping
groupToArrays1 = staticSource1.by("USym")
# one row per key (i.e. USym), all other columns are arrays
multipleKeys = staticSource1.by("USym", "Exchange")
# one row for each key-combination (i.e. USym-Exchange pairs)
Un-Grouping
# Run these lines together
aggByKey = staticSource1.by("Date")
# one row per Date, other fields are arrays from staticSource1
ungroupThatOutput = aggByKey.ungroup() # no arguments usually
# each array value becomes its own row
# in this case turns aggByDatetimeKey back into staticSource1
Aggregations
# IMPORTANT: Any columns not in the parentheses of the whateverBy("Col1", "Col2") statement,
# needs to be an ok type for that aggregation method -- i.e. sums need to have all non-key columns be numbers.
firstByKey = staticSource1.firstBy("Date")
firstByTwoKeys = staticSource1.firstBy("Date", "USym") # All below work with multi
countOfEntireTable = staticSource1.countBy("USym") # single arg returns tot count
countOfGroup = staticSource1.countBy("N", "USym") # N is special here
firstOfGroup = staticSource1.firstBy("USym")
lastOfGroup = staticSource1.lastBy("USym")
sumOfGroup = staticSource1.view("USym", "Last").sumBy("USym")
# non-key field must be numerical
avgOfGroup = staticSource1.view("USym", "Last").avgBy("USym")
stdOfGroup = staticSource1.view("USym", "Last").stdBy("USym")
varOfGroup = staticSource1.view("USym", "Last").varBy("USym")
medianOfGroup = staticSource1.view("USym", "Last").medianBy("USym")
minOfGroup = staticSource1.view("USym", "Last").minBy("USym")
maxOfGroup= staticSource1.view("USym", "Last").maxBy("USym")
## Combined Aggregations
# combine aggregations in a single method (using the same key-grouping)
from deephaven import *
combinationAgg = staticSource1.updateView("Dollars = Last * Size")\
.by(caf.AggCombo(\
caf.AggLast("LastPrice = Last","LastTimestamp = Timestamp"),\
caf.AggCount("Size"),\
caf.AggSum("Shares = Size", "Dollars"),\
caf.AggFirst("OpenPrice = Last"),\
caf.AggMax("HighPrice = Last"),\
caf.AggMin("LowPrice = Last"),\
caf.AggAvg("AvgPrice = Last"),\
caf.AggWAvg("Size", "WtdAvgPrice = Last"),\
caf.AggVar("VarPrice = Last"),\
caf.AggStd("StdPrice = Last"),\
caf.AggMed("MedianPrice = Last"),\
caf.AggPct(0.75, "Perc75Price = Last"),\
), "Date", "USym")\
# see the two keys -- Date & Sym, so agg is within each unique pair
# note the ability to rename columns during the process
# observe the updateView of Dollars then used in the aggregation
# weighted average is nice syntax
Join Data From Multiple Tables
For SQL developers: Joins in Deephaven are used to extend result sets by joining data from other tables, not as much for filtering.
Joins that get used a lot
Natural Join
Important
The right table of the join needs to have only one match based on the key(s).
# run these together
lastPriceTable = staticSource1.view("USym", "Last", "Timestamp")\
.lastBy("USym")\
.renameColumns("LastPrice = Last", "LastTime = Timestamp")
# creates 3-column table of USym + last record of the other 2 cols
joinWithLastPrice = staticSource1.naturalJoin(lastPriceTable, "USym")
# arguments are (rightTableOfJoin, "JoinKey(s)")
# will have same number of rows as staticSource1 (left table)
# brings all non-key columns from lastPriceTable to staticSource1
# HEAVILY USED!
# conceptually similar to Excel vlookup()
specifyColsFromR = staticSource1.naturalJoin(lastPriceTable, "USym", "LastPrice")
# nearly identical to joinWithLastPrice example
# args are (rightTableOfJoin, "JoinKey(s)", "Cols from R table")
renameColsOnJoin = staticSource1\
.naturalJoin(lastPriceTable,"USym","LP = LastPrice, LastTime")
# nearly identical to specifyColsFromR example
# can rename column on the join
# sometimes this is necessary…
# if there would be 2 cols of same name
# Note the placement of quotes
keysOfDiffNames = staticSource2.view("Ticker", "Close")\
.naturalJoin(lastPriceTable, "Ticker = USym")
# note that Ticker in the L table is mapped as the key to USym in R
# this result has many null records, as there are many Tickers
# without matching USyms
Multiple keys
# run these together
lastPriceTwoKeys = staticSource1.view("Date", "USym", "Last")\
.lastBy("Date", "USym")\
.renameColumns("LastPrice = Last")
# creates 3-column table of LastPrice for each Date-USym pair
natJoinTwoKeys = staticSource1\
.naturalJoin(lastPriceTwoKeys, "Date, USym", "LastPrice")
# arguments are (rightTableOfJoin, "JoinKey1, JoinKey2", "Col(s)")
# Note the placement of quotes
AJ (As-Of Join)
As-of joins are the time-series joins vital to capital-markets use cases. It is often wise to make sure the right table is sorted (based on the key). aj is designed to find "the exact match" of the key or "the record just before". For timestamp aj-keys, this means "that time or the record just before".
# NEEDED: # MUST RUN "justAapl" query below to support all other aj examples!
justAapl = staticSource1.where("USym = `AAPL`").view("Timestamp", "AaplPrice = Last")
# just the AAPL trade rows
ajTimeSeriesJoin = staticSource1.aj(justAapl, "Timestamp")
# joins by looking at Timestamp in staticSource1 (Left Table)
# and finding exact match or "most adjacent before" Timestamp
# in justAapl (Right Table)
rajTimeSeriesJoin = staticSource1.raj(justAapl, "Timestamp")
# note the 'r' in front of the aj
# reverse aj finds the matching timestamp or the one after
ajPickAndNameCols = staticSource1\
.aj(justAapl, "Timestamp", "AaplPrice, AaplTime = Timestamp")
# Note the placement of quotes
ajKeysDiffNames = staticSource1.dropColumns("Timestamp")\
.aj(justAapl, "ExchangeTimestamp = Timestamp")
# ExchangeTimestamp from Left Table drives join with Timestamp
# from Right Table
# the fundamental below is used for TCA, slippage, simulation, etc.
ajMultipleKeysDiffNames = staticSource1.aj\
(staticSource1.updateView("Time10min = Timestamp + 10 * MINUTE"),\
"Date, USym, Timestamp = Time10min",\
"Price10Min = Last")
# join a series on the same series 10 minutes later
# Note the three keys -- matches Date, then USym…
# then does a time-series join on the last key
Less common joins
# exact joins require precisely one match in the right table for each key
lastPriceAug23 = staticSource1.where("Date = `2017-08-23`").lastBy("USym")
lastPriceAug24 = staticSource1.where("Date = `2017-08-24`").lastBy("USym")\
.whereIn(lastPriceAug23, "USym")
# filters for USyms in lastPriceAug23
exactJoinEx = lastPriceAug23.exactJoin(lastPriceAug24, "USym", "NextClose = Last")
# join will find all matches from left in the right table
# if a match is not found, that row is omitted from the result
# if the Right-table has multiple matches, then multiple rows are included in result
lastPriceSS1 = staticSource1.lastBy("USym").view("USym", "Last")
lastPriceSS2 = staticSource2.lastBy("Ticker")\
.view("USym = Ticker", "Price = Close")
firstPriceSS2 = staticSource2.firstBy("Ticker")\
.view("USym = Ticker", "Price = Close")
mergeFirstAndLastSS2 = ttools.merge(firstPriceSS2, lastPriceSS2)
# the table has two rows per USym
joinExample = lastPriceSS1.join(mergeFirstAndLastSS2, "USym", "Price")
# note that the resultant set has two rows per USym
Other Useful Methods
Use columns as arrays and cells as variable
mostActiveUSym = staticSource1.countBy("N", "USym").sortDescending("N")
getAColumn = mostActiveUSym.getColumn("USym").getDirect()
# this creates a java array of the Column's values (USym, in this case)
getARecord = mostActiveUSym.getColumn("USym").getDirect()[1] # indexing starts at 0
getACell = mostActiveUSym.getColumn("USym").get(0)
# when to use each of the last two is beyond this basic introduction
# mostly relates to when iterating over an entire column
print(getAColumn)
print(getARecord)
print(getACell)
Read and write csv
It is very easy to import CSVs via the Code Studio UI and to export any Table to a CSV via the Table Widget (in Code Studios and Dashboard Tables).
CSVs imported from the client need to be done via Code Studio UI.
CSVs imported from a server-side directory should be done via the script below.
readCSV("filepath", "delimiter") # read CSV file on query server into table
readHeaderlessCsv("filepath", "delimiter")
writeCsv(table, "filePath") # writes a Deephaven Table out as a CSV
Write a Table to a Namespace
Though the below uses a namespace called ExampleNamespace, it is best practice for teams to establish a naming protocol for these directories.
# create some tables to use in the example
aapl100 = staticSource1.where("USym = `AAPL`").head(100)
goog100 = staticSource1.where("USym = `GOOG`").head(100)
aapl100_wDlr = staticSource1.where("USym = `AAPL`").tail(100)\
.update("Dollars = Last * Size")
# add local table from Code Studio to Deephaven servers
db.addTable("ExNamespace", "ExTable", aapl100)
seeTableJustAdded = db.i("ExNamespace", "ExTable").where()
# append to that table
db.appendTable("ExNamespace", "ExTable", goog100)
nowSeeAppendedTable = db.i("ExNamespace", "ExTable").where()
# remove and re-create a new table under that table name
db.replaceTable("ExNamespace", "ExTable", aapl100_wDlr)
nowSeeThatTable = db.i("ExNamespace", "ExTable").where()
# totally remove that table
# db.removeTable("ExNamespace", "ExTable")
# note the below now fails because the table does not exist
# thisWillFail = db.i("ExNamespace", "ExTable").where()
Do Cum-Sum and Rolling Average
# create a function to determine if an array of Tickers has only this-Ticker in it
from deephaven import *
def hasOthers(array, target):
    """Return True if `array` contains any element different from `target`.

    Invoked from query strings (via hasOthers.call) to detect whether a
    trailing window of Tickers mixes in rows from other Tickers, so rolling
    stats can be nulled out at Ticker boundaries.

    :param array: iterable of values (e.g. a Java array converted with toArray())
    :param target: the value every element is expected to equal
    :return: True if any element differs from target; False otherwise,
             including for an empty array
    """
    # any() short-circuits on the first mismatch, like the original loop
    return any(x != target for x in array)
###
makeArrays= staticSource2.updateView("TickerArray10 = Ticker_.subArray(i-10, i-1)",\
"PriceArray10 = Close_.subArray(i-10, i-1)",\
"VolumeArray10 = Volume_.subArray(i-10, i-1)")
###
CumAndRoll10 = makeArrays.update("ArrayHasOtherTickers = \
hasOthers.call(TickerArray10.toArray(), Ticker)",\
"CumSum10 = ArrayHasOtherTickers = false ? \
(int)sum(VolumeArray10) : NULL_INT",\
"RollAvg10 = ArrayHasOtherTickers = false ? \
avg(PriceArray10) : NULL_DOUBLE")
Another Example of Creating a Rolling Sum
def rollingSum(rows, values):
    """
    Calculate a rolling (windowed) sum over a java int array.

    :param rows: size of the rolling sum window (i.e. number of rows to sum over)
    :param values: a java int array (e.g. a grouped column), read via .get(i)
    :return: a java int array of the same length, where element i holds the sum
             of the window ending at i (a partial sum for the first rows-1 elements)
    """
    calculations = jpy.array('int', values.size()) # output array, same length as the input
    sum_ = 0 # our running sum (avoid builtin symbol sum)
    for i in range(values.size()):
        sum_ += values.get(i)
        if i >= rows:
            # subtract the value that just fell out of the window
            sum_ -= values.get(i - rows)
        calculations[i] = sum_
    return calculations
trades = db.t("LearnDeephaven", "StockTrades").where("Date=`2017-08-25`")\
.view("Sym", "Size")\
.headBy(100, "Sym")
symGroups = trades.by("Sym")
sums = symGroups.updateView("RollingSum=(int[]) rollingSum.call(20, Size)")
ungroup = sums.ungroup()
Setting up an EMA
# reference requisite libraries
from deephaven import *
from deephaven.DBTimeUtils import *
ByEmaSimple = jpy.get_type("com.illumon.numerics.movingaverages.ByEmaSimple")
BadDataBehavior = jpy.get_type("com.illumon.numerics.movingaverages.ByEma$BadDataBehavior")
MaMode = jpy.get_type("com.illumon.numerics.movingaverages.AbstractMa$Mode")
TimeUnit = jpy.get_type("java.util.concurrent.TimeUnit")
# configure EMA parameters
# note TIME below
ema_price_10min = ByEmaSimple(BadDataBehavior.BD_SKIP, BadDataBehavior.BD_SKIP, MaMode.TIME, 10.0, TimeUnit.MINUTES)
ema_price_60min = ByEmaSimple(BadDataBehavior.BD_SKIP, BadDataBehavior.BD_SKIP, MaMode.TIME, 60.0, TimeUnit.MINUTES)
# see TICK below
ema_price_100ticks = ByEmaSimple(BadDataBehavior.BD_SKIP, BadDataBehavior.BD_SKIP, MaMode.TICK, 100, None)
# calculate EMA within update (or updateView, select, view) method
withEmas = staticSource1.update(
"EmaMin10=ema_price_10min.update(Timestamp, Last, USym)",\
"EmaMin60=ema_price_60min.update(Timestamp, Last, USym)",\
"EmaTick100=ema_price_100ticks.update(Timestamp, Last, USym)")
# notice the EMA is calculated by USym
# TradingHourseEma also exists, but is beyond the scope of this document
# That can be tied to a business calendar.
# Datetime method diffBusinessNanos() can also be helpful for that.
Use CumulativeUtil
# The CumulativeUtil class provides methods to simplify rolling and cumulative
# aggregations of double values
cu = jpy.get_type("com.illumon.iris.db.v2.utils.CumulativeUtil")
t=db.t("LearnDeephaven","StockTrades")\
.where("Date=`2017-08-25`")\
.update("Volume=Last*Size")
t2=t.sort("Timestamp")
CumulativeMarketSum=cu.cumSum(t2,"CumulativeVolume","Volume")
CumulativeSumBySymbol=cu.cumSum(t,"Sym","CumulativeVolume","Volume")
# When using these methods with intraday (ticking) tables,
# it must be ensured that the appends of new rows are the only changes that happen to the source tables.
# This can be done using the assertAppendOnly method.
# If changes other than appends occur in the underlying table,
# an exception will be thrown, ending the query.
# This ensures that the results returned by CumulativeUtil methods are accurate.
ta = jpy.get_type("com.illumon.iris.db.tables.verify.TableAssertions")
tickingTable = db.i("DbInternal", "ProcessEventLog").where("Date=currentDateNy()")
appendOnlyTickingTable = ta.assertAppendOnly(tickingTable)
# CumulativeUtil methods would then be applied to appendOnlyTickingTable.
Use Numpy And Pandas
# Convert Deephaven tables to Pandas / NumPy structures.
from deephaven import *
from deephaven import npy
from deephaven import tableToDataFrame
import numpy as np
import pandas as pd
# note these conversions are less immediately available than DH tables
# constrained by memory available
# no updates -- ticking sources are snapshotted
pandasDF = tableToDataFrame(staticSource1.where("Date = `2017-08-25`"))
# do Pandas stuff to prove it is a Pandas Dataframe
print(type(pandasDF))
print(pandasDF.dtypes)
print(pandasDF.describe())
# if all the columns can be of the same data type...
# and if the data set is very large,
# it may be faster to make a Pandas Dataframe via Numpy
# NOTE(review): numpy_slice presumably copies rows [0, size) into one
# float32 ndarray -- confirm dtype fits all three columns before relying on it
justNumbers = staticSource1.view("Last", "Size", "ExchangeId")
pandasDF_3Cols = pd.DataFrame(
data=npy.numpy_slice(justNumbers, 0, justNumbers.size(), dtype=np.float32),
columns=justNumbers.getDefinition().getColumnNamesArray()
)
print(type(pandasDF_3Cols))
print(pandasDF_3Cols.dtypes)
# get an example table to work with
aaplTrades = staticSource1.where("Date = `2017-08-25`", "USym = `AAPL`")
# IMPORTANT: Conversions from DH Tables to NumPy arrays are to only one-dimensional arrays!
# make numPy arrays of single columns
npArray_Price = columnToNumpyArray(aaplTrades, "Last")
npArray_TradeSize = columnToNumpyArray(aaplTrades, "Size")
# do NumPy stuff
print(type(npArray_Price))
print(npArray_Price.size)
print(npArray_Price[10:20])
print(npArray_Price.max())
print(npArray_TradeSize.sum())
print(np.unique(npArray_TradeSize))
# multiply arrays element-wise: dollars traded = price * shares
npArray_Dollars = np.multiply(npArray_Price, npArray_TradeSize)
# make a 3xN NumPy array (axis=1 stacks the three 1-D arrays as columns)
np3Cols = np.stack((npArray_Price, npArray_TradeSize, npArray_Dollars), axis = 1)
print(np3Cols.shape)
# make a Pandas DF from the 3XN NumPy Array
pandasDF_fromNumPy = pd.DataFrame(np3Cols, columns = ['Price', 'Shares', 'Dollars'])
# 'mean' is the pandas aggregation name; 'average' is not a Series method
# and fails on recent pandas versions
print(pandasDF_fromNumPy.aggregate(['sum', 'min', 'max', 'mean', 'median']))
# turn Pandas DF into Deephaven Table (pandasDF comes from the previous example)
dhTable = dataFrameToTable(pandasDF)
Format Tables
Datetime Formatting
# Datetime display formatting. The Date(`...`) strings use Java date-format
# pattern letters: yyyy year, MM month, dd day, HH hour, mm minute, ss second,
# S fractional second, z zone, EEE day-of-week. Note mm (minute) vs MM (month):
# the Time5 pattern originally used HH:MM:ss, which would render the month
# number in the minutes position -- fixed to HH:mm:ss.
timeFormatting = staticSource1.view("Time1 = Timestamp", "Time2 = Timestamp", "Time3 = Timestamp", "Time4 = Timestamp", "Time5 = Timestamp", "Time6 = Timestamp")\
.formatColumns("Time1=Date(`yyyy_MM_dd'T'HH-mm-ss.S z`)",
"Time2 = Date(`yyyy-MM-dd'T'HH:mm t`)",
"Time3 = Date(`HH:mm:ss`)",
"Time4 = Date(`HH:mm:ss.SSSSSSSSS`)",
"Time5 = Date(`EEE dd MMM yy HH:mm:ss`)",
"Time6 = Date(`yyyy-MM-dd`)")\
.updateView("Time7_string = formatDate(Time6, TZ_NY)")
Number Formatting
# Number display formatting with Decimal(`...`) patterns
# (# = optional digit, 0 = forced digit, % multiplies by 100, E = scientific).
# NOTE(review): Price3 is created but never given a format -- presumably an
# intentional unformatted baseline; confirm.
numberFormatting = staticSource1.view("Last",\
"BigNum1 = Last * 1000", "BigNum2 = Last * 1000",\
"BigNum3 = Last * 1000", "Price1 = Last", "Price2 = Last",\
"Price3 = Last",\
"SmallNum1 = Last / 1000", "SmallNum2 = Last / 1000",\
"SmallNum3 = Last / 1000", "TinyNum = Last / 1_000_000_000")\
.formatColumns("BigNum1 = Decimal(`###,#00`)",\
"BigNum2 = Decimal(`##0.00`)",\
"BigNum3 = Decimal(`0.####E0`)",\
"Price1 = Decimal(`###,###.##`)",\
"Price2 = Decimal(`$###,##0.00`)",\
"SmallNum1 = Decimal(`##0.000000`)",\
"SmallNum2 = Decimal(`##0.00%`)",\
"SmallNum3 = Decimal(`##0%`)",\
"TinyNum = Decimal(`0.00E0`)")
# run these next three tables together
# create table for the example: random-scaled doubles plus integral casts
numTable = staticSource1.view("Last",\
"Random1 = Last * (new Random().nextDouble())",\
"Random2 = Last * (new Random().nextDouble())",\
"PriceLong = (long)Last", "PriceInt = (int)Last",\
"BigNumFloat = (float)(Last * 1000)")
# efficient syntax to round many float and double columns across a table
from deephaven import *
# round every float/double column in the table
roundColumns = ttools.roundDecimalColumns(numTable)
# round every float/double column except "Last" (variable name typo fixed:
# roundMostColums -> roundMostColumns)
roundMostColumns = ttools.roundDecimalColumnsExcept(numTable, "Last")
Color Formatting
# Static column coloring: hex string, named color, colorRGB(r,g,b),
# bgfg (explicit background+foreground), bgfga (background with automatic
# foreground), and colorHSL examples.
justColors = staticSource1\
.formatColumns("Timestamp = `#90F060`","SecurityType = LIGHTBLUE",\
"Exchange = colorRGB(247,204,0)",\
"USym = colorRGB(247,204,204)",\
"Sym = bgfg(colorRGB(57,43,128),colorRGB(243,247,122))",\
"Last = bgfga(MEDIUMVIOLETRED)",\
"Size = colorHSL(0, 24, 36)")
# need this for the last part of conditionalColors script
from deephaven import *
# DistinctFormatter assigns a distinct color per distinct cell value
colorMe = DistinctFormatter()
# conditional coloring: per-column where-filters, per-row filters, ternary
# formulas, heatmap ranges, and value-keyed distinct colors.
# NOTE(review): the single "=" equality inside the ternary conditions below
# relies on Deephaven's filter-style parsing -- confirm it is accepted in
# formatColumns formulas (vs requiring "==")
conditionalColors = staticSource1.where("Date = `2017-08-25`")\
.updateView("RowMod10 = i % 10")\
.formatColumnWhere("Source", "SaleCondition = `@FTI`", "DEEP_RED")\
.formatColumnWhere("USym", "Last > Last_[i-1]", "LIMEGREEN")\
.formatRowWhere("ExchangeId % 10 = 0", "colorHSL(260, 94, 52)")\
.formatColumns("Size = (Size % 5 !=0) ? ORANGE : BLACK",\
"SecurityType = (Exchange = `Nasdaq` && SaleCondition = `@TI`) ? colorRGB(253, 31, 203) : colorRGB(171, 254, 42)",\
"RowMod10 = heatmap(RowMod10, 1.0, 10.0, BRIGHT_GREEN, BRIGHT_RED)",\
"Exchange = colorMe.getColor(Exchange)")
# one can formatColumns() for numbers and colors together
numberAndColor = staticSource1.formatColumns("Last = Decimal(`##0.00`)",\
"Last = (Last > Last_[i-1]) ? FUCHSIA : GREY")
A list of the canned (named) colors can be found in the Deephaven color-formatting documentation.
Plot Programmatically
Substantial documentation about plotting exists. The following example intends to show the basics, and everything about styling and labeling is omitted.
Note
Run the following first, to create some tables to use in the plot examples below.
# Build the example tables used by all the plot snippets below.
from deephaven import *
from deephaven import Calendars
from deephaven import Plot
cal = Calendars.calendar("USNYSE")
# four liquid symbols, business hours only, normal sale conditions,
# trade sizes between 100 and 10,000
fourUSym = staticSource1.where("USym in `GOOG`,`AAPL`,`CSCO`, `MSFT`",\
"cal.isBusinessTime(ExchangeTimestamp)", "SaleCondition!=`@W`",\
"inRange(Size, 100, 10_000)")\
.sort("Date", "USym", "ExchangeTimestamp")
fourUSymOneDay = fourUSym.where("Date = `2017-08-25`")
# add per-USym opening price, intraday percent change, and the as-of-joined
# concurrent MSFT price
fourUSymBest = fourUSymOneDay\
.naturalJoin(fourUSymOneDay.firstBy("USym"), "USym", "Open = Last")\
.updateView("PercChange = Last / Open - 1")\
.aj(fourUSymOneDay.where("USym = `MSFT`"),\
"ExchangeTimestamp", "MsftPrice = Last")
justAaplMultiday = fourUSym.where("USym = `AAPL`")
justAaplBest = fourUSymBest.where("USym = `AAPL`")
# end-of-day dollar volume per ticker, with a yyyy-MM month bucket
dlrVolume = staticSource2.where("Ticker in `AAPL`, `CSCO`, `GOOG`,\
`MSFT`").update("DlrVolume = Close * Volume",\
"Month = formatDate(EODTimestamp, TZ_NY).substring(0,7)")
Time series plots
# Basic time-series plots; .show() renders the figure in the UI.
timeSeriesPlot = Plot.plot("", justAaplBest, "ExchangeTimestamp", "Last").show()
# note .show()
# two series on two independent y-axes, joined by .twinX()
timeSeries2axes = Plot.plot("MSFT Series", fourUSymBest.where("USym = `MSFT`"),\
"ExchangeTimestamp", "Last")\
.twinX()\
.plot("GOOG Series", fourUSymBest.where("USym = `GOOG`"),\
"ExchangeTimestamp", "Last").show()
# note the first parameter .plot("this-one"... could not be ""
# (each series in a shared figure needs a distinct, non-empty name)
# see .twinX()
timeSeries3axes = Plot.plot("MSFT Series", fourUSymBest.where("USym = `MSFT`"),\
"ExchangeTimestamp", "Last")\
.twinX()\
.plot("GOOG Series", fourUSymBest.where("USym = `GOOG`"),\
"ExchangeTimestamp", "Last")\
.twinX()\
.plot("CSCO Series", fourUSymBest.where("USym = `CSCO`"),\
"ExchangeTimestamp", "Last").show()
# can do more than 3
Multiple series on the same axis
# Two series sharing a single y-axis: chain .plot() calls without .twinX().
timeSeriesSameAxis = Plot.plot("MSFT %-Chge", fourUSymBest.where("USym = `MSFT`"),\
"ExchangeTimestamp", "PercChange")\
.plot("GOOG %-Chge", fourUSymBest.where("USym = `GOOG`"),\
"ExchangeTimestamp", "PercChange").show()
# no .twinX() in this case
Bar chart
# Bar chart: same plot call, rendered with plotStyle("Bar").
barChart = Plot.plot("", dlrVolume.where("Ticker = `AAPL`"),\
"EODTimestamp", "DlrVolume")\
.plotStyle("Bar").show()
Plot-by-some key
# plotBy: one series per distinct value of the trailing key column(s).
plotBySeries = Plot.plotBy("Percent Change", fourUSymBest, "ExchangeTimestamp",\
"PercChange", "USym").show()
# see that the last overload "USym", in this case, is the key
# in this case this will create 4 series for the respective 4 USyms
Stacked Area
# Stacked-area chart: plotBy keyed on Ticker, styled as "stacked_area".
stackedArea = Plot.plotBy("Dlr Volume", dlrVolume, "EODTimestamp",\
"DlrVolume", "Ticker")\
.plotStyle("stacked_area").show()
Area graph
# Restyle an existing figure object (timeSeriesSameAxis, defined above) as Area.
viewAsArea = timeSeriesSameAxis.plotStyle("Area").show()
# see that it references the plotting object of the previous script
Business time
# Multi-day plot with raw wall-clock x-axis (includes overnight/weekend gaps).
uglyRawMultiday = Plot.plot("", justAaplMultiday, "ExchangeTimestamp", "Last")\
.show()
# use xBusinessTime() to get rid of overnight and weekend time
prettyWithBizTimeConstraint = Plot.plot("", justAaplMultiday, "ExchangeTimestamp",\
"Last")\
.xBusinessTime().show()
# this relies on the prevailing biz-time calendar that is set
Scatter
# Scatter plot; "i % 10 = 0" keeps every 10th row as manual down-sampling.
scatterPlot = Plot.plot("", justAaplBest.where("i % 10 = 0"), "MsftPrice", "Last")\
.plotStyle("scatter").show()
# Note the manual down-sampling above
# Without it, for large tables (this one is 87k records) the UI will
# make you choose whether you really want to graph so many points
# Deephaven smart-auto-downsampling is only for line plots
Histogram
# Histogram of Last prices with 20 bins, and a categorical histogram by USym.
plotHistogram = Plot.histPlot("", justAaplBest, "Last", 20).show()
catHistogramBySym = Plot.catHistPlot("", fourUSymBest, "USym").show()
# de facto count of rows
Open-high-low-close
# Open-high-low-close plot from the EOD table's four price columns.
plotOHLC = Plot.ohlcPlot("", staticSource2.where("Ticker = `AAPL`"), "EODTimestamp", "Open", "High", "Low", "Close").show()
# syntax exists to profile data into time bins with OHLC
Error Bar Plot
# Error-bar plot: bin trades into 15-minute buckets, compute mean and stddev
# of Last per bucket, then plot mean with +/- one stddev as the error bars.
# NOTE(review): caf is presumably the ComboAggregateFactory alias brought in
# by "from deephaven import *" -- confirm.
errorBarTable = justAaplBest\
.updateView("TimeBin=upperBin(ExchangeTimestamp, 15 * MINUTE)")\
.by(caf.AggCombo(caf.AggAvg("AvgPrice = Last"),\
caf.AggStd("StdPrice = Last")), "TimeBin")\
.updateView("Low = AvgPrice - StdPrice", "Hi = AvgPrice + StdPrice")
errorBarPlot = Plot.errorBarY("", errorBarTable,"TimeBin",\
"AvgPrice", "Low", "Hi").show()
One-click plotting
One-click plotting allows a script writer to designate a key that users can enter into a one-click or linker experience, thereby toggling the plot to filter on the fly.
Users will need to add an "Input Filter" control, or configure the Linker tool, in their UI to manipulate the One-Click plots. See the web controls documentation.
# One-click plot: wrap the table in a SelectableDataSet keyed on USym, then
# plot it; the UI's Input Filter / Linker drives the filtering.
oneClickEx = Plot.oneClick(fourUSymBest, "USym")
# this maps the one-click experience to the key of USym (in this case)
# there is no table visible from this
# in DH-speak it saves the OneClick SelectableDataSet to the variable
oneClickPlot = Plot.plot("USym", oneClickEx, "ExchangeTimestamp", "Last").show()
Another one-click example
# One-click area plot keyed on Month: sum dollar volume per month, then plot.
# Fixes vs original: removed the stray trailing "\" after .sort(...), which
# merged this assignment into the next statement, and corrected the undefined
# name datasetHolder3 to the variable actually defined, datasetHolder2.
datasetHolder2 = Plot.oneClick(dlrVolume.view("Month", "EODTimestamp", "DlrVolume")\
.sumBy("Month", "EODTimestamp").sort("EODTimestamp"), "Month")
oneClickPlot2 = Plot.plot("Dlr Volume", datasetHolder2, "EODTimestamp","DlrVolume")\
.plotStyle("Area").show()
# change the Input Filter in the UI to 'Month'
# then, for example, type in that box 2017-03