Monitoring persistent query performance with dashboards

This guide will show you how to monitor the performance of your persistent queries by creating a dashboard of Deephaven's internal tables. These tables include heap usage and worker, port, server, and host information. The same source data can be used to drill down into the performance of a specific query worker, as discussed in our Troubleshooting and monitoring queries with internal tables guide.

Dashboard for monitoring persistent query memory and initialization times

The following code example creates a dashboard including graphs of initialization times and heap usage of the last seven days of a specific PQ, and a graph of the heap usage throughout the day of a single run.

img

The two tables, PQHistory and Persistent Query Configurations, include helpful data such as query status (running, failed, etc.) and the amount of memory configured vs amount of memory currently used.

Expand for full query:
import com.illumon.util.calendar.Calendars
import com.illumon.iris.db.tables.utils.DBTimeUtils

// Date bookmarks and a unit-conversion constant shared by the queries below.
date = currentDateNy()                                 // today's date (NY time zone)
previousDate = Calendars.calendar().previousDay(date)  // calendar day before `date`
oldestDate = Calendars.calendar().previousDay(7)       // 7 days before today — lookback window lower bound
bytesPerGB = 1024*1024*1024                            // divisor to convert bytes to gigabytes

// Latest non-removed configuration row for each persistent query.
// Historical (db.t) and intraday (db.i) config logs are each reduced to the
// last row per Name, merged, then lastBy("Name") keeps the most recent of the two.
Table PQC = merge(db.t("DbInternal", "PersistentQueryConfigurationLogV2")
    .lastBy("Name")
    .where("EventType != `REMOVED`"),
    db.i("DbInternal", "PersistentQueryConfigurationLogV2")
    .lastBy("Name")
    .where("EventType != `REMOVED`")).lastBy("Name")

// Most recent state today for every query that appears in PQC.
Table basePQs = db.i("DbInternal", "PersistentQueryStateLog")
    .where("Date=date")    // today's partition only (`date` is the script variable above)
    .whereIn(PQC, "Name")  // restrict to queries with a live configuration
    .view("Name","Owner","Status","ServerHost","DispatcherHost","WorkerName","ProcessInfoId")
    .lastBy("Name")

// Peak heap usage (in GB) per worker process over the lookback window.
// Intraday (db.i) and historical (db.t) metrics are merged; the final
// lastBy("ProcessId") keeps the most recent row for each process.
Table heapUsage = merge(
    db.i("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Heap.Used`")   // only the heap-used metric rows
    .lastBy("ProcessId")
    .updateView("MaxHeapUsedInGB=Max/bytesPerGB"),
    db.t("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Heap.Used`")
    .lastBy("ProcessId")
    .updateView("MaxHeapUsedInGB=Max/bytesPerGB"))
    .lastBy("ProcessId")

// One row per PQ: current state enriched with configuration (heap size,
// buffer-pool ratio) and the observed peak heap usage of its worker.
PQs = basePQs
    .naturalJoin(PQC, "Name", "Enabled,HeapSizeInGB,DataBufferPoolToHeapSizeRatio")
    .naturalJoin(heapUsage, "ProcessInfoId=ProcessId", "MaxHeapUsedInGB")
    .sort("Name")

// One-row-per-second ticking table; drives the snapshot of running queries below.
Table nowTable=timeTable("00:00:01").renameColumns("Now=Timestamp")

// Per-run aggregates (first-seen start, latest timestamp, last status) for PQ
// runs in the intraday state log, one row per ProcessInfoId.
// Fix: use the shared `oldestDate` cutoff rather than a stale hard-coded date
// (`2023-03-07`) so the window matches the historical branch used for PQHistory.
Table runningPQHistory1 = db.i("DbInternal","PersistentQueryStateLog").where("Date >= oldestDate","ProcessInfoId != null")
    .view("Date","Owner","Name","Timestamp","Status","DispatcherHost","WorkerName","ProcessInfoId","SerialNumber","VersionNumber")
    .by(AggCombo(AggMin("Name","DispatcherHost","WorkerName","SerialNumber","VersionNumber","Start=Timestamp"),AggMax("Timestamp"),AggLast("Status")),"ProcessInfoId")

// Snapshot of the live run aggregates, refreshed each second by nowTable.
// A still-running query's End is "now"; otherwise the last logged Timestamp.
Table runningPQHistory2=nowTable.snapshot(runningPQHistory1)
    .updateView("End=(Status==`Running` ? currentTime() : Timestamp)",
    "Duration_seconds=(DBTimeUtils.minus(End, Start)/1000000000)","Duration_minutes=Duration_seconds/60")  // minus() returns nanos
    .dropColumns("Now","Timestamp")

// Seven-day run history: historical runs (db.t) reduced to one row per
// ProcessInfoId, merged with the live snapshot (runningPQHistory2), then
// joined with peak heap usage and sorted newest-first.
PQHistory = merge(db.t("DbInternal","PersistentQueryStateLog").where("Date >= oldestDate","ProcessInfoId != null")
    .view("Date","Owner","Name","Timestamp","Status","DispatcherHost","WorkerName","ProcessInfoId","SerialNumber","VersionNumber")
    .by(AggCombo(AggMin("Name","DispatcherHost","WorkerName","SerialNumber","VersionNumber","Start=Timestamp"),AggMax("End=Timestamp"),AggLast("Status")),"ProcessInfoId")
    .updateView("Duration_seconds=(DBTimeUtils.minus(End, Start)/1000000000)","Duration_minutes=Duration_seconds/60")
    runningPQHistory2)
    .naturalJoin(heapUsage, "ProcessInfoId=ProcessId", "MaxHeapUsedInGB")
    .moveUpColumns("Start","End","Name")
    .sortDescending("Start")

// Heap usage time series (GB) per process over the lookback window,
// merging intraday and historical metrics.
Table heapOverTime = merge(
    db.i("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Heap.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"),
    db.t("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Heap.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"))
    .lastBy("Timestamp","ProcessId")   // de-duplicate rows present in both sources
    .updateView("HeapGB=HeapUsed/bytesPerGB")

// Non-heap usage time series (GB) per process. NOTE(review): the column is
// still aliased "HeapUsed"/"HeapGB" — presumably to reuse the same plot
// column name as heapOverTime; renaming would break the shared plots below.
Table nonHeapOverTime = merge(
    db.i("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-NonHeap.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"),
    db.t("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-NonHeap.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"))
    .lastBy("Timestamp","ProcessId")
    .updateView("HeapGB=HeapUsed/bytesPerGB")

// Direct (off-heap) memory usage time series (GB) per process; same shape
// and column aliases as heapOverTime so all three series plot together.
Table directOverTime = merge(
    db.i("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Direct.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"),
    db.t("DbInternal","ProcessMetrics")
    .where("Date >=oldestDate")
    .where("Name = `Memory-Direct.Used`")
    .view("Timestamp","ProcessId","HeapUsed=Last"))
    .lastBy("Timestamp","ProcessId")
    .updateView("HeapGB=HeapUsed/bytesPerGB")

// OneClick handles so each chart can be filtered to a single query (by Name)
// or a single worker process (by ProcessId) from the dashboard.
maxHeapHistoryOC = oneClick(PQHistory.where("MaxHeapUsedInGB > 0"), "Name")  // drop runs with no heap data
hotOC = oneClick(heapOverTime, "ProcessId")
nhotOC = oneClick(nonHeapOverTime, "ProcessId")
dotOC = oneClick(directOverTime, "ProcessId")

// Chart 1: heap / non-heap / direct usage over time for one worker run.
// Chart 2: peak heap usage per execution, categorized by run start time.
MemoryUsageSingleRun =
    plot("Heap Usage (GB)", hotOC, "Timestamp", "HeapGB")
    .plot("Non-Heap Usage (GB)", nhotOC, "Timestamp", "HeapGB")
    .plot("Direct Usage (GB)", dotOC, "Timestamp", "HeapGB")
    .show()
HeapMaxUsageHistory = catPlot("Heap Usage (GB) By Execution", maxHeapHistoryOC, "Start", "MaxHeapUsedInGB").show()

To filter the graphs to a specific query, use a OneClick panel or the Linker tool to link the graphs to the dashboard's PQHistory and PQ configuration tables.

Underlying data

This dashboard uses several tables within the DbInternal namespace. It can be helpful to know what data is available in case you want to create your own views.

  1. The PersistentQueryConfigurationLogV2 table contains the configuration information for all persistent queries. This includes the owner, what server it will run on, when it was last modified and by whom, etc.
  2. The PersistentQueryStateLog shows the status (Running, Initializing, Failed, etc.) of all persistent queries.
  3. The ProcessMetrics table contains (among many other metrics) the heap usage data used in this dashboard.

More information can be found in the troubleshooting documentation.

Dashboard for monitoring server usage

This dashboard calculates the heap usage of all persistent queries and breaks it down by server. This can help in load balancing your Deephaven cluster. It also shows all current workers and their server hosts, heap allocation, and PQ name (if applicable).

img

Expand for full query:
// Cutoffs for the server-usage dashboard: today's date, a 7-day lookback
// date string, and a 5-minute liveness window for "still running" workers.
date = currentDateNy()
cutoffDate = minus(DBDateTime.now(),7*DAY).toDateString()
cutoffTime = minus(DBDateTime.now(), 5*MINUTE)

// get details about workers that are alive
Table activeWorkers = db.i("DbInternal","ProcessEventLog")
	.where("Date>=currentDateNy()")
	.lastBy("ProcessInfoId")
	.where("Timestamp >= cutoffTime")
	.view("ProcessInfoId", "WorkerName=Process", "Host")

// get dispatcher details and remove any recently terminated workers
// Attach dispatcher details from the "Starting worker" audit events and
// drop workers that have since logged a SHUTTING_DOWN event.
Table currentWorkers = activeWorkers
	.join(db.i("DbInternal","AuditEventLog")
		.where("Date>=cutoffDate","Event=`Starting worker`")
		// third `:`-delimited field of Details carries the worker's ProcessInfoId
		.update("PPID=Details.split(`:`)[2].trim()")
		,"ProcessInfoId=PPID"
		,"Process,ServerPort")
    .updateView("DispatcherName=Process == `db_query_server` ? `default` : `mergedispatcher`")
	.dropColumns("Process")
	.whereNotIn(db.i("DbInternal","AuditEventLog")
		.where("Date>=cutoffDate","Event=`SHUTTING_DOWN`"),
		"ProcessInfoId")

// Enrich each live worker with its requested heap size (from the query
// performance log) and, where applicable, the name of the PQ it serves.
currentWorkerInfo = currentWorkers.naturalJoin(
    db.i("DbInternal", "QueryPerformanceLog").where("Date>=cutoffDate").lastBy("ProcessInfoId"),
    "ProcessInfoId","RequestedHeapSize")
	.where("!(isNull(RequestedHeapSize))")  //filter out workers which probably stopped ungracefully
	.update("RequestedHeapSizeGB=RequestedHeapSize/(1024*1024*1024)")  // bytes -> GB
	.dropColumns("RequestedHeapSize")
	.naturalJoin(db.i("DbInternal", "PersistentQueryStateLog").where("Date>=date").lastBy("ProcessInfoId"),
		"ProcessInfoId",
		"PQName=Name")  // null PQName means a console (non-PQ) worker

// Rollup of requested heap by Host -> Dispatcher -> worker, so per-server
// totals can be inspected for load balancing. Console workers (no PQ name)
// are labeled "Console : <worker>".
workerHeapUsage = currentWorkerInfo
	.update("PurposeAndWorker = (PQName==null ? `Console` : PQName) + ` : ` + WorkerName")
	.sortDescending("RequestedHeapSizeGB")
    .rollup(
        AggCombo(AggSum("RequestedHeapSizeGB"))
    ,"Host"
    ,"DispatcherName"
    ,"PurposeAndWorker")

Underlying data

This dashboard uses several tables within the DbInternal namespace. It can be helpful to know what data is available in case you want to create your own views.

  1. The ProcessEventLog table contains the logs (stdout, stderr, etc.) of all query workers. We use it here to get a table of the ProcessInfoId, Host, and Process (the name, e.g. worker_1523) of all query workers.

  2. The AuditEventLog: this dashboard uses the Starting worker event to get the server port and process name of the worker.

  3. The QueryPerformanceLog table contains query-level performance metrics. It's used here to get the RequestedHeapSize of each worker.

  4. The PersistentQueryStateLog (see above) is used to get the name of each PQ.