Skip to content

Commit a049391

Browse files
[postgres] Prevent incidents by detecting idle sessions holding locks (DataDog#21182)
* Add idle in transaction with locks * Fix metrix definition * Add config * Rename metric * Rename changelog to match PR# * Validate config * Validate config * Fix config with new ddev * Restore test_deadlock.py * Validate metadata * Validate models * Lint * Include shared locks * max_rows as parameter * Fix max_rows * Validate model * Hide config * Restore test_deadlock.py * Select only exclusive locks * Filter on state_change age instead of tx change to reduce noise * State change not null * Update postgres/changelog.d/21182.added Co-authored-by: Eric Weaver <eweaver755@gmail.com> --------- Co-authored-by: Eric Weaver <eweaver755@gmail.com>
1 parent d5b822a commit a049391

File tree

8 files changed

+99
-1
lines changed

8 files changed

+99
-1
lines changed

postgres/assets/configuration/spec.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,30 @@ files:
282282
value:
283283
type: boolean
284284
example: true
285+
- name: locks_idle_in_transaction
286+
description: Configure collection of idle in transaction lock age metrics
287+
options:
288+
- name: enabled
289+
description: Enable collection of idle in transaction lock age metrics.
290+
value:
291+
type: boolean
292+
example: true
293+
display_default: true
294+
hidden: true
295+
- name: collection_interval
296+
description: Set the collection interval (in seconds) for idle in transaction lock age metrics.
297+
value:
298+
type: number
299+
example: 300
300+
display_default: 300
301+
hidden: true
302+
- name: max_rows
303+
description: Set the maximum number of rows to collect per check run.
304+
value:
305+
type: integer
306+
example: 100
307+
display_default: 100
308+
hidden: true
285309
- name: collect_checksum_metrics
286310
description: Collect counts of database failed checksums. Only supported on versions >= 12.
287311
value:

postgres/changelog.d/21182.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add gauge postgresql.locks.idle_in_transaction_age to measure age (s) of idle-in-transaction sessions holding exclusive relation locks; tags: pid, db, session_user, app, client_hostname, lock_mode, relation, relation_owner; limit 100 rows.

postgres/datadog_checks/postgres/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,13 @@ def __init__(self, instance, init_config, check):
9191
self.collect_database_size_metrics = is_affirmative(instance.get('collect_database_size_metrics', True))
9292
self.collect_wal_metrics = self._should_collect_wal_metrics(instance.get('collect_wal_metrics'))
9393
self.collect_bloat_metrics = is_affirmative(instance.get('collect_bloat_metrics', False))
94+
# Locks idle in transaction metrics config
95+
locks_idle_cfg = instance.get('locks_idle_in_transaction', {}) or {}
96+
self.locks_idle_in_transaction = {
97+
'enabled': is_affirmative(locks_idle_cfg.get('enabled', True)),
98+
'collection_interval': int(locks_idle_cfg.get('collection_interval', 300)),
99+
'max_rows': int(locks_idle_cfg.get('max_rows', 100)),
100+
}
94101
self.data_directory = instance.get('data_directory', None)
95102
self.ignore_databases = instance.get('ignore_databases', DEFAULT_IGNORE_DATABASES)
96103
if is_affirmative(instance.get('collect_default_database', True)):

postgres/datadog_checks/postgres/config_models/instance.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,16 @@ class Gcp(BaseModel):
136136
project_id: Optional[str] = None
137137

138138

139+
class LocksIdleInTransaction(BaseModel):
    # Validated shape of the `locks_idle_in_transaction` instance option.
    # Instances are immutable (frozen=True).
    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)

    # All fields are optional and default to None when omitted.
    collection_interval: Optional[float] = None
    enabled: Optional[bool] = None
    max_rows: Optional[int] = None
147+
148+
139149
class ManagedIdentity(BaseModel):
140150
model_config = ConfigDict(
141151
arbitrary_types_allowed=True,
@@ -263,6 +273,7 @@ class InstanceConfig(BaseModel):
263273
idle_connection_timeout: Optional[int] = None
264274
ignore_databases: Optional[tuple[str, ...]] = None
265275
ignore_schemas_owned_by: Optional[tuple[str, ...]] = None
276+
locks_idle_in_transaction: Optional[LocksIdleInTransaction] = None
266277
log_unobfuscated_plans: Optional[bool] = None
267278
log_unobfuscated_queries: Optional[bool] = None
268279
managed_identity: Optional[ManagedIdentity] = None

postgres/datadog_checks/postgres/data/conf.yaml.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ instances:
226226
#
227227
# collect_count_metrics: true
228228

229+
## Configure collection of idle in transaction lock age metrics
230+
#
231+
# locks_idle_in_transaction:
232+
229233
## @param collect_checksum_metrics - boolean - optional - default: false
230234
## Collect counts of database failed checksums. Only supported on versions >= 12.
231235
#

postgres/datadog_checks/postgres/postgres.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
CONNECTION_METRICS,
4848
COUNT_METRICS,
4949
FUNCTION_METRICS,
50+
IDLE_TX_LOCK_AGE_METRICS,
5051
INDEX_PROGRESS_METRICS,
5152
QUERY_PG_CONTROL_CHECKPOINT,
5253
QUERY_PG_CONTROL_CHECKPOINT_LT_10,
@@ -381,6 +382,13 @@ def dynamic_queries(self):
381382
if self._config.dbm_enabled:
382383
queries.append(STAT_IO_METRICS)
383384

385+
if self._config.dbm_enabled and self._config.locks_idle_in_transaction['enabled']:
386+
query_def = copy.deepcopy(IDLE_TX_LOCK_AGE_METRICS)
387+
query_def['collection_interval'] = self._config.locks_idle_in_transaction['collection_interval']
388+
max_rows = self._config.locks_idle_in_transaction.get('max_rows', 100)
389+
query_def['query'] = query_def['query'].format(max_rows=max_rows)
390+
per_database_queries.append(query_def)
391+
384392
if not queries:
385393
self.log.debug("no dynamic queries defined")
386394
return None

postgres/datadog_checks/postgres/util.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,3 +1085,45 @@ def trim_leading_set_stmts(sql):
10851085
{'name': 'io.writes', 'type': 'monotonic_count'},
10861086
],
10871087
}
1088+
1089+
# Measures the age (in seconds) of idle-in-transaction sessions holding exclusive relation locks.
1090+
# Limits the result set to {max_rows} rows (substituted by the caller, 100 by default) to avoid tag explosion.
1091+
IDLE_TX_LOCK_AGE_METRICS = {
    'name': 'idle_tx_lock_age_metrics',
    # The query text is a str.format() template: the caller must substitute
    # {max_rows} before executing it, so the SQL must not contain any other
    # curly braces.
    #
    # pg_locks is cluster-wide, but pg_class only describes relations of the
    # database the query runs in. Joining l.relation to pg_class is therefore
    # only meaningful for locks whose target lives in the current database (or
    # is a shared catalog). Restricting to sessions connected to the current
    # database guarantees that, and prevents the same session from being
    # reported (possibly under a wrong relation name) by every per-database run.
    'query': (
        """
        SELECT
            l.pid,
            a.datname,
            a.usename AS session_user,
            a.application_name,
            a.client_hostname,
            l.mode,
            c.oid::regclass AS relation,
            r.rolname AS relation_owner,
            EXTRACT(EPOCH FROM (now() - a.xact_start)) AS xact_age
        FROM pg_locks l
        JOIN pg_stat_activity a ON a.pid = l.pid
        JOIN pg_class c ON c.oid = l.relation
        JOIN pg_roles r ON r.oid = c.relowner
        WHERE l.locktype = 'relation'
          AND l.granted = true
          AND l.mode like '%Exclusive%'
          AND a.state = 'idle in transaction'
          AND a.datname = current_database()
          AND a.state_change IS NOT NULL
          AND now() - a.state_change > interval '60 seconds'
        ORDER BY xact_age DESC
        LIMIT {max_rows} ; """
    ).strip(),
    # One entry per selected column, in SELECT order; the last column carries
    # the gauge value (transaction age in seconds), the others become tags.
    'columns': [
        {'name': 'pid', 'type': 'tag'},
        {'name': 'db', 'type': 'tag'},
        {'name': 'session_user', 'type': 'tag'},
        {'name': 'app', 'type': 'tag'},
        {'name': 'client_hostname', 'type': 'tag_not_null'},
        {'name': 'lock_mode', 'type': 'tag'},
        {'name': 'relation', 'type': 'tag'},
        {'name': 'relation_owner', 'type': 'tag'},
        {'name': 'locks.idle_in_transaction_age', 'type': 'gauge'},
    ],
}

postgres/metadata.csv

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ postgresql.last_autovacuum_age,gauge,,second,,"Last time at which this table was
101101
postgresql.last_vacuum_age,gauge,,second,,"Last time at which this table was manually vacuumed (not counting VACUUM FULL). This metric is tagged with db, schema, table.",0,postgres,age vacuum,,
102102
postgresql.live_rows,gauge,,row,,"Enabled with `relations`. The estimated number of live rows. This metric is tagged with db, schema, table.",0,postgres,live rows,,
103103
postgresql.locks,gauge,,lock,,"Enabled with `relations`. The number of locks active for this database. This metric is tagged with db, lock_mode, lock_type, schema, table, granted.",0,postgres,locks,,
104+
postgresql.locks.idle_in_transaction_age,gauge,,second,,Transaction age of idle in transaction sessions holding exclusive relation locks.,0,postgres,idle tx age with locks,,"pid,db,session_user,app,client_hostname,lock_mode,relation,relation_owner"
104105
postgresql.max_connections,gauge,,connection,, The maximum number of client connections allowed to this database.,0,postgres,max conns,,
105106
postgresql.percent_usage_connections,gauge,,fraction,,The number of connections to this database as a fraction of the maximum number of allowed connections.,0,postgres,pct usg conns,,
106107
postgresql.pg_stat_statements.dealloc,count,,,,The number of times pg_stat_statements had to evict least executed queries because pg_stat_statements.max was reached.,-1,postgres,pgss dealloc,,
@@ -235,4 +236,4 @@ postgresql.wal_receiver.last_msg_receipt_age,gauge,,second,,Time since the recep
235236
postgresql.wal_receiver.last_msg_send_age,gauge,,second,,The age of the latest message's send time received from the WAL sender. This metric is tagged with status.,0,postgres,wal receiver send age,,
236237
postgresql.wal_receiver.latest_end_age,gauge,,second,,Time since the reception of the last message from the WAL sender with an WAL location update. This metric is tagged with status.,0,postgres,wal receiver latest end,,
237238
postgresql.wal_receiver.received_timeline,gauge,,,,"Timeline number of last write-ahead log location received and flushed to disk, the initial value of this field being the timeline number of the first log location used when WAL receiver is started. This metric is tagged with status.",0,postgres,wal receiver tli,,
238-
postgresql.wal_size,gauge,,byte,,The sum of all WAL files on disk.,-1,postgres,wal size,,
239+
postgresql.wal_size,gauge,,byte,,The sum of all WAL files on disk.,-1,postgres,wal size,,

0 commit comments

Comments
 (0)