Skip to content

Commit 576da9d

Browse files
committed
Periodically Log JVM Memory and GC Usage
Closes: #3522
1 parent 7dc17aa commit 576da9d

File tree

11 files changed

+320
-1
lines changed

11 files changed

+320
-1
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
script_dir="$(dirname "$(readlink -f "$0")")"
5+
. "$script_dir/util.sh"
6+
7+
compose_file="$1"
8+
9+
log_line="$(docker compose -f "$compose_file" logs | grep "Heap:" | head -1)"
10+
11+
test_non_empty "JVM metrics log line" "$log_line"

.github/workflows/build.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1577,6 +1577,9 @@ jobs:
15771577
run: curl -sf http://localhost:8082/metrics
15781578
if: ${{ matrix.variant == 'distributed' }}
15791579

1580+
- name: Check JVM Metrics Log
1581+
run: .github/scripts/check-jvm-metrics-log.sh .github/${{ matrix.variant }}-test/docker-compose.yml
1582+
15801583
- name: Docker Stats
15811584
run: docker stats --no-stream
15821585
if: always()

AGENTS.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ Rigorous adherence to these patterns is required:
4242
* Require `[blaze.test-util :as tu]`.
4343
* Call `(test/use-fixtures :each tu/fixture)`.
4444

45+
## Documentation
46+
47+
* **Environment Variables:** Every new environment variable introduced via `#blaze/cfg` in `resources/blaze.edn` must be documented in `docs/deployment/environment-variables.md`, following the existing format (heading, description, default value, since badge).
48+
4549
## Verification & Workflow
4650

4751
When starting to work on an issue, you can use the GitHub CLI to fetch the issue details: `gh issue view <issue-number>`
@@ -51,7 +55,7 @@ Before finishing a task, ensure the following commands pass:
5155
1. **Format:** `make fmt`
5256
2. **Lint:** `make lint` (Uses `clj-kondo`)
5357
3. **Test:** `make test` (Runs module and root tests)
54-
4. **Coverage:** `make test-coverage` (Checks for adequate test coverage)
58+
4. **Coverage:** `make test-coverage` (Checks for adequate test coverage — must be **≥ 95% forms**)
5559

5660
After verification, when working on an issue:
5761

deps.edn

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
blaze/interaction
1717
{:local/root "modules/interaction"}
1818

19+
blaze/jvm-metrics-logger
20+
{:local/root "modules/jvm-metrics-logger"}
21+
1922
blaze/openid-auth
2023
{:local/root "modules/openid-auth"}
2124

docs/deployment/environment-variables.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,24 @@ one of trace, debug, info, warn or error
358358

359359
**Default:** info
360360

361+
#### `JVM_METRICS_LOGGER_NORMAL_INTERVAL` <Badge type="warning" text="Since 1.7"/>
362+
363+
The interval at which JVM heap, non-heap, and GC metrics are logged at INFO level during normal operation (heap below the warn threshold). Must be an ISO-8601 duration string.
364+
365+
**Default:** PT5M
366+
367+
#### `JVM_METRICS_LOGGER_WARN_INTERVAL` <Badge type="warning" text="Since 1.7"/>
368+
369+
The interval at which JVM heap, non-heap, and GC metrics are logged at WARN level when heap usage is at or above the warn threshold. Must be an ISO-8601 duration string.
370+
371+
**Default:** PT1M
372+
373+
#### `JVM_METRICS_LOGGER_WARN_THRESHOLD` <Badge type="warning" text="Since 1.7"/>
374+
375+
The heap usage percentage (1–99) at or above which metrics are logged at WARN level and at the shorter warn interval.
376+
377+
**Default:** 80
378+
361379
#### `JAVA_TOOL_OPTIONS`
362380

363381
| Name | Default | Since | Description |
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
fmt:
2+
cljfmt check src test deps.edn tests.edn
3+
4+
lint:
5+
clj-kondo --lint src test deps.edn
6+
7+
prep:
8+
$(MAKE) -C ../module-base prep
9+
clojure -X:deps prep
10+
11+
test: prep
12+
clojure -M:test:kaocha --profile :ci
13+
14+
test-coverage: prep
15+
clojure -M:test:coverage
16+
17+
deps-tree:
18+
clojure -X:deps tree
19+
20+
deps-list:
21+
clojure -X:deps list
22+
23+
cloc-prod:
24+
cloc src
25+
26+
cloc-test:
27+
cloc test
28+
29+
clean:
30+
rm -rf .clj-kondo/.cache .cpcache target
31+
32+
.PHONY: fmt lint prep test test-coverage deps-tree deps-list cloc-prod cloc-test clean
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{:deps
2+
{blaze/module-base
3+
{:local/root "../module-base"}
4+
5+
blaze/scheduler
6+
{:local/root "../scheduler"}}
7+
8+
:aliases
9+
{:test
10+
{:extra-paths ["test"]
11+
12+
:extra-deps
13+
{blaze/module-test-util
14+
{:local/root "../module-test-util"}}}
15+
16+
:kaocha
17+
{:extra-deps
18+
{lambdaisland/kaocha
19+
{:mvn/version "1.91.1392"}}
20+
21+
:main-opts ["-m" "kaocha.runner"]}
22+
23+
:coverage
24+
{:extra-deps
25+
{lambdaisland/kaocha-cloverage
26+
{:mvn/version "1.1.89"}}
27+
28+
:main-opts ["-m" "kaocha.runner" "--profile" "coverage"]}}}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
(ns blaze.jvm-metrics-logger
2+
(:require
3+
[blaze.module :as m]
4+
[blaze.scheduler :as sched]
5+
[blaze.scheduler.spec]
6+
[clojure.spec.alpha :as s]
7+
[clojure.string :as str]
8+
[integrant.core :as ig]
9+
[java-time.api :as time]
10+
[taoensso.timbre :as log])
11+
(:import
12+
[java.lang.management GarbageCollectorMXBean ManagementFactory MemoryUsage]))
13+
14+
(set! *warn-on-reflection* true)
15+
16+
(s/def ::normal-interval time/duration?)
17+
18+
(s/def ::warn-interval time/duration?)
19+
20+
(s/def ::warn-threshold (s/and int? #(<= 1 % 99)))
21+
22+
(defn- format-bytes [bytes]
23+
(if (>= bytes 1073741824)
24+
(format "%.1f GB" (/ bytes 1073741824.0))
25+
(format "%.0f MB" (/ bytes 1048576.0))))
26+
27+
(defn- heap-pct ^long [^MemoryUsage usage]
28+
(let [max (.getMax usage)]
29+
(if (pos? max)
30+
(long (* 100 (/ (.getUsed usage) max)))
31+
0)))
32+
33+
(defn- format-gc [^GarbageCollectorMXBean gc]
34+
(let [cnt (.getCollectionCount gc)
35+
time-s (/ (.getCollectionTime gc) 1000.0)]
36+
(format "%s %d %s %.1fs" (.getName gc) cnt
37+
(if (= 1 cnt) "collection" "collections") time-s)))
38+
39+
(defn heap-usage-string
40+
"Returns a human-readable string summarising the current JVM heap, non-heap
41+
and GC metrics. Exported as a public function to facilitate testing."
42+
[]
43+
(let [heap ^MemoryUsage (.getHeapMemoryUsage (ManagementFactory/getMemoryMXBean))
44+
non-heap ^MemoryUsage (.getNonHeapMemoryUsage (ManagementFactory/getMemoryMXBean))
45+
pct (heap-pct heap)
46+
gcs (seq (ManagementFactory/getGarbageCollectorMXBeans))
47+
gc-part (when gcs (str ", GC: " (str/join ", " (map format-gc gcs))))]
48+
(format "Heap: %s used / %s committed / %s max (%d%%), Non-Heap: %s used / %s committed%s"
49+
(format-bytes (.getUsed heap))
50+
(format-bytes (.getCommitted heap))
51+
(format-bytes (.getMax heap))
52+
pct
53+
(format-bytes (.getUsed non-heap))
54+
(format-bytes (.getCommitted non-heap))
55+
(or gc-part ""))))
56+
57+
(defn log-metrics!
58+
"Reads current JVM metrics and logs them at the appropriate level based on
59+
`warn-threshold` (percentage). Returns nil."
60+
[warn-threshold]
61+
(let [heap ^MemoryUsage (.getHeapMemoryUsage (ManagementFactory/getMemoryMXBean))
62+
pct (heap-pct heap)]
63+
(if (>= pct warn-threshold)
64+
(log/warn "High heap usage -" (heap-usage-string))
65+
(log/info (heap-usage-string))))
66+
nil)
67+
68+
(defmethod m/pre-init-spec :blaze/jvm-metrics-logger [_]
69+
(s/keys :req-un [:blaze/scheduler]
70+
:opt-un [::normal-interval ::warn-interval ::warn-threshold]))
71+
72+
(defmethod ig/init-key :blaze/jvm-metrics-logger
73+
[_ {:keys [scheduler normal-interval warn-interval warn-threshold]
74+
:or {normal-interval (time/minutes 5)
75+
warn-interval (time/minutes 1)
76+
warn-threshold 80}}]
77+
(log/info "Start JVM metrics logger")
78+
(let [normal-nanos (time/as normal-interval :nanos)
79+
last-info-log (atom 0)]
80+
{:future
81+
(sched/schedule-at-fixed-rate
82+
scheduler
83+
(fn []
84+
(let [heap ^MemoryUsage (.getHeapMemoryUsage (ManagementFactory/getMemoryMXBean))
85+
pct (heap-pct heap)
86+
now (System/nanoTime)]
87+
(cond
88+
(>= pct warn-threshold)
89+
(log-metrics! warn-threshold)
90+
91+
(>= (- now @last-info-log) normal-nanos)
92+
(do (reset! last-info-log now)
93+
(log-metrics! warn-threshold)))))
94+
warn-interval
95+
warn-interval)}))
96+
97+
(defmethod ig/halt-key! :blaze/jvm-metrics-logger
98+
[_ {:keys [future]}]
99+
(log/info "Stop JVM metrics logger")
100+
(sched/cancel future false))
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
(ns blaze.jvm-metrics-logger-test
2+
(:require
3+
[blaze.jvm-metrics-logger :as jml]
4+
[blaze.module-spec]
5+
[blaze.module.test-util :refer [given-failed-system with-system]]
6+
[blaze.scheduler.spec]
7+
[blaze.test-util :as tu]
8+
[clojure.spec.alpha :as s]
9+
[clojure.spec.test.alpha :as st]
10+
[clojure.test :as test :refer [deftest is testing]]
11+
[integrant.core :as ig]
12+
[java-time.api :as time]
13+
[taoensso.timbre :as log]))
14+
15+
(set! *warn-on-reflection* true)
16+
(st/instrument)
17+
(log/set-min-level! :trace)
18+
19+
(test/use-fixtures :each tu/fixture)
20+
21+
(def config
22+
{:blaze/jvm-metrics-logger
23+
{:scheduler (ig/ref :blaze/scheduler)
24+
:normal-interval (time/seconds 5)
25+
:warn-interval (time/seconds 1)
26+
:warn-threshold 80}
27+
28+
:blaze/scheduler {}})
29+
30+
(deftest init-test
31+
(testing "nil config"
32+
(given-failed-system {:blaze/jvm-metrics-logger nil}
33+
:key := :blaze/jvm-metrics-logger
34+
:reason := ::ig/build-failed-spec
35+
[:cause-data ::s/problems 0 :pred] := `map?))
36+
37+
(testing "missing scheduler"
38+
(given-failed-system {:blaze/jvm-metrics-logger {}}
39+
:key := :blaze/jvm-metrics-logger
40+
:reason := ::ig/build-failed-spec
41+
[:cause-data ::s/problems 0 :pred] := `(fn ~'[%] (contains? ~'% :scheduler))))
42+
43+
(testing "invalid scheduler"
44+
(given-failed-system (assoc-in config [:blaze/jvm-metrics-logger :scheduler] ::invalid)
45+
:key := :blaze/jvm-metrics-logger
46+
:reason := ::ig/build-failed-spec
47+
[:cause-data ::s/problems 0 :via] := [:blaze/scheduler]
48+
[:cause-data ::s/problems 0 :val] := ::invalid))
49+
50+
(testing "invalid warn-threshold"
51+
(given-failed-system (assoc-in config [:blaze/jvm-metrics-logger :warn-threshold] 0)
52+
:key := :blaze/jvm-metrics-logger
53+
:reason := ::ig/build-failed-spec
54+
[:cause-data ::s/problems 0 :via] := [::jml/warn-threshold]
55+
[:cause-data ::s/problems 0 :val] := 0))
56+
57+
(testing "success"
58+
(with-system [system config]
59+
(is (some? (get system :blaze/jvm-metrics-logger))))))
60+
61+
(deftest heap-usage-string-test
62+
(testing "returns a string containing heap info"
63+
(let [result (jml/heap-usage-string)]
64+
(is (string? result))
65+
(is (re-find #"Heap:" result))
66+
(is (re-find #"Non-Heap:" result))
67+
(is (re-find #"\d+%" result))))
68+
69+
(testing "includes GC info when collectors are present"
70+
(let [result (jml/heap-usage-string)]
71+
(is (re-find #"GC:" result)))))
72+
73+
(deftest log-metrics-test
74+
(testing "logs at info level when below threshold"
75+
(is (nil? (jml/log-metrics! 100))))
76+
77+
(testing "logs at warn level when at or above threshold"
78+
(is (nil? (jml/log-metrics! 0)))))
79+
80+
(deftest scheduled-info-task-test
81+
(testing "info task fires and logs without error when heap is below threshold"
82+
(with-system [_ {:blaze/jvm-metrics-logger
83+
{:scheduler (ig/ref :blaze/scheduler)
84+
:normal-interval (time/millis 50)
85+
:warn-interval (time/millis 50)
86+
:warn-threshold 99}
87+
:blaze/scheduler {}}]
88+
(Thread/sleep 200)
89+
(is true))))
90+
91+
(deftest scheduled-warn-task-test
92+
(testing "warn task fires and logs without error when heap is above threshold"
93+
(with-system [_ {:blaze/jvm-metrics-logger
94+
{:scheduler (ig/ref :blaze/scheduler)
95+
:normal-interval (time/seconds 60)
96+
:warn-interval (time/millis 50)
97+
:warn-threshold 1}
98+
:blaze/scheduler {}}]
99+
(Thread/sleep 200)
100+
(is true))))
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#kaocha/v1
2+
#merge
3+
[{}
4+
#profile {:ci {:reporter kaocha.report/documentation
5+
:color? false}
6+
:coverage {:plugins [:kaocha.plugin/cloverage]
7+
:cloverage/opts
8+
{:codecov? true}
9+
:reporter kaocha.report/documentation
10+
:color? false}}]

0 commit comments

Comments
 (0)