diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go
index 5227de2..be928f3 100644
--- a/internal/controller/metrics.go
+++ b/internal/controller/metrics.go
@@ -8,10 +8,40 @@ import (
 
 const (
 	METIC_PREFIX = "node_disruption_controller_"
+
+	// NodeDisruptionReconcileController is the "controller" label value used by
+	// the reconcile metrics below.
+	NodeDisruptionReconcileController = "NodeDisruption"
+
+	// ReconcileResultSuccess and ReconcileResultError are the "result" label
+	// values recognised by ObserveNodeDisruptionReconcile.
+	ReconcileResultSuccess = "success"
+	ReconcileResultError   = "error"
 )
 
 var (
 	// NODE DISRUPTION METRICS
+	NodeDisruptionReconcileTotal = promauto.With(metrics.Registry).NewCounterVec(
+		prometheus.CounterOpts{
+			Name: METIC_PREFIX + "reconcile_total",
+			Help: "Total number of node disruption controller reconciliations by result",
+		},
+		[]string{"controller", "result"},
+	)
+	NodeDisruptionLastSuccessfulReconcileTimestamp = promauto.With(metrics.Registry).NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "last_successful_reconcile_timestamp_seconds",
+			Help: "Unix timestamp of the last successful node disruption controller reconciliation",
+		},
+		[]string{"controller"},
+	)
+	NodeDisruptionLastFailedReconcileTimestamp = promauto.With(metrics.Registry).NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "last_failed_reconcile_timestamp_seconds",
+			Help: "Unix timestamp of the last failed node disruption controller reconciliation",
+		},
+		[]string{"controller"},
+	)
 	NodeDisruptionGrantedTotal = promauto.With(metrics.Registry).NewCounterVec(
 		prometheus.CounterOpts{
 			Name: METIC_PREFIX + "node_disruption_granted_total",
@@ -147,3 +177,17 @@ var (
 		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_disruption_name"},
 	)
 )
+
+// ObserveNodeDisruptionReconcile records the outcome of one NodeDisruption
+// reconciliation. It increments the reconcile counter for result and, when
+// result is ReconcileResultSuccess or ReconcileResultError, sets the matching
+// last-reconcile timestamp gauge to the current time. Any other result value
+// only increments the counter.
+func ObserveNodeDisruptionReconcile(result string) {
+	NodeDisruptionReconcileTotal.WithLabelValues(NodeDisruptionReconcileController, result).Inc()
+	switch result {
+	case ReconcileResultSuccess:
+		NodeDisruptionLastSuccessfulReconcileTimestamp.WithLabelValues(NodeDisruptionReconcileController).SetToCurrentTime()
+	case ReconcileResultError:
+		NodeDisruptionLastFailedReconcileTimestamp.WithLabelValues(NodeDisruptionReconcileController).SetToCurrentTime()
+	}
+}
diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go
index d382e5a..50263d3 100644
--- a/internal/controller/nodedisruption_controller.go
+++ b/internal/controller/nodedisruption_controller.go
@@ -76,13 +76,21 @@ type NodeDisruptionReconciler struct {
 // move the current state of the cluster closer to the desired state.
 // For more details, check Reconcile and its Result here:
 // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.15.0/pkg/reconcile
-func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (clusterResult ctrl.Result, err error) {
 	logger := log.FromContext(ctx)
 
-	clusterResult := ctrl.Result{}
+	// err is a named result so the deferred func observes the error that is
+	// actually returned, whichever return statement is taken.
+	defer func() {
+		result := ReconcileResultSuccess
+		if err != nil {
+			result = ReconcileResultError
+		}
+		ObserveNodeDisruptionReconcile(result)
+	}()
 
 	nd := &nodedisruptionv1alpha1.NodeDisruption{}
-	err := r.Get(ctx, req.NamespacedName, nd)
+	err = r.Get(ctx, req.NamespacedName, nd)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			PruneNodeDisruptionMetrics(req.Name)
@@ -119,7 +127,7 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
 
 	err = reconciler.Reconcile(ctx)
 	if err != nil {
-		return clusterResult, nil
+		return clusterResult, err
 	}
 
 	if !reflect.DeepEqual(nd.Status, reconciler.NodeDisruption.Status) {