Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 13 additions & 45 deletions pkg/azurefile/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ import (
"k8s.io/utils/ptr"
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
"sigs.k8s.io/cloud-provider-azure/pkg/provider/storage"
)

Expand Down Expand Up @@ -517,14 +516,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}
}

csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.ObserveWithLabels(isOperationSucceeded,
"protocol", string(shareProtocol),
"storage_account_type", sku)
}()

if sourceID != "" {
_, srcAccountName, _, _, _, _, err = GetFileShareInfo(sourceID) //nolint:dogsled
if err != nil {
Expand Down Expand Up @@ -567,9 +558,10 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
SourceAccountName: srcAccountName,
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, subsID, d.Name)
mc := csiMetrics.NewCSIMetricContext(requestName).WithBasicVolumeInfo(d.cloud.ResourceGroup, subsID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
mc.WithAdditionalVolumeInfo(VolumeID, volumeID).ObserveWithLabels(isOperationSucceeded, csiMetrics.Protocol, string(shareProtocol), csiMetrics.StorageAccountType, sku)
}()
Comment on lines +561 to 565
Copy link

Copilot AI Feb 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CreateVolume no longer creates/defer-observes the CSI metric context near the start of the method. As a result, any early returns during validation or setup (before this point) won’t be counted/logged, which is a metrics behavior regression. If the goal is consistent latency metrics for all CreateVolume calls, create the metric context at function entry and defer ObserveWithLabels there (adding volume info/labels as values become available).

Copilot uses AI. Check for mistakes.

var accountKey, lockKey string
Expand Down Expand Up @@ -791,12 +783,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)

// DeleteVolume delete an azure file
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) {
requestName := "controller_delete_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand Down Expand Up @@ -841,9 +827,9 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
secret = createStorageAccountSecret(accountName, accountKey)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
mc := csiMetrics.NewCSIMetricContext("controller_delete_volume").WithBasicVolumeInfo(resourceGroupName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
mc.WithAdditionalVolumeInfo(VolumeID, volumeID).Observe(returnedErr == nil)
}()

if err := d.DeleteFileShare(ctx, subsID, resourceGroupName, accountName, fileShareName, secret, useDataPlaneAPI); err != nil {
Expand Down Expand Up @@ -950,12 +936,6 @@ func (d *Driver) ControllerUnpublishVolume(_ context.Context, _ *csi.ControllerU

// CreateSnapshot create a snapshot
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (resp *csi.CreateSnapshotResponse, returnedErr error) {
requestName := "controller_create_snapshot"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

sourceVolumeID := req.GetSourceVolumeId()
snapshotName := req.Name
if len(snapshotName) == 0 {
Expand Down Expand Up @@ -993,9 +973,9 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
useDataPlaneAPI = d.useDataPlaneAPI(ctx, sourceVolumeID, accountName)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
mc := csiMetrics.NewCSIMetricContext("controller_create_snapshot").WithBasicVolumeInfo(rgName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(returnedErr == nil, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
mc.WithAdditionalVolumeInfo(SourceResourceID, sourceVolumeID, SnapshotName, snapshotName).Observe(returnedErr == nil)
}()

exists, itemSnapshot, itemSnapshotTime, itemSnapshotQuota, err := d.snapshotExists(ctx, sourceVolumeID, snapshotName, req.GetSecrets(), useDataPlaneAPI)
Expand Down Expand Up @@ -1105,13 +1085,6 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ

// DeleteSnapshot delete a snapshot (todo)
func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) {
requestName := "controller_delete_snapshot"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

if len(req.SnapshotId) == 0 {
return nil, status.Error(codes.InvalidArgument, "Snapshot ID must be provided")
}
Expand All @@ -1133,9 +1106,10 @@ func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequ
subsID = d.cloud.SubscriptionID
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
mc := csiMetrics.NewCSIMetricContext("controller_delete_snapshot").WithBasicVolumeInfo(rgName, subsID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, SnapshotID, req.SnapshotId)
mc.WithAdditionalVolumeInfo(SnapshotID, req.SnapshotId).Observe(isOperationSucceeded)
}()
Comment on lines +1109 to 1113
Copy link

Copilot AI Feb 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DeleteSnapshot metrics are started only after some validation/parsing, so early failures won’t record latency/outcome anymore (compared to when the deferred observe was at the top). Consider starting the metric context at method entry to keep metrics consistent across all code paths.

Copilot uses AI. Check for mistakes.

var deleteErr error
Expand Down Expand Up @@ -1303,13 +1277,6 @@ func (d *Driver) execAzcopyCopy(srcPath, dstPath string, azcopyCopyOptions, auth

// ControllerExpandVolume controller expand volume
func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) {
requestName := "controller_expand_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand Down Expand Up @@ -1348,9 +1315,10 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
}
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
mc := csiMetrics.NewCSIMetricContext("controller_expand_volume").WithBasicVolumeInfo(resourceGroupName, subsID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
mc.WithAdditionalVolumeInfo(VolumeID, volumeID).Observe(isOperationSucceeded)
}()

secrets := req.GetSecrets()
Expand Down
36 changes: 12 additions & 24 deletions pkg/azurefile/nodeserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ func NewMountClient(cc *grpc.ClientConn) *MountClient {

// NodePublishVolume mount the volume from staging to target path
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (resp *csi.NodePublishVolumeResponse, returnedErr error) {
csiMC := csiMetrics.NewCSIMetricContext("node_publish_volume")
mc := csiMetrics.NewCSIMetricContext("node_publish_volume")
defer func() {
csiMC.Observe(returnedErr == nil)
mc.Observe(returnedErr == nil)
}()

volCap := req.GetVolumeCapability()
Expand Down Expand Up @@ -204,9 +204,9 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu

// NodeUnpublishVolume unmount the volume from the target path
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (resp *csi.NodeUnpublishVolumeResponse, returnedErr error) {
csiMC := csiMetrics.NewCSIMetricContext("node_unpublish_volume")
mc := csiMetrics.NewCSIMetricContext("node_unpublish_volume")
defer func() {
csiMC.Observe(returnedErr == nil)
mc.Observe(returnedErr == nil)
}()

if len(req.GetVolumeId()) == 0 {
Expand Down Expand Up @@ -242,12 +242,6 @@ func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVo

// NodeStageVolume mount the volume to a staging path
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (resp *csi.NodeStageVolumeResponse, returnedErr error) {
requestName := "node_stage_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

if len(req.GetVolumeId()) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
}
Expand Down Expand Up @@ -277,9 +271,9 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
klog.V(2).Infof("CSI volume is read-only, mounting with extra option ro")
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
mc := csiMetrics.NewCSIMetricContext("node_stage_volume").WithBasicVolumeInfo(d.cloud.ResourceGroup, "", d.Name)
defer func() {
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
mc.WithAdditionalVolumeInfo(VolumeID, volumeID).Observe(returnedErr == nil)
}()

_, accountName, accountKey, fileShareName, diskName, _, err := d.GetAccountInfo(ctx, volumeID, req.GetSecrets(), context)
Expand Down Expand Up @@ -593,13 +587,6 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe

// NodeUnstageVolume unmount the volume from the staging path
func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) {
requestName := "node_unstage_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand All @@ -615,9 +602,10 @@ func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolume
}
defer d.volumeLocks.Release(lockKey)

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
mc := csiMetrics.NewCSIMetricContext("node_unstage_volume").WithBasicVolumeInfo(d.cloud.ResourceGroup, "", d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
mc.WithAdditionalVolumeInfo(VolumeID, volumeID).Observe(isOperationSucceeded)
}()

klog.V(2).Infof("NodeUnstageVolume: unmount volume %s on %s", volumeID, stagingTargetPath)
Expand Down Expand Up @@ -715,11 +703,11 @@ func (d *Driver) NodeGetVolumeStats(ctx context.Context, req *csi.NodeGetVolumeS
klog.V(6).Infof("NodeGetVolumeStats: begin to get VolumeStats on volume %s path %s", req.VolumeId, req.VolumePath)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_get_volume_stats", d.cloud.ResourceGroup, "", d.Name)
mc.LogLevel = 6 // change log level
azureMC := metrics.NewMetricContext(azureFileCSIDriverName, "node_get_volume_stats", d.cloud.ResourceGroup, "", d.Name)
azureMC.LogLevel = 6 // change log level
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, req.VolumeId)
azureMC.ObserveOperationWithResult(isOperationSucceeded, VolumeID, req.VolumeId)
}()

resp, err := GetVolumeStats(req.VolumePath, d.enableWindowsHostProcess)
Expand Down
68 changes: 59 additions & 9 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,20 @@ limitations under the License.
package metrics

import (
"strings"
"time"

"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
klog "k8s.io/klog/v2"
)

const (
subSystem = "azurefile_csi_driver"

// Label keys for metrics
Protocol = "protocol"
StorageAccountType = "storage_account_type"
)

var (
Expand All @@ -47,7 +53,7 @@ var (
Buckets: []float64{0.1, 0.2, 0.5, 1, 5, 10, 15, 20, 30, 40, 50, 60, 100, 200, 300},
StabilityLevel: metrics.ALPHA,
},
[]string{"operation", "success", "protocol", "storage_account_type"},
[]string{"operation", "success", Protocol, StorageAccountType},
)

operationTotal = metrics.NewCounterVec(
Expand All @@ -69,18 +75,48 @@ func init() {

// CSIMetricContext represents the context for CSI operation metrics
type CSIMetricContext struct {
operation string
start time.Time
labels map[string]string
operation string
volumeContext []interface{}
start time.Time
labels map[string]string
logLevel int32
}

// NewCSIMetricContext creates a new CSI metric context
func NewCSIMetricContext(operation string) *CSIMetricContext {
return &CSIMetricContext{
operation: operation,
start: time.Now(),
labels: make(map[string]string),
operation: operation,
volumeContext: []interface{}{},
start: time.Now(),
labels: make(map[string]string),
logLevel: 3,
}
}

// WithBasicVolumeInfo adds the standard volume-related context to the metric context
func (mc *CSIMetricContext) WithBasicVolumeInfo(resourceGroup, subscriptionID, source string) *CSIMetricContext {
if resourceGroup != "" {
mc.volumeContext = append(mc.volumeContext, "resource_group", strings.ToLower(resourceGroup))
}
if subscriptionID != "" {
mc.volumeContext = append(mc.volumeContext, "subscription_id", subscriptionID)
}
if source != "" {
mc.volumeContext = append(mc.volumeContext, "source", source)
}
return mc
}

// WithAdditionalVolumeInfo adds additional volume-related context as key-value pairs
// e.g., WithAdditionalVolumeInfo("volumeid", "vol-123")
func (mc *CSIMetricContext) WithAdditionalVolumeInfo(keyValuePairs ...string) *CSIMetricContext {
if len(keyValuePairs)%2 != 0 {
return mc
}
for i := 0; i < len(keyValuePairs); i += 2 {
mc.volumeContext = append(mc.volumeContext, keyValuePairs[i], keyValuePairs[i+1])
}
return mc
}

// WithLabel adds a label to the metric context
Expand All @@ -92,6 +128,12 @@ func (mc *CSIMetricContext) WithLabel(key, value string) *CSIMetricContext {
return mc
}

// WithLogLevel sets the log level for the metric context
func (mc *CSIMetricContext) WithLogLevel(level int32) *CSIMetricContext {
mc.logLevel = level
return mc
}

// Observe records the operation result and duration
func (mc *CSIMetricContext) Observe(success bool) {
duration := time.Since(mc.start).Seconds()
Expand All @@ -106,8 +148,8 @@ func (mc *CSIMetricContext) Observe(success bool) {

// Record detailed metrics if labels are present
if len(mc.labels) > 0 {
protocol := mc.labels["protocol"]
storageAccountType := mc.labels["storage_account_type"]
protocol := mc.labels[Protocol]
storageAccountType := mc.labels[StorageAccountType]

operationDurationWithLabels.WithLabelValues(
mc.operation,
Expand All @@ -116,6 +158,14 @@ func (mc *CSIMetricContext) Observe(success bool) {
storageAccountType,
).Observe(duration)
}

logger := klog.Background().WithName("logLatency").V(int(mc.logLevel))
if !logger.Enabled() {
return
}

keysAndValues := []interface{}{"latency_seconds", duration, "request", subSystem + "_" + mc.operation, "success", successStr}
logger.Info("Observed Request Latency", append(keysAndValues, mc.volumeContext...)...)
}

// ObserveWithLabels records the operation with provided label pairs
Expand Down
Loading
Loading