Skip to content

Commit 8f57958

Browse files
committed
chore: add csi specific metrics
1 parent 75c6405 commit 8f57958

File tree

5 files changed

+574
-33
lines changed

5 files changed

+574
-33
lines changed

deploy/example/metrics/README.md

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi
77
2. Get the `EXTERNAL-IP` of the service `csi-azurefile-controller`
88
```console
99
$ kubectl get svc csi-azurefile-controller -n kube-system
10-
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
11-
csi-azurefile-controller ClusterIP 10.0.184.0 20.39.21.132 29614/TCP 47m
10+
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
11+
csi-azurefile-controller LoadBalancer 10.0.184.0 20.39.21.132 29614:30563/TCP 47m
1212
```
1313

1414
3. Run the following command to get cloudprovider_azure operation metrics
@@ -17,8 +17,58 @@ ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{
1717
curl http://$ip:29614/metrics | grep cloudprovider_azure | grep file | grep -e sum -e count
1818
```
1919

20+
4. Run the following command to get CSI-specific operation metrics
21+
```console
22+
ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{print $4}'`
23+
curl http://$ip:29614/metrics | grep azurefile_csi_driver_operation | grep -e sum -e count
24+
```
25+
26+
27+
## CSI Driver Metrics
28+
29+
The Azure File CSI driver exposes the following custom metrics:
30+
31+
### Controller Metrics (port 29614)
32+
33+
| Metric | Type | Labels | Description |
34+
|--------|------|--------|-------------|
35+
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
36+
| `azurefile_csi_driver_operation_duration_seconds_labeled` | Histogram | `operation`, `success`, `protocol`, `storage_account_type` | Duration of CSI operations with additional labels |
37+
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |
38+
39+
**Label Values:**
40+
- `operation`: `controller_create_volume`, `controller_delete_volume`, `controller_create_snapshot`, `controller_delete_snapshot`, `controller_expand_volume`
41+
- `success`: `true`, `false`
42+
- `protocol`: `SMB`, `NFS`
43+
- `storage_account_type`: `Premium_LRS`, `Premium_ZRS`, `Standard_LRS`, `StandardV2_LRS`, `Standard_GRS`, `Standard_ZRS`, etc.
44+
45+
### Node Metrics (port 29615)
46+
47+
| Metric | Type | Labels | Description |
48+
|--------|------|--------|-------------|
49+
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
50+
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |
51+
52+
**Label Values:**
53+
- `operation`: `node_stage_volume`, `node_unstage_volume`, `node_publish_volume`, `node_unpublish_volume`
54+
- `success`: `true`, `false`
55+
56+
### Azure Cloud Provider Metrics
57+
58+
The CSI driver also exposes Azure cloud provider metrics from the underlying Azure SDK operations:
59+
60+
| Metric | Type | Labels | Description |
61+
|--------|------|--------|-------------|
62+
| `cloudprovider_azure_api_request_duration_seconds` | Histogram | `request`, `resource_group`, `subscription_id`, `source`, `result` | Latency of Azure API calls |
63+
| `cloudprovider_azure_api_request_throttled_count` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of throttled Azure API requests |
64+
| `cloudprovider_azure_api_request_errors` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of errors in Azure API requests |
65+
66+
These metrics help monitor Azure API performance, throttling, and error rates for file share operations.
67+
2068
## Get Prometheus metrics from CSI driver node pod
2169

2270
```console
23-
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-hfgrn:29615/proxy/metrics
71+
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-xxxxx:29615/proxy/metrics
2472
```
73+
74+
> **Note:** Replace `csi-azurefile-node-xxxxx` with an actual pod name from `kubectl get pods -n kube-system -l app=csi-azurefile-node`

pkg/azurefile/controllerserver.go

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"k8s.io/apimachinery/pkg/util/wait"
4242
"k8s.io/klog/v2"
4343
"k8s.io/utils/ptr"
44+
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
4445
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
4546
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
4647
"sigs.k8s.io/cloud-provider-azure/pkg/provider/storage"
@@ -515,6 +516,15 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
515516
requestName = "controller_create_volume_from_volume"
516517
}
517518
}
519+
520+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
521+
isOperationSucceeded := false
522+
defer func() {
523+
csiMC.ObserveWithLabels(isOperationSucceeded,
524+
"protocol", string(shareProtocol),
525+
"storage_account_type", sku)
526+
}()
527+
518528
if sourceID != "" {
519529
_, srcAccountName, _, _, _, _, err = GetFileShareInfo(sourceID) //nolint:dogsled
520530
if err != nil {
@@ -558,7 +568,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
558568
}
559569

560570
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, subsID, d.Name)
561-
isOperationSucceeded := false
562571
defer func() {
563572
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
564573
}()
@@ -781,7 +790,13 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
781790
}
782791

783792
// DeleteVolume delete an azure file
784-
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error) {
793+
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) {
794+
requestName := "controller_delete_volume"
795+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
796+
defer func() {
797+
csiMC.Observe(returnedErr == nil)
798+
}()
799+
785800
volumeID := req.GetVolumeId()
786801
if len(volumeID) == 0 {
787802
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -826,10 +841,9 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
826841
secret = createStorageAccountSecret(accountName, accountKey)
827842
}
828843

829-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_volume", resourceGroupName, subsID, d.Name)
830-
isOperationSucceeded := false
844+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
831845
defer func() {
832-
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
846+
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
833847
}()
834848

835849
if err := d.DeleteFileShare(ctx, subsID, resourceGroupName, accountName, fileShareName, secret, useDataPlaneAPI); err != nil {
@@ -840,7 +854,6 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
840854
klog.Warningf("RemoveStorageAccountTag(%s) under rg(%s) account(%s) failed with %v", storage.SkipMatchingTag, resourceGroupName, accountName, err)
841855
}
842856

843-
isOperationSucceeded = true
844857
return &csi.DeleteVolumeResponse{}, nil
845858
}
846859

@@ -936,7 +949,13 @@ func (d *Driver) ControllerUnpublishVolume(_ context.Context, _ *csi.ControllerU
936949
}
937950

938951
// CreateSnapshot create a snapshot
939-
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error) {
952+
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (resp *csi.CreateSnapshotResponse, returnedErr error) {
953+
requestName := "controller_create_snapshot"
954+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
955+
defer func() {
956+
csiMC.Observe(returnedErr == nil)
957+
}()
958+
940959
sourceVolumeID := req.GetSourceVolumeId()
941960
snapshotName := req.Name
942961
if len(snapshotName) == 0 {
@@ -974,10 +993,9 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
974993
useDataPlaneAPI = d.useDataPlaneAPI(ctx, sourceVolumeID, accountName)
975994
}
976995

977-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_create_snapshot", rgName, subsID, d.Name)
978-
isOperationSucceeded := false
996+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
979997
defer func() {
980-
mc.ObserveOperationWithResult(isOperationSucceeded, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
998+
mc.ObserveOperationWithResult(returnedErr == nil, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
981999
}()
9821000

9831001
exists, itemSnapshot, itemSnapshotTime, itemSnapshotQuota, err := d.snapshotExists(ctx, sourceVolumeID, snapshotName, req.GetSecrets(), useDataPlaneAPI)
@@ -1071,7 +1089,7 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
10711089
}
10721090
}
10731091

1074-
createResp := &csi.CreateSnapshotResponse{
1092+
resp = &csi.CreateSnapshotResponse{
10751093
Snapshot: &csi.Snapshot{
10761094
SizeBytes: util.GiBToBytes(int64(itemSnapshotQuota)),
10771095
SnapshotId: sourceVolumeID + "#" + itemSnapshot + "#" + subsID,
@@ -1082,12 +1100,18 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
10821100
},
10831101
}
10841102

1085-
isOperationSucceeded = true
1086-
return createResp, nil
1103+
return resp, nil
10871104
}
10881105

10891106
// DeleteSnapshot delete a snapshot (todo)
10901107
func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) {
1108+
requestName := "controller_delete_snapshot"
1109+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
1110+
isOperationSucceeded := false
1111+
defer func() {
1112+
csiMC.Observe(isOperationSucceeded)
1113+
}()
1114+
10911115
if len(req.SnapshotId) == 0 {
10921116
return nil, status.Error(codes.InvalidArgument, "Snapshot ID must be provided")
10931117
}
@@ -1109,8 +1133,7 @@ func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequ
11091133
subsID = d.cloud.SubscriptionID
11101134
}
11111135

1112-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_snapshot", rgName, subsID, d.Name)
1113-
isOperationSucceeded := false
1136+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
11141137
defer func() {
11151138
mc.ObserveOperationWithResult(isOperationSucceeded, SnapshotID, req.SnapshotId)
11161139
}()
@@ -1280,6 +1303,13 @@ func (d *Driver) execAzcopyCopy(srcPath, dstPath string, azcopyCopyOptions, auth
12801303

12811304
// ControllerExpandVolume controller expand volume
12821305
func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) {
1306+
requestName := "controller_expand_volume"
1307+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
1308+
isOperationSucceeded := false
1309+
defer func() {
1310+
csiMC.Observe(isOperationSucceeded)
1311+
}()
1312+
12831313
volumeID := req.GetVolumeId()
12841314
if len(volumeID) == 0 {
12851315
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -1318,8 +1348,7 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
13181348
}
13191349
}
13201350

1321-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_expand_volume", resourceGroupName, subsID, d.Name)
1322-
isOperationSucceeded := false
1351+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
13231352
defer func() {
13241353
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
13251354
}()
@@ -1402,9 +1431,6 @@ func (d *Driver) snapshotExists(ctx context.Context, sourceVolumeID, snapshotNam
14021431

14031432
// List share snapshots.
14041433
listSnapshot := serviceURL.NewListSharesPager(&service.ListSharesOptions{Include: service.ListSharesInclude{Metadata: true, Snapshots: true}})
1405-
if err != nil {
1406-
return false, "", time.Time{}, 0, err
1407-
}
14081434
for listSnapshot.More() {
14091435
response, err := listSnapshot.NextPage(ctx)
14101436
if err != nil {

pkg/azurefile/nodeserver.go

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"golang.org/x/net/context"
4040
"google.golang.org/grpc"
4141
mount_azurefile "sigs.k8s.io/azurefile-csi-driver/pkg/azurefile-proxy/pb"
42+
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
4243
volumehelper "sigs.k8s.io/azurefile-csi-driver/pkg/util"
4344
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
4445
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
@@ -58,7 +59,12 @@ func NewMountClient(cc *grpc.ClientConn) *MountClient {
5859
}
5960

6061
// NodePublishVolume mount the volume from staging to target path
61-
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
62+
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (resp *csi.NodePublishVolumeResponse, returnedErr error) {
63+
csiMC := csiMetrics.NewCSIMetricContext("node_publish_volume")
64+
defer func() {
65+
csiMC.Observe(returnedErr == nil)
66+
}()
67+
6268
volCap := req.GetVolumeCapability()
6369
if volCap == nil {
6470
return nil, status.Error(codes.InvalidArgument, "Volume capability missing in request")
@@ -197,7 +203,12 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu
197203
}
198204

199205
// NodeUnpublishVolume unmount the volume from the target path
200-
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) {
206+
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (resp *csi.NodeUnpublishVolumeResponse, returnedErr error) {
207+
csiMC := csiMetrics.NewCSIMetricContext("node_unpublish_volume")
208+
defer func() {
209+
csiMC.Observe(returnedErr == nil)
210+
}()
211+
201212
if len(req.GetVolumeId()) == 0 {
202213
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
203214
}
@@ -224,14 +235,19 @@ func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVo
224235
return nil, status.Errorf(codes.Internal, "failed to direct volume remove mount info %s: %v", targetPath, err)
225236
}
226237
}
227-
228238
klog.V(2).Infof("NodeUnpublishVolume: unmount volume %s on %s successfully", volumeID, targetPath)
229239

230240
return &csi.NodeUnpublishVolumeResponse{}, nil
231241
}
232242

233243
// NodeStageVolume mount the volume to a staging path
234-
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error) {
244+
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (resp *csi.NodeStageVolumeResponse, returnedErr error) {
245+
requestName := "node_stage_volume"
246+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
247+
defer func() {
248+
csiMC.Observe(returnedErr == nil)
249+
}()
250+
235251
if len(req.GetVolumeId()) == 0 {
236252
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
237253
}
@@ -261,10 +277,9 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
261277
klog.V(2).Infof("CSI volume is read-only, mounting with extra option ro")
262278
}
263279

264-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_stage_volume", d.cloud.ResourceGroup, "", d.Name)
265-
isOperationSucceeded := false
280+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
266281
defer func() {
267-
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
282+
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
268283
}()
269284

270285
_, accountName, accountKey, fileShareName, diskName, _, err := d.GetAccountInfo(ctx, volumeID, req.GetSecrets(), context)
@@ -573,12 +588,18 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
573588
}
574589
}
575590

576-
isOperationSucceeded = true
577591
return &csi.NodeStageVolumeResponse{}, nil
578592
}
579593

580594
// NodeUnstageVolume unmount the volume from the staging path
581595
func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) {
596+
requestName := "node_unstage_volume"
597+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
598+
isOperationSucceeded := false
599+
defer func() {
600+
csiMC.Observe(isOperationSucceeded)
601+
}()
602+
582603
volumeID := req.GetVolumeId()
583604
if len(volumeID) == 0 {
584605
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -594,8 +615,7 @@ func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolume
594615
}
595616
defer d.volumeLocks.Release(lockKey)
596617

597-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_unstage_volume", d.cloud.ResourceGroup, "", d.Name)
598-
isOperationSucceeded := false
618+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
599619
defer func() {
600620
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
601621
}()

0 commit comments

Comments
 (0)