Skip to content

Commit 78e134c

Browse files
committed
chore: add csi specific metrics
1 parent 75c6405 commit 78e134c

File tree

5 files changed

+572
-20
lines changed

5 files changed

+572
-20
lines changed

deploy/example/metrics/README.md

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi
77
2. Get the `EXTERNAL-IP` of the `csi-azurefile-controller` service
88
```console
99
$ kubectl get svc csi-azurefile-controller -n kube-system
10-
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
11-
csi-azurefile-controller ClusterIP 10.0.184.0 20.39.21.132 29614/TCP 47m
10+
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
11+
csi-azurefile-controller LoadBalancer 10.0.184.0 20.39.21.132 29614:30563/TCP 47m
1212
```
1313

1414
3. Run the following command to get cloudprovider_azure operation metrics
@@ -17,8 +17,58 @@ ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{
1717
curl http://$ip:29614/metrics | grep cloudprovider_azure | grep file | grep -e sum -e count
1818
```
1919

20+
4. Run the following command to get CSI-specific operation metrics
21+
```console
22+
ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{print $4}'`
23+
curl http://$ip:29614/metrics | grep azurefile_csi_driver_operation | grep -e sum -e count
24+
```
25+
26+
27+
## CSI Driver Metrics
28+
29+
The Azure File CSI driver exposes the following custom metrics:
30+
31+
### Controller Metrics (port 29614)
32+
33+
| Metric | Type | Labels | Description |
34+
|--------|------|--------|-------------|
35+
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
36+
| `azurefile_csi_driver_operation_duration_seconds_labeled` | Histogram | `operation`, `success`, `protocol`, `storage_account_type` | Duration of CSI operations with additional labels |
37+
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |
38+
39+
**Label Values:**
40+
- `operation`: `controller_create_volume`, `controller_create_volume_from_volume`, `controller_delete_volume`, `controller_create_snapshot`, `controller_delete_snapshot`, `controller_expand_volume`
41+
- `success`: `true`, `false`
42+
- `protocol`: `SMB`, `NFS`
43+
- `storage_account_type`: `Premium_LRS`, `Premium_ZRS`, `Standard_LRS`, `StandardV2_LRS`, `Standard_GRS`, `Standard_ZRS`, etc.
44+
45+
### Node Metrics (port 29615)
46+
47+
| Metric | Type | Labels | Description |
48+
|--------|------|--------|-------------|
49+
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
50+
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |
51+
52+
**Label Values:**
53+
- `operation`: `node_stage_volume`, `node_unstage_volume`, `node_publish_volume`, `node_unpublish_volume`
54+
- `success`: `true`, `false`
55+
56+
### Azure Cloud Provider Metrics
57+
58+
The CSI driver also exposes Azure cloud provider metrics from the underlying Azure SDK operations:
59+
60+
| Metric | Type | Labels | Description |
61+
|--------|------|--------|-------------|
62+
| `cloudprovider_azure_api_request_duration_seconds` | Histogram | `request`, `resource_group`, `subscription_id`, `source`, `result` | Latency of Azure API calls |
63+
| `cloudprovider_azure_api_request_throttled_count` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of throttled Azure API requests |
64+
| `cloudprovider_azure_api_request_errors` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of errors in Azure API requests |
65+
66+
These metrics help monitor Azure API performance, throttling, and error rates for file share operations.
67+
2068
## Get Prometheus metrics from CSI driver node pod
2169

2270
```console
23-
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-hfgrn:29615/proxy/metrics
71+
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-xxxxx:29615/proxy/metrics
2472
```
73+
74+
> **Note:** Replace `csi-azurefile-node-xxxxx` with the name of an actual node pod. To list the pod names, run `kubectl get pods -n kube-system -l app=csi-azurefile-node`.

pkg/azurefile/controllerserver.go

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"k8s.io/apimachinery/pkg/util/wait"
4242
"k8s.io/klog/v2"
4343
"k8s.io/utils/ptr"
44+
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
4445
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
4546
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
4647
"sigs.k8s.io/cloud-provider-azure/pkg/provider/storage"
@@ -515,6 +516,15 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
515516
requestName = "controller_create_volume_from_volume"
516517
}
517518
}
519+
520+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
521+
isOperationSucceeded := false
522+
defer func() {
523+
csiMC.ObserveWithLabels(isOperationSucceeded,
524+
"protocol", string(shareProtocol),
525+
"storage_account_type", sku)
526+
}()
527+
518528
if sourceID != "" {
519529
_, srcAccountName, _, _, _, _, err = GetFileShareInfo(sourceID) //nolint:dogsled
520530
if err != nil {
@@ -558,7 +568,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
558568
}
559569

560570
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, subsID, d.Name)
561-
isOperationSucceeded := false
562571
defer func() {
563572
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
564573
}()
@@ -782,6 +791,13 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
782791

783792
// DeleteVolume deletes an Azure file volume
784793
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error) {
794+
requestName := "controller_delete_volume"
795+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
796+
isOperationSucceeded := false
797+
defer func() {
798+
csiMC.Observe(isOperationSucceeded)
799+
}()
800+
785801
volumeID := req.GetVolumeId()
786802
if len(volumeID) == 0 {
787803
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -826,8 +842,7 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
826842
secret = createStorageAccountSecret(accountName, accountKey)
827843
}
828844

829-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_volume", resourceGroupName, subsID, d.Name)
830-
isOperationSucceeded := false
845+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
831846
defer func() {
832847
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
833848
}()
@@ -937,6 +952,13 @@ func (d *Driver) ControllerUnpublishVolume(_ context.Context, _ *csi.ControllerU
937952

938953
// CreateSnapshot creates a snapshot
939954
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error) {
955+
requestName := "controller_create_snapshot"
956+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
957+
isOperationSucceeded := false
958+
defer func() {
959+
csiMC.Observe(isOperationSucceeded)
960+
}()
961+
940962
sourceVolumeID := req.GetSourceVolumeId()
941963
snapshotName := req.Name
942964
if len(snapshotName) == 0 {
@@ -974,8 +996,7 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
974996
useDataPlaneAPI = d.useDataPlaneAPI(ctx, sourceVolumeID, accountName)
975997
}
976998

977-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_create_snapshot", rgName, subsID, d.Name)
978-
isOperationSucceeded := false
999+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
9791000
defer func() {
9801001
mc.ObserveOperationWithResult(isOperationSucceeded, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
9811002
}()
@@ -1088,6 +1109,13 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
10881109

10891110
// DeleteSnapshot deletes a snapshot (todo)
10901111
func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) {
1112+
requestName := "controller_delete_snapshot"
1113+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
1114+
isOperationSucceeded := false
1115+
defer func() {
1116+
csiMC.Observe(isOperationSucceeded)
1117+
}()
1118+
10911119
if len(req.SnapshotId) == 0 {
10921120
return nil, status.Error(codes.InvalidArgument, "Snapshot ID must be provided")
10931121
}
@@ -1109,8 +1137,7 @@ func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequ
11091137
subsID = d.cloud.SubscriptionID
11101138
}
11111139

1112-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_snapshot", rgName, subsID, d.Name)
1113-
isOperationSucceeded := false
1140+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
11141141
defer func() {
11151142
mc.ObserveOperationWithResult(isOperationSucceeded, SnapshotID, req.SnapshotId)
11161143
}()
@@ -1280,6 +1307,13 @@ func (d *Driver) execAzcopyCopy(srcPath, dstPath string, azcopyCopyOptions, auth
12801307

12811308
// ControllerExpandVolume handles a controller-side volume expansion request
12821309
func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) {
1310+
requestName := "controller_expand_volume"
1311+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
1312+
isOperationSucceeded := false
1313+
defer func() {
1314+
csiMC.Observe(isOperationSucceeded)
1315+
}()
1316+
12831317
volumeID := req.GetVolumeId()
12841318
if len(volumeID) == 0 {
12851319
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -1318,8 +1352,7 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
13181352
}
13191353
}
13201354

1321-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_expand_volume", resourceGroupName, subsID, d.Name)
1322-
isOperationSucceeded := false
1355+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
13231356
defer func() {
13241357
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
13251358
}()
@@ -1402,9 +1435,6 @@ func (d *Driver) snapshotExists(ctx context.Context, sourceVolumeID, snapshotNam
14021435

14031436
// List share snapshots.
14041437
listSnapshot := serviceURL.NewListSharesPager(&service.ListSharesOptions{Include: service.ListSharesInclude{Metadata: true, Snapshots: true}})
1405-
if err != nil {
1406-
return false, "", time.Time{}, 0, err
1407-
}
14081438
for listSnapshot.More() {
14091439
response, err := listSnapshot.NextPage(ctx)
14101440
if err != nil {

pkg/azurefile/nodeserver.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"golang.org/x/net/context"
4040
"google.golang.org/grpc"
4141
mount_azurefile "sigs.k8s.io/azurefile-csi-driver/pkg/azurefile-proxy/pb"
42+
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
4243
volumehelper "sigs.k8s.io/azurefile-csi-driver/pkg/util"
4344
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
4445
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
@@ -59,6 +60,12 @@ func NewMountClient(cc *grpc.ClientConn) *MountClient {
5960

6061
// NodePublishVolume mounts the volume from the staging path to the target path
6162
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
63+
csiMC := csiMetrics.NewCSIMetricContext("node_publish_volume")
64+
isOperationSucceeded := false
65+
defer func() {
66+
csiMC.Observe(isOperationSucceeded)
67+
}()
68+
6269
volCap := req.GetVolumeCapability()
6370
if volCap == nil {
6471
return nil, status.Error(codes.InvalidArgument, "Volume capability missing in request")
@@ -191,13 +198,20 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu
191198
}
192199
return nil, status.Errorf(codes.Internal, "Could not mount %s at %s: %v", source, target, err)
193200
}
201+
isOperationSucceeded = true
194202
klog.V(2).Infof("NodePublishVolume: mount %s at %s successfully", source, target)
195203

196204
return &csi.NodePublishVolumeResponse{}, nil
197205
}
198206

199207
// NodeUnpublishVolume unmounts the volume from the target path
200208
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) {
209+
csiMC := csiMetrics.NewCSIMetricContext("node_unpublish_volume")
210+
isOperationSucceeded := false
211+
defer func() {
212+
csiMC.Observe(isOperationSucceeded)
213+
}()
214+
201215
if len(req.GetVolumeId()) == 0 {
202216
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
203217
}
@@ -224,14 +238,21 @@ func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVo
224238
return nil, status.Errorf(codes.Internal, "failed to direct volume remove mount info %s: %v", targetPath, err)
225239
}
226240
}
227-
241+
isOperationSucceeded = true
228242
klog.V(2).Infof("NodeUnpublishVolume: unmount volume %s on %s successfully", volumeID, targetPath)
229243

230244
return &csi.NodeUnpublishVolumeResponse{}, nil
231245
}
232246

233247
// NodeStageVolume mounts the volume to a staging path
234248
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error) {
249+
requestName := "node_stage_volume"
250+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
251+
isOperationSucceeded := false
252+
defer func() {
253+
csiMC.Observe(isOperationSucceeded)
254+
}()
255+
235256
if len(req.GetVolumeId()) == 0 {
236257
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
237258
}
@@ -261,8 +282,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
261282
klog.V(2).Infof("CSI volume is read-only, mounting with extra option ro")
262283
}
263284

264-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_stage_volume", d.cloud.ResourceGroup, "", d.Name)
265-
isOperationSucceeded := false
285+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
266286
defer func() {
267287
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
268288
}()
@@ -579,6 +599,13 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
579599

580600
// NodeUnstageVolume unmounts the volume from the staging path
581601
func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) {
602+
requestName := "node_unstage_volume"
603+
csiMC := csiMetrics.NewCSIMetricContext(requestName)
604+
isOperationSucceeded := false
605+
defer func() {
606+
csiMC.Observe(isOperationSucceeded)
607+
}()
608+
582609
volumeID := req.GetVolumeId()
583610
if len(volumeID) == 0 {
584611
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
@@ -594,8 +621,7 @@ func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolume
594621
}
595622
defer d.volumeLocks.Release(lockKey)
596623

597-
mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_unstage_volume", d.cloud.ResourceGroup, "", d.Name)
598-
isOperationSucceeded := false
624+
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
599625
defer func() {
600626
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
601627
}()

0 commit comments

Comments
 (0)