Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 53 additions & 3 deletions deploy/example/metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/azurefile-csi
2. Get the `EXTERNAL-IP` of the service `csi-azurefile-controller`
```console
$ kubectl get svc csi-azurefile-controller -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
csi-azurefile-controller ClusterIP 10.0.184.0 20.39.21.132 29614/TCP 47m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
csi-azurefile-controller LoadBalancer 10.0.184.0 20.39.21.132 29614:30563/TCP 47m
```

3. Run the following command to get cloudprovider_azure operation metrics
Expand All @@ -17,8 +17,58 @@ ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{
curl http://$ip:29614/metrics | grep cloudprovider_azure | grep file | grep -e sum -e count
```

4. Run the following command to get CSI-specific operation metrics
```console
ip=`kubectl get svc csi-azurefile-controller -n kube-system | grep file | awk '{print $4}'`
curl http://$ip:29614/metrics | grep azurefile_csi_driver_operation | grep -e sum -e count
```


## CSI Driver Metrics

The Azure File CSI driver exposes the following custom metrics:

### Controller Metrics (port 29614)

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
| `azurefile_csi_driver_operation_duration_seconds_labeled` | Histogram | `operation`, `success`, `protocol`, `storage_account_type` | Duration of CSI operations with additional labels |
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |

**Label Values:**
- `operation`: `controller_create_volume`, `controller_delete_volume`, `controller_create_snapshot`, `controller_delete_snapshot`, `controller_expand_volume`
- `success`: `true`, `false`
- `protocol`: `SMB`, `NFS`
- `storage_account_type`: `Premium_LRS`, `Premium_ZRS`, `Standard_LRS`, `StandardV2_LRS`, `Standard_GRS`, `Standard_ZRS`, etc.

### Node Metrics (port 29615)

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `azurefile_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
| `azurefile_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |

**Label Values:**
- `operation`: `node_stage_volume`, `node_unstage_volume`, `node_publish_volume`, `node_unpublish_volume`
- `success`: `true`, `false`

### Azure Cloud Provider Metrics

The CSI driver also exposes Azure cloud provider metrics from the underlying Azure SDK operations:

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `cloudprovider_azure_api_request_duration_seconds` | Histogram | `request`, `resource_group`, `subscription_id`, `source`, `result` | Latency of Azure API calls |
| `cloudprovider_azure_api_request_throttled_count` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of throttled Azure API requests |
| `cloudprovider_azure_api_request_errors` | Counter | `request`, `resource_group`, `subscription_id`, `source` | Number of errors in Azure API requests |

These metrics help monitor Azure API performance, throttling, and error rates for file share operations.

## Get Prometheus metrics from CSI driver node pod

```console
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-hfgrn:29615/proxy/metrics
kubectl get --raw /api/v1/namespaces/kube-system/pods/csi-azurefile-node-xxxxx:29615/proxy/metrics
```

> **Note:** Replace `csi-azurefile-node-xxxxx` with an actual pod name from `kubectl get pods -n kube-system -l app=csi-azurefile-node`
66 changes: 46 additions & 20 deletions pkg/azurefile/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
"sigs.k8s.io/cloud-provider-azure/pkg/provider/storage"
Expand Down Expand Up @@ -515,6 +516,15 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
requestName = "controller_create_volume_from_volume"
}
}

csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.ObserveWithLabels(isOperationSucceeded,
"protocol", string(shareProtocol),
"storage_account_type", sku)
}()

if sourceID != "" {
_, srcAccountName, _, _, _, _, err = GetFileShareInfo(sourceID) //nolint:dogsled
if err != nil {
Expand Down Expand Up @@ -558,7 +568,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, subsID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
}()
Expand Down Expand Up @@ -781,7 +790,13 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}

// DeleteVolume deletes an Azure file share
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error) {
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) {
requestName := "controller_delete_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand Down Expand Up @@ -826,10 +841,9 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
secret = createStorageAccountSecret(accountName, accountKey)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_volume", resourceGroupName, subsID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
}()

if err := d.DeleteFileShare(ctx, subsID, resourceGroupName, accountName, fileShareName, secret, useDataPlaneAPI); err != nil {
Expand All @@ -840,7 +854,6 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
klog.Warningf("RemoveStorageAccountTag(%s) under rg(%s) account(%s) failed with %v", storage.SkipMatchingTag, resourceGroupName, accountName, err)
}

isOperationSucceeded = true
return &csi.DeleteVolumeResponse{}, nil
}

Expand Down Expand Up @@ -936,7 +949,13 @@ func (d *Driver) ControllerUnpublishVolume(_ context.Context, _ *csi.ControllerU
}

// CreateSnapshot creates a snapshot
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error) {
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (resp *csi.CreateSnapshotResponse, returnedErr error) {
requestName := "controller_create_snapshot"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

sourceVolumeID := req.GetSourceVolumeId()
snapshotName := req.Name
if len(snapshotName) == 0 {
Expand Down Expand Up @@ -974,10 +993,9 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
useDataPlaneAPI = d.useDataPlaneAPI(ctx, sourceVolumeID, accountName)
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_create_snapshot", rgName, subsID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
mc.ObserveOperationWithResult(returnedErr == nil, SourceResourceID, sourceVolumeID, SnapshotName, snapshotName)
}()

exists, itemSnapshot, itemSnapshotTime, itemSnapshotQuota, err := d.snapshotExists(ctx, sourceVolumeID, snapshotName, req.GetSecrets(), useDataPlaneAPI)
Expand Down Expand Up @@ -1071,7 +1089,7 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
}
}

createResp := &csi.CreateSnapshotResponse{
resp = &csi.CreateSnapshotResponse{
Snapshot: &csi.Snapshot{
SizeBytes: util.GiBToBytes(int64(itemSnapshotQuota)),
SnapshotId: sourceVolumeID + "#" + itemSnapshot + "#" + subsID,
Expand All @@ -1082,12 +1100,18 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
},
}

isOperationSucceeded = true
return createResp, nil
return resp, nil
}

// DeleteSnapshot deletes a snapshot (todo)
func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) {
requestName := "controller_delete_snapshot"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

if len(req.SnapshotId) == 0 {
return nil, status.Error(codes.InvalidArgument, "Snapshot ID must be provided")
}
Expand All @@ -1109,8 +1133,7 @@ func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequ
subsID = d.cloud.SubscriptionID
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_delete_snapshot", rgName, subsID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, rgName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, SnapshotID, req.SnapshotId)
}()
Expand Down Expand Up @@ -1280,6 +1303,13 @@ func (d *Driver) execAzcopyCopy(srcPath, dstPath string, azcopyCopyOptions, auth

// ControllerExpandVolume expands a volume
func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) {
requestName := "controller_expand_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand Down Expand Up @@ -1318,8 +1348,7 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
}
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "controller_expand_volume", resourceGroupName, subsID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, resourceGroupName, subsID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
}()
Expand Down Expand Up @@ -1402,9 +1431,6 @@ func (d *Driver) snapshotExists(ctx context.Context, sourceVolumeID, snapshotNam

// List share snapshots.
listSnapshot := serviceURL.NewListSharesPager(&service.ListSharesOptions{Include: service.ListSharesInclude{Metadata: true, Snapshots: true}})
if err != nil {
return false, "", time.Time{}, 0, err
}
for listSnapshot.More() {
response, err := listSnapshot.NextPage(ctx)
if err != nil {
Expand Down
40 changes: 30 additions & 10 deletions pkg/azurefile/nodeserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"golang.org/x/net/context"
"google.golang.org/grpc"
mount_azurefile "sigs.k8s.io/azurefile-csi-driver/pkg/azurefile-proxy/pb"
csiMetrics "sigs.k8s.io/azurefile-csi-driver/pkg/metrics"
volumehelper "sigs.k8s.io/azurefile-csi-driver/pkg/util"
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
Expand All @@ -58,7 +59,12 @@ func NewMountClient(cc *grpc.ClientConn) *MountClient {
}

// NodePublishVolume mounts the volume from the staging path to the target path
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (resp *csi.NodePublishVolumeResponse, returnedErr error) {
csiMC := csiMetrics.NewCSIMetricContext("node_publish_volume")
defer func() {
csiMC.Observe(returnedErr == nil)
}()

volCap := req.GetVolumeCapability()
if volCap == nil {
return nil, status.Error(codes.InvalidArgument, "Volume capability missing in request")
Expand Down Expand Up @@ -197,7 +203,12 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu
}

// NodeUnpublishVolume unmounts the volume from the target path
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) {
func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVolumeRequest) (resp *csi.NodeUnpublishVolumeResponse, returnedErr error) {
csiMC := csiMetrics.NewCSIMetricContext("node_unpublish_volume")
defer func() {
csiMC.Observe(returnedErr == nil)
}()

if len(req.GetVolumeId()) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
}
Expand All @@ -224,14 +235,19 @@ func (d *Driver) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpublishVo
return nil, status.Errorf(codes.Internal, "failed to direct volume remove mount info %s: %v", targetPath, err)
}
}

klog.V(2).Infof("NodeUnpublishVolume: unmount volume %s on %s successfully", volumeID, targetPath)

return &csi.NodeUnpublishVolumeResponse{}, nil
}

// NodeStageVolume mounts the volume to a staging path
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error) {
func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (resp *csi.NodeStageVolumeResponse, returnedErr error) {
requestName := "node_stage_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
defer func() {
csiMC.Observe(returnedErr == nil)
}()

if len(req.GetVolumeId()) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
}
Expand Down Expand Up @@ -261,10 +277,9 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
klog.V(2).Infof("CSI volume is read-only, mounting with extra option ro")
}

mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_stage_volume", d.cloud.ResourceGroup, "", d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
mc.ObserveOperationWithResult(returnedErr == nil, VolumeID, volumeID)
}()

_, accountName, accountKey, fileShareName, diskName, _, err := d.GetAccountInfo(ctx, volumeID, req.GetSecrets(), context)
Expand Down Expand Up @@ -573,12 +588,18 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe
}
}

isOperationSucceeded = true
return &csi.NodeStageVolumeResponse{}, nil
}

// NodeUnstageVolume unmounts the volume from the staging path
func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error) {
requestName := "node_unstage_volume"
csiMC := csiMetrics.NewCSIMetricContext(requestName)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in request")
Expand All @@ -594,8 +615,7 @@ func (d *Driver) NodeUnstageVolume(_ context.Context, req *csi.NodeUnstageVolume
}
defer d.volumeLocks.Release(lockKey)

mc := metrics.NewMetricContext(azureFileCSIDriverName, "node_unstage_volume", d.cloud.ResourceGroup, "", d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(azureFileCSIDriverName, requestName, d.cloud.ResourceGroup, "", d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, VolumeID, volumeID)
}()
Expand Down
Loading
Loading