Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion deploy/example/metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,23 @@

## Metrics description

The metrics emitted by the Azure Disk CSI Driver fall broadly into two categories: CSI and Azure Cloud operation latency metrics. The CSI metrics record the latency of the CSI calls made to the driver, e.g. `ControllerPublishVolume`. The Azure Cloud metrics record the latency of Azure Cloud operations performed as part of a driver operation, e.g. `attach_disk`. The individual operation metrics are recorded in two different [histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) metrics using the labels `request` and/or `source` to differentiate among the operations. The table below describes the values of the individual operation metrics.
The metrics emitted by the Azure Disk CSI Driver fall broadly into three categories: CSI driver-specific metrics, CSI operation latency metrics (via cloud-provider-azure), and Azure Cloud API metrics. The tables below describe the available metrics.

### CSI Driver Metrics

These metrics are native to the Azure Disk CSI Driver and provide detailed operation tracking.

| Metric Name | Type | Labels | Description |
|-------------|------|--------|-------------|
| `azuredisk_csi_driver_operations_total` | Counter | `operation`, `success` | Total number of CSI operations |
| `azuredisk_csi_driver_operation_duration_seconds` | Histogram | `operation`, `success` | Duration of CSI operations in seconds |
| `azuredisk_csi_driver_operation_duration_seconds_labeled` | Histogram | `operation`, `success`, `disk_sku` | Duration of CSI operations with additional disk-specific labels |

**Operation values:**
- Controller: `controller_create_volume`, `controller_delete_volume`, `controller_modify_volume`, `controller_publish_volume`, `controller_unpublish_volume`, `controller_expand_volume`, `controller_create_snapshot`, `controller_delete_snapshot`
- Node: `node_stage_volume`, `node_unstage_volume`, `node_publish_volume`, `node_unpublish_volume`, `node_expand_volume`

### CSI Operation Latency Metrics (via cloud-provider-azure)

| Name | `request` | `source` | Description |
|------|-----------|----------|-------------|
Expand All @@ -14,6 +30,11 @@ The metrics emitted by the Azure Disk CSI Driver fall broadly into two categorie
| | `azuredisk_csi_driver_controller_delete_snapshot` | `disk.csi.azure.com` | `ControllerDeleteSnapshot` latency |
| | `azuredisk_csi_driver_controller_publish_volume` | `disk.csi.azure.com` | `ControllerPublishVolume` latency |
| | `azuredisk_csi_driver_controller_unpublish_volume` | `disk.csi.azure.com` | `ControllerUnpublishVolume` latency |

### Azure Cloud API Metrics

| Name | `request` | `source` | Description |
|------|-----------|----------|-------------|
| `cloudprovider_azure_api_request_duration_seconds` | | | Records the Azure Cloud operation metrics |
| | `disks_create_or_update` | | `create_disk` latency |
| | `disks_delete` | | `delete_disk` latency |
Expand Down
100 changes: 76 additions & 24 deletions pkg/azuredisk/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (

consts "sigs.k8s.io/azuredisk-csi-driver/pkg/azureconstants"
"sigs.k8s.io/azuredisk-csi-driver/pkg/azureutils"
csiMetrics "sigs.k8s.io/azuredisk-csi-driver/pkg/metrics"
"sigs.k8s.io/azuredisk-csi-driver/pkg/optimization"
volumehelper "sigs.k8s.io/azuredisk-csi-driver/pkg/util"
azureconsts "sigs.k8s.io/cloud-provider-azure/pkg/consts"
Expand Down Expand Up @@ -380,6 +381,12 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}
}

csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.ObserveWithLabels(isOperationSucceeded, "disk_sku", string(skuName))
}()

if strings.HasSuffix(strings.ToLower(string(skuName)), "zrs") {
klog.V(2).Infof("diskZone(%s) is reset as empty since disk(%s) is ZRS(%s)", diskZone, diskParams.DiskName, skuName)
diskZone = ""
Expand Down Expand Up @@ -454,7 +461,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)

var diskURI string
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI)
}()
Expand Down Expand Up @@ -486,6 +492,13 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)

// DeleteVolume deletes an azure disk
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error) {
metricsRequest := "controller_delete_volume"
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

volumeID := req.GetVolumeId()
if len(volumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in the request")
Expand All @@ -507,14 +520,14 @@ func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest)
defer d.volumeLocks.Release(volumeID)

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_delete_volume", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI)
}()

klog.V(2).Infof("deleting azure disk(%s)", diskURI)
err := d.diskController.DeleteManagedDisk(ctx, diskURI)
klog.V(2).Infof("delete azure disk(%s) returned with %v", diskURI, err)

isOperationSucceeded = (err == nil)
return &csi.DeleteVolumeResponse{}, err
}
Expand Down Expand Up @@ -555,6 +568,13 @@ func (d *Driver) ControllerModifyVolume(ctx context.Context, req *csi.Controller
skuName = ""
}

metricsRequest := "controller_modify_volume"
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.ObserveWithLabels(isOperationSucceeded, "disk_sku", string(skuName))
}()

// Check if this is a SKU migration
var fromSKU armcompute.DiskStorageAccountTypes
var monitorSKUMigration bool
Expand Down Expand Up @@ -587,8 +607,7 @@ func (d *Driver) ControllerModifyVolume(ctx context.Context, req *csi.Controller
SourceType: consts.SourceVolume,
}

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_modify_volume", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI)
}()
Expand All @@ -600,8 +619,6 @@ func (d *Driver) ControllerModifyVolume(ctx context.Context, req *csi.Controller
return nil, status.Errorf(codes.Internal, "%v", err)
}

isOperationSucceeded = true

// Start migration monitoring if this is a SKU change
if monitorSKUMigration {
volSizeBytes := int64(*currentDisk.Properties.DiskSizeGB) * 1024 * 1024 * 1024
Expand All @@ -610,11 +627,19 @@ func (d *Driver) ControllerModifyVolume(ctx context.Context, req *csi.Controller

klog.V(2).Infof("modify azure disk(%s) account type(%s) rg(%s) location(%s) successfully", diskParams.DiskName, skuName, diskParams.ResourceGroup, diskParams.Location)

isOperationSucceeded = true
return &csi.ControllerModifyVolumeResponse{}, err
}

// ControllerPublishVolume attaches an azure disk to the required node
func (d *Driver) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error) {
metricsRequest := "controller_publish_volume"
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

diskURI := req.GetVolumeId()
if len(diskURI) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID not provided")
Expand Down Expand Up @@ -656,8 +681,7 @@ func (d *Driver) ControllerPublishVolume(ctx context.Context, req *csi.Controlle
return nil, status.Errorf(codes.Internal, "%v", err)
}

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_publish_volume", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI, consts.Node, string(nodeName))
}()
Expand Down Expand Up @@ -713,9 +737,7 @@ func (d *Driver) ControllerPublishVolume(ctx context.Context, req *csi.Controlle
d.diskController.AttachDetachInitialDelayInMs = attachDiskInitialDelay
}
lun, err = d.diskController.AttachDisk(ctx, diskName, diskURI, nodeName, cachingMode, disk, occupiedLuns)
if err == nil {
klog.V(2).Infof("Attach operation successful: volume %s attached to node %s.", diskName, nodeName)
} else {
if err != nil {
if derr, ok := err.(*volerr.DanglingAttachError); ok {
if strings.EqualFold(string(nodeName), string(derr.CurrentNode)) {
err := status.Errorf(codes.Internal, "volume %s is actually attached to current node %s, return error", diskURI, nodeName)
Expand Down Expand Up @@ -754,6 +776,13 @@ func (d *Driver) ControllerPublishVolume(ctx context.Context, req *csi.Controlle

// ControllerUnpublishVolume detaches an azure disk from the required node
func (d *Driver) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error) {
metricsRequest := "controller_unpublish_volume"
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

diskURI := req.GetVolumeId()
if len(diskURI) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID not provided")
Expand All @@ -770,8 +799,7 @@ func (d *Driver) ControllerUnpublishVolume(ctx context.Context, req *csi.Control
return nil, status.Errorf(codes.Internal, "%v", err)
}

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_unpublish_volume", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI, consts.Node, string(nodeName))
}()
Expand Down Expand Up @@ -1065,6 +1093,14 @@ func (d *Driver) listVolumesByResourceGroup(ctx context.Context, resourceGroup s

// ControllerExpandVolume expands an azure disk
func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) {
metricsRequest := "controller_expand_volume"
var diskSku string
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.ObserveWithLabels(isOperationSucceeded, "disk_sku", diskSku)
}()

if len(req.GetVolumeId()) == 0 {
return nil, status.Error(codes.InvalidArgument, "Volume ID missing in the request")
}
Expand All @@ -1083,13 +1119,17 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
if rerr != nil {
return nil, status.Errorf(codes.Internal, "GetDiskByURI(%s) failed with error(%v)", diskURI, rerr)
}

if result.SKU != nil && result.SKU.Name != nil {
diskSku = string(*result.SKU.Name)
}

if result == nil || result.Properties == nil || result.Properties.DiskSizeGB == nil {
return nil, status.Errorf(codes.Internal, "could not get size of the disk(%s)", diskURI)
}
oldSize := *resource.NewQuantity(int64(*result.Properties.DiskSizeGB), resource.BinarySI)

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_expand_volume", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.VolumeID, diskURI)
}()
Expand All @@ -1104,8 +1144,6 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
if !ok {
return nil, status.Errorf(codes.Internal, "failed to transform disk size with error(%v)", err)
}

isOperationSucceeded = true
klog.V(2).Infof("expand azure disk(%s) successfully, currentSize(%d)", diskURI, currentSize)

if result.ManagedBy != nil {
Expand All @@ -1120,6 +1158,7 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller
}
}

isOperationSucceeded = true
return &csi.ControllerExpandVolumeResponse{
CapacityBytes: currentSize,
NodeExpansionRequired: true,
Expand All @@ -1128,6 +1167,17 @@ func (d *Driver) ControllerExpandVolume(ctx context.Context, req *csi.Controller

// CreateSnapshot creates a snapshot
func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error) {
metricsRequest := "controller_create_snapshot"
isOperationInProgress := false

csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
if !isOperationInProgress {
csiMC.Observe(isOperationSucceeded)
}
}()

sourceVolumeID := req.GetSourceVolumeId()
if len(sourceVolumeID) == 0 {
return nil, status.Error(codes.InvalidArgument, "CreateSnapshot Source Volume ID must be provided")
Expand Down Expand Up @@ -1270,13 +1320,10 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
}
}

metricsRequest := "controller_create_snapshot"
if crossRegionSnapshotName != "" {
metricsRequest = "controller_create_snapshot_cross_region"
}
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
isOperationInProgress := false
defer func() {
if !isOperationInProgress {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.SourceResourceID, sourceVolumeID, consts.SnapshotName, snapshotName)
Expand Down Expand Up @@ -1367,18 +1414,24 @@ func (d *Driver) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequ
// replace the last token of csiSnapshot.SnapshotId with crossRegionSnapshotName
csiSnapshot.SnapshotId = strings.TrimSuffix(csiSnapshot.SnapshotId, snapshotName) + crossRegionSnapshotName
}

isOperationInProgress = !csiSnapshot.ReadyToUse

createResp := &csi.CreateSnapshotResponse{
Snapshot: csiSnapshot,
}

isOperationSucceeded = true
return createResp, nil
}

// DeleteSnapshot deletes a snapshot
func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) {
metricsRequest := "controller_delete_snapshot"
csiMC := csiMetrics.NewCSIMetricContext(metricsRequest)
isOperationSucceeded := false
defer func() {
csiMC.Observe(isOperationSucceeded)
}()

snapshotID := req.SnapshotId
if len(snapshotID) == 0 {
return nil, status.Error(codes.InvalidArgument, "Snapshot ID must be provided")
Expand All @@ -1396,8 +1449,7 @@ func (d *Driver) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequ
}
}

mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, "controller_delete_snapshot", d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
isOperationSucceeded := false
mc := metrics.NewMetricContext(consts.AzureDiskCSIDriverName, metricsRequest, d.cloud.ResourceGroup, d.cloud.SubscriptionID, d.Name)
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded, consts.SnapshotID, snapshotID)
}()
Expand Down
Loading
Loading