Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions pkg/collector/corechecks/gpu/spec/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ type MetricMetadataSpec struct {
Description string `yaml:"description,omitempty"`
}

// UnmarshalYAML decodes metric metadata and rejects unsupported metric_type values.
//
// An empty metric_type, "gauge" and "counter" are accepted. Any other value
// produces an error, and m is only overwritten once the decoded value has
// passed that check.
func (m *MetricMetadataSpec) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// raw has the same fields as MetricMetadataSpec but, being a new local
	// type, carries none of its methods.
	type raw MetricMetadataSpec

	var parsed raw
	err := unmarshal(&parsed)
	if err != nil {
		return fmt.Errorf("unmarshal metric metadata: %w", err)
	}

	if t := parsed.MetricType; t != "" && t != "gauge" && t != "counter" {
		return fmt.Errorf("invalid metric_type %q: must be one of [gauge, counter]", t)
	}

	*m = MetricMetadataSpec(parsed)
	return nil
}

// MetricSpec is a metric definition without the name (name is the map key).
type MetricSpec struct {
Metadata *MetricMetadataSpec `yaml:"metadata,omitempty"`
Expand Down
22 changes: 22 additions & 0 deletions pkg/collector/corechecks/gpu/spec/spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,28 @@ support:
require.Equal(t, "Example description", spec.Metadata.Description)
}

// TestMetricMetadataSpecUnmarshalYAMLRejectsInvalidMetricType decodes a metric
// definition whose metric_type is "histogram" into a MetricSpec and expects
// yaml.Unmarshal to fail with the invalid metric_type message.
func TestMetricMetadataSpecUnmarshalYAMLRejectsInvalidMetricType(t *testing.T) {
var spec MetricSpec

// NOTE(review): the YAML literal below shows no nesting indentation in this
// view; confirm that metric_type is nested under metadata in the actual file,
// otherwise the metadata decoder would not see it.
err := yaml.Unmarshal([]byte(`
metadata:
metric_type: histogram
unit: byte/second
description: Example description
tagsets:
- device
support:
unsupported_architectures: []
device_modes:
physical: true
mig: true
vgpu: true
`), &spec)

// The error must name the rejected value and list the accepted ones.
require.Error(t, err)
require.ErrorContains(t, err, `invalid metric_type "histogram": must be one of [gauge, counter]`)
}

func TestLoadedMetricsIncludeMetadata(t *testing.T) {
specs, err := LoadSpecs()
require.NoError(t, err)
Expand Down
59 changes: 42 additions & 17 deletions pkg/collector/corechecks/gpu/spec/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ func NewGPUConfigFromTags(architecture, slicingMode, virtualizationMode string)

// MetricObservation is the normalized observation used by shared validation.
//
// Each field is declared exactly once; MetricType sits alongside the name,
// tags and value so validation can compare it with the spec's metric type.
type MetricObservation struct {
	// Name is the metric name the observation is recorded under.
	Name string

	// MetricType is the observed metric type (for example "gauge" or
	// "counter"). An empty string means the type was not recorded.
	MetricType string

	// Tags are the tags attached to the observation.
	Tags []string

	// Value is the observed value; nil means no value is available.
	Value *float64
}

// maxInvalidValueSamplesPerMetric is presumably the per-metric cap on the
// number of invalid-value samples that are kept; the code that uses it is
// outside this view — TODO confirm.
const maxInvalidValueSamplesPerMetric = 5
Expand All @@ -67,6 +68,7 @@ type MetricStatus struct {
Missing int `json:"missing"`
Unknown int `json:"unknown"`
Unsupported int `json:"unsupported"`
WrongType int `json:"wrong_type"`
InvalidValue int `json:"invalid_value"`
InvalidValueSamples []string `json:"invalid_value_samples,omitempty"`
TagResults map[string]*TagSummary `json:"tag_results"`
Expand Down Expand Up @@ -95,17 +97,31 @@ type ValidationResult struct {
Metrics map[string]*MetricStatus `json:"metrics"`
}

// HasFailures returns true when the metric status contains metric-level or tag-level failures.
//
// Metric-level failures are any non-zero Missing, Unknown, Unsupported,
// WrongType or InvalidValue count. Tag-level failures are any tag result with
// a non-zero Missing, Unknown or InvalidValue count. A nil receiver reports
// no failures.
func (s *MetricStatus) HasFailures() bool {
	if s == nil {
		return false
	}

	if s.Missing+s.Unknown+s.Unsupported+s.WrongType+s.InvalidValue > 0 {
		return true
	}

	for _, tagResult := range s.TagResults {
		// TagResults stores pointers, so an entry may be nil (for example
		// after decoding a JSON null); treat it as carrying no failures
		// instead of dereferencing it.
		if tagResult == nil {
			continue
		}
		if tagResult.Missing > 0 || tagResult.Unknown > 0 || tagResult.InvalidValue > 0 {
			return true
		}
	}

	return false
}

// HasFailures returns true when the result contains metric-level or tag-level failures.
func (r *ValidationResult) HasFailures() bool {
for _, status := range r.Metrics {
if status.Missing > 0 || status.Unknown > 0 || status.Unsupported > 0 || status.InvalidValue > 0 {
if status.HasFailures() {
return true
}
for _, tagResult := range status.TagResults {
if tagResult.Missing > 0 || tagResult.Unknown > 0 || tagResult.InvalidValue > 0 {
return true
}
}
}
return false
}
Expand All @@ -127,6 +143,14 @@ func (r *ValidationResult) addInvalidValue(metricName string, sample string) {
}
}

// validateMetricType compares the metric type declared in the spec with the
// one observed on an emitted sample.
//
// It returns nil when either side is empty (nothing to compare) or when both
// strings are identical; the comparison is case-sensitive. Otherwise it
// returns an error naming the observed and the expected type.
func validateMetricType(expectedMetricType, observedMetricType string) error {
	switch {
	case expectedMetricType == "", observedMetricType == "":
		// One side carries no type information, so there is no mismatch to report.
		return nil
	case observedMetricType == expectedMetricType:
		return nil
	}

	return fmt.Errorf("metric type %q does not match expected %q", observedMetricType, expectedMetricType)
}

// KnownGPUConfigs returns all supported architecture + mode combinations.
func KnownGPUConfigs(specs *Specs) []GPUConfig {
configs := make([]GPUConfig, 0, len(specs.Architectures.Architectures)*3)
Expand Down Expand Up @@ -310,16 +334,17 @@ func ValidateEmittedMetricsAgainstSpec(specs *Specs, config GPUConfig, emittedMe
metricStatus := results.getMetricStatus(metricName)
metricStatus.TagResults = tagResults

if metricSpec.Validator == nil {
continue
}

for _, sample := range metricSamples {
if sample.Value == nil {
continue
if metricSpec.Metadata != nil {
if err := validateMetricType(metricSpec.Metadata.MetricType, sample.MetricType); err != nil {
results.getMetricStatus(metricName).WrongType++
}
}
if err := metricSpec.Validator.Validate(*sample.Value); err != nil {
results.addInvalidValue(metricName, err.Error())

if metricSpec.Validator != nil && sample.Value != nil {
if err := metricSpec.Validator.Validate(*sample.Value); err != nil {
results.addInvalidValue(metricName, err.Error())
}
}
}
}
Expand Down
36 changes: 36 additions & 0 deletions pkg/collector/corechecks/gpu/spec/validation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2026-present Datadog, Inc.

//go:build linux && nvml

package spec

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestValidateMetricType exercises validateMetricType with matching,
// mismatching, differently-cased and empty observed metric types.
func TestValidateMetricType(t *testing.T) {
	cases := []struct {
		name     string
		expected string
		observed string
		wantErr  bool
	}{
		{name: "exact match", expected: "gauge", observed: "gauge"},
		{name: "mismatch returns error", expected: "gauge", observed: "counter", wantErr: true},
		{name: "case mismatch returns error", expected: "gauge", observed: "Gauge", wantErr: true},
		{name: "missing observed type is allowed", expected: "gauge", observed: ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := validateMetricType(tc.expected, tc.observed)
			if !tc.wantErr {
				require.NoError(t, err)
				return
			}
			require.ErrorContains(t, err, "does not match expected")
		})
	}
}
18 changes: 13 additions & 5 deletions pkg/collector/corechecks/gpu/testutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,16 @@ func GetEmittedGPUMetrics(mockSender *mocksender.MockSender) map[string][]gpuspe
metricsByName := make(map[string][]gpuspec.MetricObservation)

for _, call := range mockSender.Mock.Calls {
if call.Method != "GaugeWithTimestamp" && call.Method != "CountWithTimestamp" {
metricType := ""
switch call.Method {
case "GaugeWithTimestamp":
metricType = "gauge"
case "CountWithTimestamp":
metricType = "counter"
default:
continue
}

if len(call.Arguments) == 0 {
continue
}
Expand All @@ -70,9 +76,10 @@ func GetEmittedGPUMetrics(mockSender *mocksender.MockSender) map[string][]gpuspe
}

metricsByName[specMetricName] = append(metricsByName[specMetricName], gpuspec.MetricObservation{
Name: specMetricName,
Tags: tags,
Value: value,
Name: specMetricName,
MetricType: metricType,
Tags: tags,
Value: value,
})
}

Expand All @@ -89,6 +96,7 @@ func ValidateEmittedMetricsAgainstSpec(t *testing.T, specs *gpuspec.Specs, confi
assert.Zero(t, status.Missing, "metric %s missing in %d cases", metricName, status.Missing)
assert.Zero(t, status.Unknown, "metric %s unknown in %d cases", metricName, status.Unknown)
assert.Zero(t, status.Unsupported, "metric %s unsupported in %d cases", metricName, status.Unsupported)
assert.Zero(t, status.WrongType, "metric %s wrong type in %d cases", metricName, status.WrongType)
assert.Zero(t, status.InvalidValue, "metric %s invalid in %d cases", metricName, status.InvalidValue)

for tag, tagResult := range status.TagResults {
Expand Down
Loading