DataDog · gjulianm · Apr 20, 2026
@@ -96,6 +96,24 @@ type MetricMetadataSpec struct {
 	Description string `yaml:"description,omitempty"`
 }
 
+// UnmarshalYAML decodes a metric metadata block and rejects unsupported
+// metric_type values. An omitted (empty) metric_type is accepted unchanged.
+// The receiver is only overwritten when decoding and validation both succeed.
+func (m *MetricMetadataSpec) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// rawSpec shares the fields of MetricMetadataSpec but none of its methods,
+	// so decoding into it does not re-enter this UnmarshalYAML.
+	type rawSpec MetricMetadataSpec
+
+	var raw rawSpec
+	if err := unmarshal(&raw); err != nil {
+		return fmt.Errorf("unmarshal metric metadata: %w", err)
+	}
+
+	if t := raw.MetricType; t != "" && t != "gauge" && t != "counter" {
+		return fmt.Errorf("invalid metric_type %q: must be one of [gauge, counter]", t)
+	}
+
+	*m = MetricMetadataSpec(raw)
+	return nil
+}
+
 // MetricSpec is a metric definition without the name (name is the map key).
 type MetricSpec struct {
 	Metadata     *MetricMetadataSpec `yaml:"metadata,omitempty"`

@@ -186,6 +186,28 @@ support:
 	require.Equal(t, "Example description", spec.Metadata.Description)
 }
 
+// TestMetricMetadataSpecUnmarshalYAMLRejectsInvalidMetricType checks that a
+// metric_type outside the accepted set makes decoding of the whole metric fail.
+func TestMetricMetadataSpecUnmarshalYAMLRejectsInvalidMetricType(t *testing.T) {
+	// "histogram" is not one of the accepted metric_type values.
+	const document = `
+metadata:
+  metric_type: histogram
+  unit: byte/second
+  description: Example description
+tagsets:
+  - device
+support:
+  unsupported_architectures: []
+  device_modes:
+    physical: true
+    mig: true
+    vgpu: true
+`
+
+	var decoded MetricSpec
+	err := yaml.Unmarshal([]byte(document), &decoded)
+
+	require.Error(t, err)
+	require.ErrorContains(t, err, `invalid metric_type "histogram": must be one of [gauge, counter]`)
+}
+
 func TestLoadedMetricsIncludeMetadata(t *testing.T) {
 	specs, err := LoadSpecs()
 	require.NoError(t, err)

@@ -56,9 +56,10 @@ func NewGPUConfigFromTags(architecture, slicingMode, virtualizationMode string)
 
 // MetricObservation is the normalized observation used by shared validation.
 type MetricObservation struct {
-	Name  string
-	Tags  []string
-	Value *float64
+	Name       string   // metric name the observation is recorded under
+	MetricType string   // observed type, e.g. "gauge" or "counter"; empty skips the type check
+	Tags       []string // tags attached to the observation
+	Value      *float64 // observed value; nil values are not passed to the value validator
 }
 
 const maxInvalidValueSamplesPerMetric = 5
@@ -67,6 +68,7 @@ type MetricStatus struct {
 	Missing             int                    `json:"missing"`
 	Unknown             int                    `json:"unknown"`
 	Unsupported         int                    `json:"unsupported"`
+	WrongType           int                    `json:"wrong_type"`
 	InvalidValue        int                    `json:"invalid_value"`
 	InvalidValueSamples []string               `json:"invalid_value_samples,omitempty"`
 	TagResults          map[string]*TagSummary `json:"tag_results"`
@@ -95,17 +97,31 @@ type ValidationResult struct {
 	Metrics map[string]*MetricStatus `json:"metrics"`
 }
 
+// HasFailures reports whether the metric status records any metric-level or
+// tag-level failure. A nil status and nil entries in TagResults are treated as
+// having no failures.
+func (s *MetricStatus) HasFailures() bool {
+	if s == nil {
+		return false
+	}
+
+	// Compare each counter on its own rather than summing them, so the result
+	// cannot be affected by a negative counter or by integer wrap-around.
+	if s.Missing > 0 || s.Unknown > 0 || s.Unsupported > 0 || s.WrongType > 0 || s.InvalidValue > 0 {
+		return true
+	}
+
+	for _, tagResult := range s.TagResults {
+		// TagResults holds pointers; skip nil entries (for example a JSON null)
+		// instead of dereferencing them.
+		if tagResult == nil {
+			continue
+		}
+		if tagResult.Missing > 0 || tagResult.Unknown > 0 || tagResult.InvalidValue > 0 {
+			return true
+		}
+	}
+
+	return false
+}
+
 // HasFailures returns true when the result contains metric-level or tag-level failures.
+// A nil result is treated as having no failures.
 func (r *ValidationResult) HasFailures() bool {
+	if r == nil {
+		return false
+	}
+
 	for _, status := range r.Metrics {
-		if status.Missing > 0 || status.Unknown > 0 || status.Unsupported > 0 || status.InvalidValue > 0 {
+		// MetricStatus.HasFailures accepts a nil receiver, so nil map entries are safe here.
+		if status.HasFailures() {
 			return true
 		}
-		for _, tagResult := range status.TagResults {
-			if tagResult.Missing > 0 || tagResult.Unknown > 0 || tagResult.InvalidValue > 0 {
-				return true
-			}
-		}
 	}
 	return false
 }
@@ -127,6 +143,14 @@ func (r *ValidationResult) addInvalidValue(metricName string, sample string) {
 	}
 }
 
+// validateMetricType compares the metric type observed on a sample with the
+// type declared in the spec. The comparison is an exact, case-sensitive string
+// match. It returns nil when either side is empty (nothing to compare) or when
+// both match, and a descriptive error otherwise.
+func validateMetricType(expectedMetricType, observedMetricType string) error {
+	switch {
+	case expectedMetricType == "", observedMetricType == "":
+		// One side carries no type information, so there is nothing to check.
+		return nil
+	case expectedMetricType == observedMetricType:
+		return nil
+	}
+
+	return fmt.Errorf("metric type %q does not match expected %q", observedMetricType, expectedMetricType)
+}
+
 // KnownGPUConfigs returns all supported architecture + mode combinations.
 func KnownGPUConfigs(specs *Specs) []GPUConfig {
 	configs := make([]GPUConfig, 0, len(specs.Architectures.Architectures)*3)
@@ -310,16 +334,17 @@ func ValidateEmittedMetricsAgainstSpec(specs *Specs, config GPUConfig, emittedMe
 		metricStatus := results.getMetricStatus(metricName)
 		metricStatus.TagResults = tagResults
 
-		if metricSpec.Validator == nil {
-			continue
-		}
-
 		for _, sample := range metricSamples {
-			if sample.Value == nil {
-				continue
+			if metricSpec.Metadata != nil {
+				if err := validateMetricType(metricSpec.Metadata.MetricType, sample.MetricType); err != nil {
+					results.getMetricStatus(metricName).WrongType++
+				}
 			}
-			if err := metricSpec.Validator.Validate(*sample.Value); err != nil {
-				results.addInvalidValue(metricName, err.Error())
+
+			if metricSpec.Validator != nil && sample.Value != nil {
+				if err := metricSpec.Validator.Validate(*sample.Value); err != nil {
+					results.addInvalidValue(metricName, err.Error())
+				}
 			}
 		}
 	}

@@ -0,0 +1,36 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2026-present Datadog, Inc.
+
+//go:build linux && nvml
+
+package spec
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestValidateMetricType covers the match, mismatch, case-sensitivity and
+// missing-observed-type paths of validateMetricType.
+func TestValidateMetricType(t *testing.T) {
+	cases := []struct {
+		name        string
+		expected    string
+		observed    string
+		wantErrPart string // empty means no error is expected
+	}{
+		{name: "exact match", expected: "gauge", observed: "gauge"},
+		{name: "mismatch returns error", expected: "gauge", observed: "counter", wantErrPart: "does not match expected"},
+		{name: "case mismatch returns error", expected: "gauge", observed: "Gauge", wantErrPart: "does not match expected"},
+		{name: "missing observed type is allowed", expected: "gauge", observed: ""},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateMetricType(tc.expected, tc.observed)
+			if tc.wantErrPart == "" {
+				require.NoError(t, err)
+				return
+			}
+			require.ErrorContains(t, err, tc.wantErrPart)
+		})
+	}
+}
@@ -42,10 +42,16 @@ func GetEmittedGPUMetrics(mockSender *mocksender.MockSender) map[string][]gpuspe
 	metricsByName := make(map[string][]gpuspec.MetricObservation)
 
 	for _, call := range mockSender.Mock.Calls {
-		if call.Method != "GaugeWithTimestamp" && call.Method != "CountWithTimestamp" {
+		metricType := ""
+		switch call.Method {
+		case "GaugeWithTimestamp":
+			metricType = "gauge"
+		case "CountWithTimestamp":
+			metricType = "counter"
+		default: 
 			continue
 		}
-
+		
 		if len(call.Arguments) == 0 {
 			continue
 		}
@@ -70,9 +76,10 @@ func GetEmittedGPUMetrics(mockSender *mocksender.MockSender) map[string][]gpuspe
 		}
 
 		metricsByName[specMetricName] = append(metricsByName[specMetricName], gpuspec.MetricObservation{
-			Name:  specMetricName,
-			Tags:  tags,
-			Value: value,
+			Name:       specMetricName,
+			MetricType: metricType,
+			Tags:       tags,
+			Value:      value,
 		})
 	}
 
@@ -89,6 +96,7 @@ func ValidateEmittedMetricsAgainstSpec(t *testing.T, specs *gpuspec.Specs, confi
 			assert.Zero(t, status.Missing, "metric %s missing in %d cases", metricName, status.Missing)
 			assert.Zero(t, status.Unknown, "metric %s unknown in %d cases", metricName, status.Unknown)
 			assert.Zero(t, status.Unsupported, "metric %s unsupported in %d cases", metricName, status.Unsupported)
+			assert.Zero(t, status.WrongType, "metric %s wrong type in %d cases", metricName, status.WrongType)
 			assert.Zero(t, status.InvalidValue, "metric %s invalid in %d cases", metricName, status.InvalidValue)
 
 			for tag, tagResult := range status.TagResults {