Skip to content

Commit 4cbee6b

Browse files
committed
feat(telemetry): add structured status code classification for genai errors
Introduce a `classifyByStatusCode` helper that probes for an HTTP status code via a `StatusCode() int` method before falling back to substring matching. This prevents false positives when error messages incidentally contain strings like "401", "403", or "429" in request IDs, byte counts, or status-line fragments. Providers that expose HTTP status codes through a structured interface now get classified from the structural signal, while text-only errors continue to use the existing heuristic. Also add documentation clarifying that `getInstruments` binds to the global MeterProvider on first call via `sync.Once`, which affects test setup requirements.
1 parent a4ce95e commit 4cbee6b

3 files changed

Lines changed: 47 additions & 0 deletions

File tree

pkg/telemetry/genai/errors.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,18 @@ func ClassifyError(err error) string {
3232
return "deadline_exceeded"
3333
}
3434

35+
// Prefer a structured status-code probe before falling back to
36+
// substring matching. The string heuristic below trips on any
37+
// error message that incidentally contains "401", "403", "429" —
38+
// request IDs, byte counts, status-line fragments, etc. Providers
39+
// that surface HTTP status codes via a `StatusCode() int` method
40+
// (or via an OTel-style `HTTPStatusCode() int`) get classified
41+
// from the structural signal, while text-only errors fall through
42+
// to the heuristic.
43+
if t := classifyByStatusCode(err); t != "" {
44+
return t
45+
}
46+
3547
msg := strings.ToLower(err.Error())
3648
switch {
3749
case strings.Contains(msg, "context length") || strings.Contains(msg, "context_length"):
@@ -61,6 +73,27 @@ func ClassifyError(err error) string {
6173
return ErrorTypeOther
6274
}
6375

76+
// classifyByStatusCode returns a low-cardinality `error.type` when err
// (or anything in its wrap chain) exposes an HTTP status code through a
// `StatusCode() int` or `HTTPStatusCode() int` method and that code is
// one of the cases handled here (401, 403, 429).
//
// It returns "" when err is nil, when nothing in the chain exposes a
// status code, or when the exposed code has no dedicated label, so the
// caller can fall through to substring heuristics.
func classifyByStatusCode(err error) string {
	code, ok := httpStatusCode(err)
	if !ok {
		return ""
	}
	switch code {
	case 401:
		return "auth"
	case 403:
		return "forbidden"
	case 429:
		return "rate_limit"
	}
	return ""
}

// httpStatusCode extracts an HTTP status code from err's wrap chain.
// `StatusCode() int` is preferred; `HTTPStatusCode() int` is consulted
// only when no error in the chain implements the former, so errors
// that already exposed `StatusCode()` are classified exactly as before.
func httpStatusCode(err error) (int, bool) {
	var sc interface{ StatusCode() int }
	if errors.As(err, &sc) {
		return sc.StatusCode(), true
	}
	var hsc interface{ HTTPStatusCode() int }
	if errors.As(err, &hsc) {
		return hsc.HTTPStatusCode(), true
	}
	return 0, false
}
96+
6497
// applyExtraAttribute converts a StreamAttributer KeyValue into an OTel
6598
// attribute and applies it to the span. Unsupported value types are
6699
// dropped silently — telemetry must never crash request paths.

pkg/telemetry/genai/metrics.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,14 @@ var (
4444
// previously a single early return left every metric permanently
4545
// disabled, which surprised production debugging when one bucket
4646
// configuration tripped a registration error.
47+
//
48+
// Test note: instruments are bound to the global MeterProvider on first
49+
// call and frozen for the process lifetime via sync.Once. Replacing the
50+
// provider with otel.SetMeterProvider after any production code path
51+
// has already triggered getInstruments will NOT rebind the histograms,
52+
// so tests that inspect emitted metrics must install their provider
53+
// before any code under test runs (typically in TestMain or per-test
54+
// setup before the first instrumented call).
4755
func getInstruments() *instruments {
4856
instOnce.Do(func() {
4957
meter := otel.Meter(instrumentationName)

pkg/telemetry/genai/span.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,12 @@ func (s *ChatSpan) End() {
353353
attribute.String(AttrOperationName, OperationChat),
354354
attribute.String(AttrProviderName, s.provider),
355355
}
356+
// `gen_ai.request.model` is required here by the OTel GenAI
357+
// semconv but is unbounded in practice — every dated variant
358+
// (e.g. `model-YYYYMMDD`) opens a new metric series. Operators
359+
// concerned about backend cardinality should drop or canonicalise
360+
// this label at the collector rather than at the agent, so spans
361+
// keep full detail while metrics stay bounded.
356362
if s.model != "" {
357363
commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model))
358364
}

0 commit comments

Comments
 (0)