diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index a3a19aed78..1b4c844c12 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -350,7 +350,7 @@ impl>>, ) -> Result { self.check_agent_info().await; - self.send_trace_chunks_inner(trace_chunks).await + self.send_trace_chunks_inner(trace_chunks, true).await } /// Sends trace chunks via OTLP HTTP (JSON or protobuf) when OTLP config is enabled. @@ -658,9 +666,18 @@ impl( &self, mut traces: Vec>>, + truncate: bool, ) -> Result { let mut header_tags: TracerHeaderTags = self.metadata.borrow().into(); + // Truncate over-long string fields before any downstream processing so that stats, + // serialisation, and the OTLP path all operate on the same normalised payload. + // Skipped on the msgpack path (`send`/`send_async`) where the tracer is responsible + // for enforcing field-length limits before encoding. + if truncate { + libdd_trace_utils::span::trace_utils::truncate_span_strings(&mut traces); + } + // Process stats computation and drop non-sampled (p0) chunks. // This must run before the OTLP path so that unsampled spans are not exported. stats::process_traces_for_stats( @@ -2126,6 +2143,50 @@ mod tests { ); mock_otlp.assert(); } + + // Documents the `truncate=false` path: spans decoded from msgpack via from_slice + // have T::Text = &str, for which truncate_span_strings is a no-op. This proves + // that send/send_async correctly leaves over-long fields unchanged. + // + // Note: there is no integration test that exercises send_trace_chunks_inner with + // truncate=true through the full send_trace_chunks_async call chain. The unit + // tests in trace_utils.rs prove that truncate_span_strings works on BytesData + // spans, and the trace_serializer round-trip tests verify that truncated data + // survives encoding, but neither goes through send_trace_chunks_inner itself. 
+ + use libdd_trace_utils::span::trace_utils::MAX_SPAN_STRING_LEN; + + /// send_async decodes via from_slice (&str spans); truncate_span_strings is a + /// no-op on &str, so over-long fields pass through unchanged (tracer's responsibility). + #[test] + fn test_send_async_does_not_truncate_over_long_fields() { + let over_limit: String = std::iter::repeat_n('b', MAX_SPAN_STRING_LEN + 1).collect(); + let span = SpanBytes { + resource: BytesString::from_string(over_limit), + name: BytesString::from_slice(b"op").unwrap(), + service: BytesString::from_slice(b"svc").unwrap(), + span_id: 1, + trace_id: 1, + start: 1_000_000, + duration: 1_000, + ..Default::default() + }; + let payload = libdd_trace_utils::msgpack_encoder::v04::to_vec(&[vec![span]]); + + // Decode via from_slice — produces SpanSlice<'_> where T::Text = &str. + let (mut traces, _) = + libdd_trace_utils::msgpack_decoder::v04::from_slice(&payload).unwrap(); + + // truncate_span_strings is a no-op for &str spans regardless of the truncate + // flag; calling it here proves the no-op property directly. 
+ libdd_trace_utils::span::trace_utils::truncate_span_strings(&mut traces); + + assert_eq!( + traces[0][0].resource.chars().count(), + MAX_SPAN_STRING_LEN + 1, + "send_async must not truncate — tracer is responsible for field-length limits" + ); + } } #[cfg(test)] diff --git a/libdd-data-pipeline/src/trace_exporter/trace_serializer.rs b/libdd-data-pipeline/src/trace_exporter/trace_serializer.rs index 7cfd62a5a0..9dc1e3fe88 100644 --- a/libdd-data-pipeline/src/trace_exporter/trace_serializer.rs +++ b/libdd-data-pipeline/src/trace_exporter/trace_serializer.rs @@ -460,4 +460,174 @@ mod tests { assert!(!headers.contains_key("datadog-client-computed-stats")); assert!(headers.contains_key("datadog-client-computed-top-level")); } + + // ----------------------------------------------------------------------- + // Truncation end-to-end regression tests + // + // These tests verify that over-long string fields survive the full + // truncate → encode → decode round-trip correctly. They mirror the + // dd-trace-py snapshot tests for + // `test_encode_span_with_large_string_attributes` (ASCII) and + // `test_encode_span_with_large_unicode_string_attributes` (multi-byte). 
+ // ----------------------------------------------------------------------- + + use libdd_trace_utils::span::trace_utils::{ + truncate_span_strings, MAX_SPAN_STRING_LEN, TRUNCATED_SPAN_STRING_LEN, + }; + + const TRUNCATION_SUFFIX: &str = "..."; + + fn long_bytes_string(c: char, n: usize) -> BytesString { + BytesString::from_string(std::iter::repeat_n(c, n).collect()) + } + + /// Build a span whose `resource`, one meta key, and one meta value are + /// each at interesting boundary lengths, matching the dd-trace-py snapshot + /// test fixture: + /// - name: 25 000 'a' chars → exactly at the limit → NOT truncated + /// - resource: 25 001 'b' chars → one over the limit → truncated to 2 500 + /// - meta key: 25 001 'c' chars → truncated to 2 500 + /// - meta value: 2 000 'd' chars → well under limit → unchanged + fn create_large_string_span() -> SpanBytes { + SpanBytes { + name: long_bytes_string('a', MAX_SPAN_STRING_LEN), + resource: long_bytes_string('b', MAX_SPAN_STRING_LEN + 1), + service: BytesString::from_slice(b"svc").unwrap(), + meta: vec![( + long_bytes_string('c', MAX_SPAN_STRING_LEN + 1), + long_bytes_string('d', 2_000), + )] + .into(), + span_id: 1, + trace_id: 1, + start: 1_000_000, + duration: 1_000, + ..Default::default() + } + } + + fn assert_truncation_invariants(span: &libdd_trace_utils::span::v04::SpanBytes) { + // name at exactly the limit — must be unchanged + assert_eq!( + span.name.as_str().chars().count(), + MAX_SPAN_STRING_LEN, + "name should not be truncated" + ); + + // resource one over the limit — must be truncated + assert_eq!( + span.resource.as_str().chars().count(), + TRUNCATED_SPAN_STRING_LEN, + "resource should be truncated to {TRUNCATED_SPAN_STRING_LEN}" + ); + assert!( + span.resource.as_str().ends_with(TRUNCATION_SUFFIX), + "truncated resource must end with the suffix" + ); + + // meta: key was over the limit, value was under + let (k, v) = span.meta.iter().next().expect("meta should be non-empty"); + assert_eq!( + 
k.as_str().chars().count(), + TRUNCATED_SPAN_STRING_LEN, + "meta key should be truncated" + ); + assert!(k.as_str().ends_with(TRUNCATION_SUFFIX)); + assert_eq!( + v.as_str().chars().count(), + 2_000, + "meta value under limit must be unchanged" + ); + } + + #[test] + fn test_truncation_survives_v04_encode_decode_round_trip() { + let serializer = TraceSerializer::new(); + let mut traces = vec![vec![create_large_string_span()]]; + + truncate_span_strings(&mut traces); + + let payload = serializer + .collect_and_process_traces(traces, TraceExporterOutputFormat::V04) + .unwrap(); + let serialized = serializer + .serialize_payload(&payload, &TracerMetadata::default()) + .unwrap(); + + let (decoded, _) = + libdd_trace_utils::msgpack_decoder::v04::from_slice(&serialized).unwrap(); + assert_eq!(decoded.len(), 1); + assert_eq!(decoded[0].len(), 1); + + // Decoded spans use &str (SliceData); re-check lengths via char count. + let span = &decoded[0][0]; + assert_eq!(span.name.chars().count(), MAX_SPAN_STRING_LEN); + assert_eq!(span.resource.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(span.resource.ends_with(TRUNCATION_SUFFIX)); + let (k, v) = span.meta.iter().next().unwrap(); + assert_eq!(k.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert_eq!(v.chars().count(), 2_000); + } + + #[test] + fn test_truncation_survives_v05_encode_decode_round_trip() { + let serializer = TraceSerializer::new(); + let mut traces = vec![vec![create_large_string_span()]]; + + truncate_span_strings(&mut traces); + + // Verify truncation happened in memory before we encode. 
+ assert_truncation_invariants(&traces[0][0]); + + let payload = serializer + .collect_and_process_traces(traces, TraceExporterOutputFormat::V05) + .unwrap(); + let serialized = serializer + .serialize_payload(&payload, &TracerMetadata::default()) + .unwrap(); + + let (decoded, _) = + libdd_trace_utils::msgpack_decoder::v05::from_slice(&serialized).unwrap(); + assert_eq!(decoded.len(), 1); + assert_eq!(decoded[0].len(), 1); + + let span = &decoded[0][0]; + assert_eq!(span.name.chars().count(), MAX_SPAN_STRING_LEN); + assert_eq!(span.resource.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(span.resource.ends_with(TRUNCATION_SUFFIX)); + let (k, v) = span.meta.iter().next().unwrap(); + assert_eq!(k.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert_eq!(v.chars().count(), 2_000); + } + + #[test] + fn test_truncation_unicode_survives_v04_encode_decode_round_trip() { + // Each '€' is 3 bytes; 25 001 euros → truncated to 2 500 code points. + let serializer = TraceSerializer::new(); + let mut traces = vec![vec![SpanBytes { + name: long_bytes_string('€', MAX_SPAN_STRING_LEN + 1), + resource: BytesString::from_slice(b"r").unwrap(), + service: BytesString::from_slice(b"svc").unwrap(), + span_id: 1, + trace_id: 1, + start: 1_000_000, + duration: 1_000, + ..Default::default() + }]]; + + truncate_span_strings(&mut traces); + + let payload = serializer + .collect_and_process_traces(traces, TraceExporterOutputFormat::V04) + .unwrap(); + let serialized = serializer + .serialize_payload(&payload, &TracerMetadata::default()) + .unwrap(); + + let (decoded, _) = + libdd_trace_utils::msgpack_decoder::v04::from_slice(&serialized).unwrap(); + let name = decoded[0][0].name; + assert_eq!(name.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(name.ends_with(TRUNCATION_SUFFIX)); + } } diff --git a/libdd-trace-utils/src/span/mod.rs b/libdd-trace-utils/src/span/mod.rs index 1a122efe99..d1ba0fd88e 100644 --- a/libdd-trace-utils/src/span/mod.rs +++ 
b/libdd-trace-utils/src/span/mod.rs @@ -24,18 +24,69 @@ use std::{fmt, ptr}; /// from a static str and check if the string is empty. pub trait SpanText: Debug + Eq + Hash + Borrow + Serialize + Default { fn from_static_str(value: &'static str) -> Self; + + /// If `self` exceeds `max_chars` Unicode code points, return a new value consisting of the + /// first `result_chars - suffix.chars().count()` code points followed by `suffix`; otherwise + /// return `self` unchanged. + /// + /// Implementations that cannot allocate (e.g. `&str`) return `self` unmodified. + fn maybe_truncate(self, max_chars: usize, result_chars: usize, suffix: &str) -> Self { + // Default: no allocation possible, so return unchanged. + // Implementations that own their storage (e.g. `BytesString`) should override this. + let _ = (max_chars, result_chars, suffix); + self + } } impl SpanText for &str { fn from_static_str(value: &'static str) -> Self { value } + // maybe_truncate uses the default (no-op): &str is borrowed and cannot allocate. + // The only path that produces &str spans is the zero-copy msgpack decoder + // (SpanSlice / SliceData), whose callers enforce length limits upstream. } impl SpanText for BytesString { fn from_static_str(value: &'static str) -> Self { BytesString::from_static(value) } + + fn maybe_truncate(self, max_chars: usize, result_chars: usize, suffix: &str) -> Self { + let s = self.as_str(); + // Fast path: UTF-8 byte length >= char count, so byte length within limit ⇒ chars fit. + if s.len() <= max_chars { + return self; + } + // Single pass: find the byte offset of char `keep_chars` and count total chars together, + // avoiding a separate O(n) `chars().count()` scan followed by another `char_indices()` + // walk. 
+ let suffix_chars = suffix.chars().count(); + debug_assert!( + result_chars >= suffix_chars, + "result_chars ({result_chars}) must be >= suffix length ({suffix_chars})" + ); + let keep_chars = result_chars.saturating_sub(suffix_chars); + let mut keep_byte_end = None; + let mut total_chars = 0usize; + for (byte_pos, _) in s.char_indices() { + if total_chars == keep_chars { + keep_byte_end = Some(byte_pos); + } + total_chars += 1; + if total_chars > max_chars { + break; + } + } + if total_chars <= max_chars { + return self; + } + let end = keep_byte_end.unwrap_or(s.len()); + let mut truncated = String::with_capacity(end + suffix.len()); + truncated.push_str(&s[..end]); + truncated.push_str(suffix); + BytesString::from_string(truncated) + } } pub trait SpanBytes: Debug + Eq + Hash + Borrow<[u8]> + Serialize + Default { diff --git a/libdd-trace-utils/src/span/trace_utils.rs b/libdd-trace-utils/src/span/trace_utils.rs index 60790aa3cb..56908979a8 100644 --- a/libdd-trace-utils/src/span/trace_utils.rs +++ b/libdd-trace-utils/src/span/trace_utils.rs @@ -5,9 +5,120 @@ use tracing::debug; -use super::{v04::Span, SpanText, TraceData}; +use super::{ + v04::{AttributeAnyValue, AttributeArrayValue, Span}, + SpanText, TraceData, +}; use std::collections::{HashMap, HashSet}; +/// Fields whose Unicode code-point count exceeds this threshold are truncated. +pub const MAX_SPAN_STRING_LEN: usize = 25_000; +/// Length (in Unicode code points) to which over-long fields are truncated, including the suffix. +pub const TRUNCATED_SPAN_STRING_LEN: usize = 2_500; +/// Suffix appended to every truncated field. +const TRUNCATION_SUFFIX: &str = "..."; + +/// Truncate all text fields in every span across all trace chunks. +/// +/// Any field whose Unicode code-point count exceeds [`MAX_SPAN_STRING_LEN`] is replaced with +/// a prefix of itself followed by `TRUNCATION_SUFFIX`; the prefix is sized so that the result has +/// exactly [`TRUNCATED_SPAN_STRING_LEN`] code points, suffix included. 
Numeric fields and +/// `meta_struct` bytes are left untouched. +pub fn truncate_span_strings(traces: &mut [Vec>]) { + for chunk in traces.iter_mut() { + for span in chunk.iter_mut() { + truncate_span(span); + } + } +} + +fn trunc(v: S) -> S { + v.maybe_truncate( + MAX_SPAN_STRING_LEN, + TRUNCATED_SPAN_STRING_LEN, + TRUNCATION_SUFFIX, + ) +} + +fn trunc_in_place(field: &mut S) { + *field = trunc(std::mem::take(field)); +} + +fn truncate_attribute_value(v: AttributeAnyValue) -> AttributeAnyValue { + match v { + AttributeAnyValue::SingleValue(AttributeArrayValue::String(s)) => { + AttributeAnyValue::SingleValue(AttributeArrayValue::String(trunc(s))) + } + AttributeAnyValue::Array(vec) => AttributeAnyValue::Array( + vec.into_iter() + .map(|item| match item { + AttributeArrayValue::String(s) => AttributeArrayValue::String(trunc(s)), + other => other, + }) + .collect(), + ), + other => other, + } +} + +fn truncate_span(span: &mut Span) { + trunc_in_place(&mut span.service); + trunc_in_place(&mut span.name); + trunc_in_place(&mut span.resource); + trunc_in_place(&mut span.r#type); + + // If truncation makes two keys identical, the downstream span.dedup() call keeps the + // last original entry (VecMap dedup semantics). This mirrors the backend's own behavior + // when a tracer submits a span with duplicate keys. + for (key, value) in span.meta.iter_mut() { + trunc_in_place(key); + trunc_in_place(value); + } + + for (key, _value) in span.metrics.iter_mut() { + trunc_in_place(key); + } + + for (key, _value) in span.meta_struct.iter_mut() { + trunc_in_place(key); + } + + if !span.span_links.is_empty() { + span.span_links = std::mem::take(&mut span.span_links) + .into_iter() + .map(|mut link| { + trunc_in_place(&mut link.tracestate); + // Use the entry API so that if truncation maps two originally-distinct keys to the + // same string, only the first one visited survives (HashMap iteration order is + // unspecified) and the later value is dropped without being truncated. 
+ let mut new_attrs = HashMap::with_capacity(link.attributes.len()); + for (k, v) in std::mem::take(&mut link.attributes) { + new_attrs.entry(trunc(k)).or_insert_with(|| trunc(v)); + } + link.attributes = new_attrs; + link + }) + .collect(); + } + + if !span.span_events.is_empty() { + span.span_events = std::mem::take(&mut span.span_events) + .into_iter() + .map(|mut event| { + trunc_in_place(&mut event.name); + let mut new_attrs = HashMap::with_capacity(event.attributes.len()); + for (k, v) in std::mem::take(&mut event.attributes) { + new_attrs + .entry(trunc(k)) + .or_insert_with(|| truncate_attribute_value(v)); + } + event.attributes = new_attrs; + event + }) + .collect(); + } +} + /// Span metric the mini agent must set for the backend to recognize top level span const TOP_LEVEL_KEY: &str = "_top_level"; /// Span metric the tracer sets to denote a top level span @@ -205,7 +316,10 @@ where #[cfg(test)] mod tests { use super::*; - use crate::span::v04::{SpanBytes, VecMap}; + use crate::span::v04::{ + AttributeAnyValue, AttributeArrayValue, SpanBytes, SpanEvent, SpanLink, VecMap, + }; + use std::collections::HashMap; fn create_test_span( trace_id: u64, @@ -437,4 +551,167 @@ mod tests { } } } + + // ----------------------------------------------------------------------- + // truncate_span_strings tests + // ----------------------------------------------------------------------- + + fn long_str(c: char, n: usize) -> String { + std::iter::repeat_n(c, n).collect() + } + + fn bs(s: &str) -> libdd_tinybytes::BytesString { + libdd_tinybytes::BytesString::from_string(s.to_string()) + } + + fn make_span(name: &str, resource: &str, meta_key: &str, meta_val: &str) -> SpanBytes { + SpanBytes { + name: bs(name), + resource: bs(resource), + meta: vec![(bs(meta_key), bs(meta_val))].into(), + ..Default::default() + } + } + + #[test] + fn test_no_truncation_at_limit() { + // Exactly 25_000 chars — should NOT be truncated. 
+ let name = long_str('a', MAX_SPAN_STRING_LEN); + let mut traces = vec![vec![make_span(&name, "r", "k", "v")]]; + truncate_span_strings(&mut traces); + assert_eq!( + traces[0][0].name.as_str().chars().count(), + MAX_SPAN_STRING_LEN + ); + } + + #[test] + fn test_truncation_over_limit() { + // 25_001 chars — should be truncated to 2_500. + let resource = long_str('b', MAX_SPAN_STRING_LEN + 1); + let mut traces = vec![vec![make_span("n", &resource, "k", "v")]]; + truncate_span_strings(&mut traces); + let result = traces[0][0].resource.as_str(); + assert_eq!(result.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(result.ends_with(TRUNCATION_SUFFIX)); + } + + #[test] + fn test_meta_key_and_value_truncated() { + let long_key = long_str('c', MAX_SPAN_STRING_LEN + 1); + let short_val = long_str('d', 2_000); // under limit — unchanged + let mut traces = vec![vec![make_span("n", "r", &long_key, &short_val)]]; + truncate_span_strings(&mut traces); + let (k, v) = traces[0][0].meta.iter().next().unwrap(); + assert_eq!(k.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(k.as_str().ends_with(TRUNCATION_SUFFIX)); + assert_eq!(v.as_str().chars().count(), 2_000); // unchanged + } + + #[test] + fn test_unicode_truncation_by_code_points() { + // Each '€' is 3 bytes; 25_001 euros exceed the threshold. + let s = long_str('€', MAX_SPAN_STRING_LEN + 1); + let mut traces = vec![vec![make_span(&s, "r", "k", "v")]]; + truncate_span_strings(&mut traces); + let result = traces[0][0].name.as_str(); + // Result must be exactly TRUNCATED_SPAN_STRING_LEN code points. 
+ assert_eq!(result.chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(result.ends_with(TRUNCATION_SUFFIX)); + } + + #[test] + fn test_span_link_fields_truncated() { + let long_tracestate = long_str('x', MAX_SPAN_STRING_LEN + 1); + let long_attr_key = long_str('y', MAX_SPAN_STRING_LEN + 1); + let long_attr_val = long_str('z', MAX_SPAN_STRING_LEN + 1); + let mut traces = vec![vec![SpanBytes { + span_links: vec![SpanLink { + tracestate: long_tracestate.into(), + attributes: HashMap::from([(long_attr_key.into(), long_attr_val.into())]), + ..Default::default() + }], + ..Default::default() + }]]; + truncate_span_strings(&mut traces); + let link = &traces[0][0].span_links[0]; + assert_eq!( + link.tracestate.as_str().chars().count(), + TRUNCATED_SPAN_STRING_LEN + ); + let (k, v) = link.attributes.iter().next().unwrap(); + assert_eq!(k.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert_eq!(v.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + } + + #[test] + fn test_span_event_name_and_string_attribute_truncated() { + let long_name = long_str('e', MAX_SPAN_STRING_LEN + 1); + let long_str_attr = long_str('f', MAX_SPAN_STRING_LEN + 1); + let mut traces = vec![vec![SpanBytes { + span_events: vec![SpanEvent { + name: long_name.into(), + attributes: HashMap::from([ + ( + "str_attr".into(), + AttributeAnyValue::SingleValue(AttributeArrayValue::String( + long_str_attr.into(), + )), + ), + ( + "int_attr".into(), + AttributeAnyValue::SingleValue(AttributeArrayValue::Integer(42)), + ), + ]), + ..Default::default() + }], + ..Default::default() + }]]; + truncate_span_strings(&mut traces); + let event = &traces[0][0].span_events[0]; + assert_eq!( + event.name.as_str().chars().count(), + TRUNCATED_SPAN_STRING_LEN + ); + match event.attributes.get("str_attr").unwrap() { + AttributeAnyValue::SingleValue(AttributeArrayValue::String(s)) => { + assert_eq!(s.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + } + _ => panic!("expected string attribute"), + } + // 
Integer attribute untouched + match event.attributes.get("int_attr").unwrap() { + AttributeAnyValue::SingleValue(AttributeArrayValue::Integer(42)) => {} + _ => panic!("expected integer attribute"), + } + } + + #[test] + fn test_metric_key_truncated() { + let long_key = long_str('g', MAX_SPAN_STRING_LEN + 1); + let mut traces = vec![vec![SpanBytes { + metrics: vec![(bs(&long_key), 1.0_f64)].into(), + ..Default::default() + }]]; + truncate_span_strings(&mut traces); + let (k, v) = traces[0][0].metrics.iter().next().unwrap(); + assert_eq!(k.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert!(k.as_str().ends_with(TRUNCATION_SUFFIX)); + assert_eq!(*v, 1.0_f64); + } + + #[test] + fn test_meta_struct_key_truncated() { + use libdd_tinybytes::Bytes; + let long_key = long_str('h', MAX_SPAN_STRING_LEN + 1); + let payload = Bytes::from_static(b"some bytes"); + let mut traces = vec![vec![SpanBytes { + meta_struct: vec![(bs(&long_key), payload)].into(), + ..Default::default() + }]]; + truncate_span_strings(&mut traces); + let (k, v) = traces[0][0].meta_struct.iter().next().unwrap(); + assert_eq!(k.as_str().chars().count(), TRUNCATED_SPAN_STRING_LEN); + assert_eq!(v.as_ref(), b"some bytes"); // value unchanged + } }