Skip to content

Commit c996d5a

Browse files
committed
Fix ZstandardStream truncating multi-frame zstd input
ZSTD_decompressStream returns 0 at the end of each frame, not the end of the stream, but ZstandardDecoder.Decompress treated that as end-of-stream via a permanent _finished latch. A zstd stream made of multiple concatenated frames (valid per RFC 8878) -- e.g. large HTTP Content-Encoding: zstd responses -- was therefore silently truncated to its first frame, surfacing downstream as truncated payloads (e.g. JSON parse errors at exactly 65536 bytes). ZstandardStream now continues decoding subsequent frames on the same native context (mirroring GZipStream multi-member handling), distinguishes a following frame from trailing data via the frame magic number, and only treats end-of-input as a clean end when at a frame boundary. Adds regression tests for concatenated frames (buffered, split across reads, and with trailing data). Fixes #129038.
1 parent ecaa1c5 commit c996d5a

3 files changed

Lines changed: 293 additions & 30 deletions

File tree

src/libraries/System.IO.Compression/src/System/IO/Compression/Zstandard/ZstandardDecoder.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,22 @@ public void Reset()
323323
_finished = false;
324324
}
325325

326+
/// <summary>
327+
/// Clears the end-of-frame state so the decoder can continue decoding the next frame
328+
/// in a stream that contains multiple concatenated Zstandard frames (valid per RFC 8878 §3).
329+
/// </summary>
330+
/// <remarks>
331+
/// <see cref="Interop.Zstd.ZSTD_decompressStream"/> returns 0 at the end of each frame, not at the
332+
/// end of the stream, and the native context is automatically ready to begin the next frame on the
333+
/// following call. Only the managed end-of-frame latch is cleared here; the native context is left
334+
/// intact so window-size and dictionary settings carry over to the next frame. Must only be called
335+
/// after <see cref="Decompress"/> reported <see cref="OperationStatus.Done"/>.
336+
/// </remarks>
337+
internal void PrepareForNextFrame()
338+
{
339+
_finished = false;
340+
}
341+
326342
/// <summary>References a prefix for the next decompression operation.</summary>
327343
/// <remarks>The prefix will be used only for the next decompression frame and will be removed when <see cref="Reset"/> is called. The referenced data must remain valid and unmodified for the duration of the decompression operation.</remarks>
328344
/// <exception cref="ObjectDisposedException">The decoder has been disposed.</exception>

src/libraries/System.IO.Compression/src/System/IO/Compression/Zstandard/ZstandardStream.Decompress.cs

Lines changed: 126 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System.Buffers;
5+
using System.Buffers.Binary;
56
using System.Diagnostics;
67
using System.Diagnostics.CodeAnalysis;
78
using System.Threading;
@@ -14,6 +15,15 @@ public sealed partial class ZstandardStream
1415
private ZstandardDecoder? _decoder;
1516
private bool _nonEmptyInput;
1617

18+
// Set when the decoder reports the end of a frame (OperationStatus.Done). A zstd stream may
19+
// contain multiple frames concatenated back-to-back (RFC 8878 §3), so reaching the end of a
20+
// frame is not necessarily the end of the stream. While at a frame boundary, a subsequent
21+
// end-of-input from the underlying stream is a clean end rather than truncated data.
22+
private bool _atFrameBoundary;
23+
24+
// Length of a Zstandard frame magic number, in bytes.
25+
private const int ZstdFrameMagicLength = sizeof(uint);
26+
1727
/// <summary>Initializes a new instance of the <see cref="ZstandardStream" /> class by using the specified stream and decoder instance.</summary>
1828
/// <param name="stream">The stream from which data to decompress is read.</param>
1929
/// <param name="decoder">The decoder instance to use for decompression. The stream will not dispose this decoder; instead, it will reset it when the stream is disposed.</param>
@@ -36,44 +46,124 @@ private bool TryDecompress(Span<byte> destination, out int bytesWritten, out Ope
3646
{
3747
Debug.Assert(_decoder != null);
3848

39-
// Decompress any data we may have in our buffer.
40-
lastResult = _decoder.Decompress(_buffer.ActiveSpan, destination, out int bytesConsumed, out bytesWritten);
41-
_buffer.Discard(bytesConsumed);
49+
bytesWritten = 0;
50+
lastResult = OperationStatus.Done;
4251

43-
if (lastResult == OperationStatus.InvalidData)
52+
while (true)
4453
{
45-
throw new InvalidDataException(SR.ZstandardStream_Decompress_InvalidData);
46-
}
54+
// Decompress any data we may have in our buffer into the remaining destination.
55+
OperationStatus status = _decoder.Decompress(_buffer.ActiveSpan, destination, out int bytesConsumed, out int written);
56+
_buffer.Discard(bytesConsumed);
57+
bytesWritten += written;
58+
destination = destination.Slice(written);
59+
lastResult = status;
60+
61+
if (status == OperationStatus.InvalidData)
62+
{
63+
throw new InvalidDataException(SR.ZstandardStream_Decompress_InvalidData);
64+
}
4765

48-
// If we successfully decompressed any bytes, or if we've reached the end of the decompression, we're done.
49-
if (bytesWritten != 0 || lastResult == OperationStatus.Done)
50-
{
51-
return true;
52-
}
66+
if (status == OperationStatus.Done)
67+
{
68+
// The decoder finished a frame. A zstd stream may be a sequence of frames
69+
// concatenated back-to-back (RFC 8878 §3) — produced by many encoders/CDNs that
70+
// flush a frame per buffer — so the end of a frame is not necessarily the end of
71+
// the stream. We're now at a frame boundary; end-of-input here is a clean end.
72+
_atFrameBoundary = true;
73+
74+
// If the next frame is already buffered, keep decoding it on the same native
75+
// context (no reset needed: ZSTD_decompressStream automatically begins the next
76+
// frame on the following call) into whatever destination space remains.
77+
if (_buffer.ActiveLength >= ZstdFrameMagicLength && StartsWithZstdFrame(_buffer.ActiveSpan))
78+
{
79+
_decoder.PrepareForNextFrame();
80+
_atFrameBoundary = false;
81+
82+
if (destination.IsEmpty)
83+
{
84+
// No room left to decode the next frame in this call. Hand back what we
85+
// have; the stream is not finished, so this must not be reported as Done
86+
// (which would trigger end-of-stream handling such as rewinding a seekable
87+
// base stream).
88+
lastResult = OperationStatus.DestinationTooSmall;
89+
return true;
90+
}
91+
92+
continue;
93+
}
5394

54-
if (destination.IsEmpty)
55-
{
56-
// The caller provided a zero-byte buffer. This is typically done in order to avoid allocating/renting
57-
// a buffer until data is known to be available. We don't have perfect knowledge here, as _decoder.Decompress
58-
// will return DestinationTooSmall whether or not more data is required. As such, we assume that if there's
59-
// any data in our input buffer, it would have been decompressible into at least one byte of output, and
60-
// otherwise we need to do a read on the underlying stream. This isn't perfect, because having input data
61-
// doesn't necessarily mean it'll decompress into at least one byte of output, but it's a reasonable approximation
62-
// for the 99% case. If it's wrong, it just means that a caller using zero-byte reads as a way to delay
63-
// getting a buffer to use for a subsequent call may end up getting one earlier than otherwise preferred.
64-
Debug.Assert(lastResult == OperationStatus.DestinationTooSmall);
65-
if (_buffer.ActiveLength != 0)
95+
// Enough leftover input to rule out another frame: this is trailing data after the
96+
// final frame. The stream is complete; leave the trailing bytes untouched (a seekable
97+
// base stream is rewound to the end of the compressed data by the caller), mirroring
98+
// how DeflateStream handles data after the last gzip member.
99+
if (_buffer.ActiveLength >= ZstdFrameMagicLength)
100+
{
101+
lastResult = OperationStatus.Done;
102+
return true;
103+
}
104+
105+
// Fewer than ZstdFrameMagicLength bytes remain: not enough to tell whether another
106+
// frame follows (its magic number may be split across reads) or this was the last
107+
// frame. Hand back any output now and resolve on the next call / underlying read.
108+
// Because we're at a frame boundary, end-of-input is treated as a clean end rather
109+
// than truncation (see _atFrameBoundary checks in Read/ReadAsync).
110+
lastResult = OperationStatus.NeedMoreData;
111+
return bytesWritten != 0;
112+
}
113+
114+
// If we successfully decompressed any bytes, we're done for this call.
115+
if (bytesWritten != 0)
66116
{
67-
Debug.Assert(bytesWritten == 0);
117+
_atFrameBoundary = false;
68118
return true;
69119
}
120+
121+
if (destination.IsEmpty)
122+
{
123+
// The caller provided a zero-byte buffer. This is typically done in order to avoid allocating/renting
124+
// a buffer until data is known to be available. We don't have perfect knowledge here, as _decoder.Decompress
125+
// will return DestinationTooSmall whether or not more data is required. As such, we assume that if there's
126+
// any data in our input buffer, it would have been decompressible into at least one byte of output, and
127+
// otherwise we need to do a read on the underlying stream. This isn't perfect, because having input data
128+
// doesn't necessarily mean it'll decompress into at least one byte of output, but it's a reasonable approximation
129+
// for the 99% case. If it's wrong, it just means that a caller using zero-byte reads as a way to delay
130+
// getting a buffer to use for a subsequent call may end up getting one earlier than otherwise preferred.
131+
Debug.Assert(status == OperationStatus.DestinationTooSmall);
132+
if (_buffer.ActiveLength != 0)
133+
{
134+
Debug.Assert(bytesWritten == 0);
135+
return true;
136+
}
137+
}
138+
139+
Debug.Assert(
140+
status == OperationStatus.NeedMoreData ||
141+
(status == OperationStatus.DestinationTooSmall && destination.IsEmpty && _buffer.ActiveLength == 0), $"{nameof(status)} == {status}, {nameof(destination.Length)} == {destination.Length}");
142+
143+
_atFrameBoundary = false;
144+
return false;
145+
}
146+
}
147+
148+
/// <summary>
149+
/// Returns whether <paramref name="data"/> begins with a Zstandard frame magic number — a standard
150+
/// frame (0xFD2FB528) or a skippable frame (0x184D2A50–0x184D2A5F) — which indicates that another
151+
/// concatenated frame follows. Used to distinguish a subsequent frame from trailing data after the
152+
/// final frame. Requires at least <see cref="ZstdFrameMagicLength"/> bytes.
153+
/// </summary>
154+
private static bool StartsWithZstdFrame(ReadOnlySpan<byte> data)
155+
{
156+
if (data.Length < ZstdFrameMagicLength)
157+
{
158+
return false;
70159
}
71160

72-
Debug.Assert(
73-
lastResult == OperationStatus.NeedMoreData ||
74-
(lastResult == OperationStatus.DestinationTooSmall && destination.IsEmpty && _buffer.ActiveLength == 0), $"{nameof(lastResult)} == {lastResult}, {nameof(destination.Length)} == {destination.Length}");
161+
const uint ZstdFrameMagic = 0xFD2FB528;
162+
const uint SkippableFrameMagicMin = 0x184D2A50;
163+
const uint SkippableFrameMagicMax = 0x184D2A5F;
75164

76-
return false;
165+
uint magic = BinaryPrimitives.ReadUInt32LittleEndian(data);
166+
return magic == ZstdFrameMagic || (magic >= SkippableFrameMagicMin && magic <= SkippableFrameMagicMax);
77167
}
78168

79169
/// <summary>Reads decompressed bytes from the underlying stream and places them in the specified array.</summary>
@@ -122,7 +212,10 @@ public override int Read(Span<byte> buffer)
122212
int bytesRead = _stream.Read(_buffer.AvailableSpan);
123213
if (bytesRead <= 0)
124214
{
125-
if (_nonEmptyInput && !buffer.IsEmpty)
215+
// Only treat end-of-input as truncation if we're in the middle of a frame.
216+
// If we're at a frame boundary (_atFrameBoundary), the stream ended cleanly
217+
// after the last of one or more concatenated frames.
218+
if (_nonEmptyInput && !buffer.IsEmpty && !_atFrameBoundary)
126219
ThrowTruncatedInvalidData();
127220
break;
128221
}
@@ -199,7 +292,10 @@ public override async ValueTask<int> ReadAsync(Memory<byte> buffer, Cancellation
199292
int bytesRead = await _stream.ReadAsync(_buffer.AvailableMemory, cancellationToken).ConfigureAwait(false);
200293
if (bytesRead <= 0)
201294
{
202-
if (_nonEmptyInput && !buffer.IsEmpty)
295+
// Only treat end-of-input as truncation if we're in the middle of a frame.
296+
// If we're at a frame boundary (_atFrameBoundary), the stream ended cleanly
297+
// after the last of one or more concatenated frames.
298+
if (_nonEmptyInput && !buffer.IsEmpty && !_atFrameBoundary)
203299
ThrowTruncatedInvalidData();
204300
break;
205301
}

0 commit comments

Comments
 (0)