From 8deed257700d79d3a83a9a827d38774eb1ebfabf Mon Sep 17 00:00:00 2001 From: Mark Molinaro Date: Wed, 24 Jun 2026 18:16:01 +0000 Subject: [PATCH 1/2] THRIFT-6069: python: avoid utf8 copy during fastbinary string encode Client: py Use PyUnicode_AsUTF8AndSize when available, with a fallback for older Python 3 releases, so unicode strings can be encoded without allocating an intermediate PyBytes object. The const-correct buffer is forwarded through a new writeString(const char*, int32_t) overload on each protocol so length prefixing uses the protocol-native encoding (int32 for binary, varint for compact). The internal writeBuffer helper was made const-correct so the zero-copy path does not need to const_cast CPython-managed string storage. Add a fastbinary unicode round-trip test for TApplicationException to cover the new encode path, and a TSerializer regression test that asserts pure-Python and accelerated CompactProtocol produce identical bytes for unicode payloads. Performance (50k iterations, warmed): | Workload | Baseline | This commit | Speedup | |-------------------------------|----------|-------------|---------| | encode simple (30B, 1 string) | 0.60 us | 0.55 us | 1.09x | | encode 10-string (182B) | 1.44 us | 1.01 us | 1.43x | | encode complex (395B) | 3.02 us | 2.56 us | 1.18x | The more string fields a struct has, the larger the gain. Decode is unchanged. Co-Authored-By: Claude Sonnet 4.6 --- lib/py/src/ext/binary.h | 7 ++++- lib/py/src/ext/compact.h | 7 ++++- lib/py/src/ext/protocol.h | 2 +- lib/py/src/ext/protocol.tcc | 20 +++++++++++--- lib/py/test/thrift_TBinaryProtocol.py | 38 +++++++++++++++++++++++++++ lib/py/test/thrift_TSerializer.py | 11 ++++++++ 6 files changed, 79 insertions(+), 6 deletions(-) diff --git a/lib/py/src/ext/binary.h b/lib/py/src/ext/binary.h index dd7750b49a8..906f1981290 100644 --- a/lib/py/src/ext/binary.h +++ b/lib/py/src/ext/binary.h @@ -67,6 +67,11 @@ class BinaryProtocol : public ProtocolBase { writeBuffer(PyBytes_AS_STRING(value), len); } + void writeString(const char* value, int32_t len) { + writeI32(len); + writeBuffer(value, len); + } + bool writeListBegin(PyObject* value, const SetListTypeArgs& parsedargs, int32_t len) { writeByte(parsedargs.element_type); writeI32(len); @@ -88,7 +93,7 @@ class BinaryProtocol : public ProtocolBase { return encodeValue(value, parsedspec.type, parsedspec.typeargs); } - void writeUuid(char* value) { + void writeUuid(const char* value) { writeBuffer(value, 16); } diff --git a/lib/py/src/ext/compact.h b/lib/py/src/ext/compact.h index 0d8946b3441..7874d79c165 100644 --- a/lib/py/src/ext/compact.h +++ b/lib/py/src/ext/compact.h @@ -61,6 +61,11 @@ class CompactProtocol : public ProtocolBase { writeBuffer(PyBytes_AS_STRING(value), len); } + void writeString(const char* value, int32_t len) { + writeVarint(len); + writeBuffer(value, len); + } + bool writeListBegin(PyObject* value, const SetListTypeArgs& args, int32_t len) { int ctype = toCompactType(args.element_type); if (len <= 14) { @@ -104,7 +109,7 @@ class CompactProtocol : public ProtocolBase { void writeFieldStop() { writeByte(0); } - void writeUuid(char* value) { + void writeUuid(const char* value) { writeBuffer(value, 16); } diff --git a/lib/py/src/ext/protocol.h b/lib/py/src/ext/protocol.h index 20911c89724..2e737239526 100644 --- a/lib/py/src/ext/protocol.h +++ b/lib/py/src/ext/protocol.h @@ -71,7 +71,7 @@ class ProtocolBase { return true; } - bool writeBuffer(char* data, size_t len); + bool writeBuffer(const char* data, size_t len); void writeByte(uint8_t val) { writeBuffer(reinterpret_cast(&val), 1); } diff --git a/lib/py/src/ext/protocol.tcc b/lib/py/src/ext/protocol.tcc index 448fc6f105e..aab53e383bd 100644 --- a/lib/py/src/ext/protocol.tcc +++ b/lib/py/src/ext/protocol.tcc @@ -89,7 +89,7 @@ PyObject* ProtocolBase::getEncodedValue() { } template -inline bool ProtocolBase::writeBuffer(char* data, size_t size) { +inline bool ProtocolBase::writeBuffer(const char* data, size_t size) { if (!PycStringIO) { PycString_IMPORT; } @@ -169,7 +169,7 @@ PyObject* ProtocolBase::getEncodedValue() { } template -inline bool ProtocolBase::writeBuffer(char* data, size_t size) { +inline bool ProtocolBase::writeBuffer(const char* data, size_t size) { size_t need = size + output_->pos; if (output_->buf.capacity() < need) { try { @@ -456,18 +456,32 @@ bool ProtocolBase::encodeValue(PyObject* value, TType type, PyObject* type case T_STRING: { ScopedPyObject nval; + Py_ssize_t len; if (PyUnicode_Check(value)) { +#if PY_VERSION_HEX >= 0x03030000 + const char* str = PyUnicode_AsUTF8AndSize(value, &len); + if (!str) { + return false; + } + if (!detail::check_ssize_t_32(len)) { + return false; + } + + impl()->writeString(str, static_cast(len)); + return true; +#else nval.reset(PyUnicode_AsUTF8String(value)); if (!nval) { return false; } +#endif } else { Py_INCREF(value); nval.reset(value); } - Py_ssize_t len = PyBytes_Size(nval.get()); + len = PyBytes_Size(nval.get()); if (!detail::check_ssize_t_32(len)) { return false; } diff --git a/lib/py/test/thrift_TBinaryProtocol.py b/lib/py/test/thrift_TBinaryProtocol.py index d4269eb6175..14e6115f997 100644 --- a/lib/py/test/thrift_TBinaryProtocol.py +++ b/lib/py/test/thrift_TBinaryProtocol.py @@ -22,7 +22,9 @@ import uuid import _import_local_thrift # noqa +from thrift.Thrift import TApplicationException from thrift.protocol.TBinaryProtocol import TBinaryProtocol +from thrift.protocol.TBinaryProtocol import TBinaryProtocolAcceleratedFactory from thrift.protocol.TProtocol import TProtocolException from thrift.transport import TTransport @@ -167,6 +169,13 @@ def testField(type, data): protocol.readStructEnd() +APPLICATION_EXCEPTION_THRIFT_SPEC = ( + None, + (1, 11, "message", "UTF8", None), + (2, 8, "type", None, None), +) + + def testMessage(data, strict=True): message = {} message['name'] = data[0] @@ -196,6 +205,13 @@ def testMessage(data, strict=True): class TestTBinaryProtocol(unittest.TestCase): + def setUp(self): + try: + from thrift.protocol import fastbinary # noqa: F401 + self._has_fastbinary = True + except ImportError: + self._has_fastbinary = False + def test_TBinaryProtocol_write_read(self): try: testNaked('Byte', 123) @@ -280,6 +296,28 @@ def test_TBinaryProtocol_write_read(self): print("Assertion fail") raise e + def test_accelerated_utf8_roundtrip_on_application_exception(self): + if not self._has_fastbinary: + self.skipTest("C extension not available") + + original = TApplicationException( + type=TApplicationException.PROTOCOL_ERROR, + message=("snowman-\u2603-rocket-\U0001F680-" * 32), + ) + + typeargs = [TApplicationException, APPLICATION_EXCEPTION_THRIFT_SPEC] + + otrans = TTransport.TMemoryBuffer() + oproto = TBinaryProtocolAcceleratedFactory(fallback=False).getProtocol(otrans) + oproto.trans.write(oproto._fast_encode(original, typeargs)) + + itrans = TTransport.TMemoryBuffer(otrans.getvalue()) + iproto = TBinaryProtocolAcceleratedFactory(fallback=False).getProtocol(itrans) + decoded = iproto._fast_decode(None, iproto, typeargs) + + self.assertEqual(decoded.message, original.message) + self.assertEqual(decoded.type, original.type) + def test_TBinaryProtocol_no_strict_write_read(self): TMessageType = {"T_CALL": 1, "T_REPLY": 2, "T_EXCEPTION": 3, "T_ONEWAY": 4} test_data = [("short message name", TMessageType['T_CALL'], 0), diff --git a/lib/py/test/thrift_TSerializer.py b/lib/py/test/thrift_TSerializer.py index 396e68d9d02..8e62e00cc5d 100644 --- a/lib/py/test/thrift_TSerializer.py +++ b/lib/py/test/thrift_TSerializer.py @@ -70,6 +70,17 @@ def test_TCompactProtocolAccelerated(self): factory = TCompactProtocolAcceleratedFactory() self.verify(self.compact_serialized, factory) + def test_TCompactProtocolAccelerated_unicode_matches_python(self): + message = Message("é", 42) + expected = serialize(message, TCompactProtocolFactory()) + actual = serialize(message, TCompactProtocolAcceleratedFactory()) + + self.assertEqual(expected, actual) + self.assertEqual( + message.body, + deserialize(Message(), actual, TCompactProtocolAcceleratedFactory()).body, + ) + if __name__ == "__main__": unittest.main() From d3634758dc4bc7dedf317b6c284b6cc388e688dd Mon Sep 17 00:00:00 2001 From: Mark Molinaro <16494982+markjm@users.noreply.github.com> Date: Wed, 24 Jun 2026 11:52:10 -0700 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- lib/py/test/thrift_TBinaryProtocol.py | 2 +- lib/py/test/thrift_TSerializer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/py/test/thrift_TBinaryProtocol.py b/lib/py/test/thrift_TBinaryProtocol.py index 14e6115f997..dbd521b8785 100644 --- a/lib/py/test/thrift_TBinaryProtocol.py +++ b/lib/py/test/thrift_TBinaryProtocol.py @@ -302,7 +302,7 @@ def test_accelerated_utf8_roundtrip_on_application_exception(self): original = TApplicationException( type=TApplicationException.PROTOCOL_ERROR, - message=("snowman-\u2603-rocket-\U0001F680-" * 32), + message=(u"snowman-\u2603-rocket-\U0001F680-" * 32), ) typeargs = [TApplicationException, APPLICATION_EXCEPTION_THRIFT_SPEC] diff --git a/lib/py/test/thrift_TSerializer.py b/lib/py/test/thrift_TSerializer.py index 8e62e00cc5d..1a626fdca5f 100644 --- a/lib/py/test/thrift_TSerializer.py +++ b/lib/py/test/thrift_TSerializer.py @@ -71,7 +71,7 @@ def test_TCompactProtocolAccelerated(self): self.verify(self.compact_serialized, factory) def test_TCompactProtocolAccelerated_unicode_matches_python(self): - message = Message("é", 42) + message = Message(u"\u00e9", 42) expected = serialize(message, TCompactProtocolFactory()) actual = serialize(message, TCompactProtocolAcceleratedFactory())