Optimize decode for ~27-75% better performance.

davecgh · davecgh · commit 342c7ce70642 · 2025-09-10T13:31:48.000-05:00
This optimizes the decode function to significantly improve its
performance.

It accomplishes this by making the following overall changes:

- uses uint32 words to reduce the total number of multiplications that
  need to be done
- changes to a more efficient total size approximation which reduces the
  overall internal buffer sizes
- employs a stack-allocated array for the internal working buffer when
  working with typical sizes
- skips all leading zeros

Note that there is an additional heap allocation for larger inputs as
compared to the existing code, but the overall perf gains on such large
inputs are even better than on the more typical smaller inputs because
the fewer overall calculations more than make up for it, as can be seen
in the benchmarks.

Finally, in an effort to help ensure correctness, the new code was fuzz
tested for 24 hours on 16 cores for a total effective fuzz time of 384
hours with no issues found.

name                             old time/op   new time/op   delta
--------------------------------------------------------------------
Base58Decode/20_bytes_addrhash    240ns ± 2%    133ns ± 1%   -44.34%
Base58Decode/53_chars_wif         798ns ± 2%    299ns ± 1%   -62.51%
Base58Decode/111_chars_extkey    3.54µs ± 1%   1.01µs ± 2%   -71.51%
Base58Decode/50_zeros             128ns ± 1%     69ns ± 1%   -46.14%
Base58Decode/200_bytes_large     20.0µs ± 1%    5.1µs ± 0%   -74.29%
CheckDecode                       677ns ± 1%    493ns ± 0%   -27.24%

name                             old allocs/op new allocs/op  delta
----------------------------------------------------------------------
Base58Decode/20_bytes_addrhash   1.00 ± 0%     1.00 ± 0%      ~
Base58Decode/53_chars_wif        1.00 ± 0%     1.00 ± 0%      ~
Base58Decode/111_chars_extkey    1.00 ± 0%     1.00 ± 0%      ~
Base58Decode/50_zeros            1.00 ± 0%     1.00 ± 0%      ~
Base58Decode/200_bytes_large     1.00 ± 0%     2.00 ± 0%      +100.00%
CheckDecode                      1.00 ± 0%     1.00 ± 0%      ~
diff --git a/base58.go b/base58.go
@@ -1,10 +1,14 @@
 // Copyright (c) 2013-2015 The btcsuite developers
-// Copyright (c) 2015-2020 The Decred developers
+// Copyright (c) 2015-2025 The Decred developers
 // Use of this source code is governed by an ISC
 // license that can be found in the LICENSE file.
 
 package base58
 
+import (
+	"math/bits"
+)
+
 //go:generate go run genalphabet.go
 
 // Decode decodes a modified base58 string to a byte slice.
@@ -13,45 +17,92 @@ func Decode(input string) []byte {
 		return []byte("")
 	}
 
-	// The max possible output size is when a base58 encoding consists of
-	// nothing but the alphabet character at index 0 which would result in the
-	// same number of bytes as the number of input chars.
-	output := make([]byte, len(input))
+	// Determine the maximum possible output size.
+	//
+	// Since the conversion is from base58 to base256, the max possible number
+	// of bytes of output per input byte, excluding the leading zeros, is
+	// log_256(58).  Therefore, the max total output size is the number of
+	// leading zero bytes plus ceil(inputSizeMinusLeadingZeros * log_256(58)).
+	//
+	// Note that log_256(58) ~= 0.7322 < 47/64 which is within 0.3% of the true
+	// value and efficient to compute as it only involves division by a power of
+	// 2 and thus serves as a good approximation.  So, the calculation below is
+	// the integer division equivalent of nlz + ceil(len(input[nlz:]) * 47/64).
+	//
+	// Finally, in order to avoid additional conditional branches in the
+	// conversion from uint32s to bytes, the max output size is rounded up to
+	// the next multiple of 4.
+	var nlz int
+	for i := 0; i < len(input) && input[i] == alphabetIdx0; i++ {
+		nlz++
+	}
+	maxOutputSizeNoLZ := (len(input[nlz:])*47 + 63) / 64
+	maxOutputSize := nlz + maxOutputSizeNoLZ
+	maxOutputSize = ((maxOutputSize + 3) / 4) * 4
+	output := make([]byte, maxOutputSize)
+
+	// The algorithm below performs the calculations with uint32s for better
+	// performance and the total number of uint32s is ceil(maxOutputSizeNoLZ /
+	// 4).  Note that the leading zeros are skipped here, so the calculation is
+	// based on the max output size excluding them.
+	//
+	// In order to avoid an additional heap allocation for the vast majority of
+	// typical cases, use an array on the stack for inputs of up to 120 chars
+	// (plus any leading zeros) and fall back to a heap alloc for larger inputs.
+	// Note that 120 input chars, excluding leading zeros, equates to a max
+	// output size of 92 when applying the same calculations as above.
+	//
+	// This value was chosen because it provides a good balance between alloc
+	// size, speed, and the max chars in the vast majority of inputs decoded in
+	// the most common use cases.
+	const maxOut32StackAlloc = 92 / 4
+	maxOut32Size := (maxOutputSizeNoLZ + 3) / 4
+	var out32 []uint32
+	if maxOut32Size <= maxOut32StackAlloc {
+		var out32Arr [maxOut32StackAlloc]uint32
+		out32 = out32Arr[:maxOut32Size]
+	} else {
+		out32 = make([]uint32, maxOut32Size)
+	}
 
-	// Encode to base256 in reverse order to avoid extra calculations to
-	// determine the final output size in favor of just keeping track while
-	// iterating.
-	var index int
-	for _, r := range []byte(input) {
+	// Decode to base256 in reverse order to reduce the total number of overall
+	// calculations.
+	var out32Idx int
+	for _, r := range []byte(input[nlz:]) {
 		// Invalid base58 character.
-		val := uint32(b58[r])
+		val := uint64(b58[r])
 		if val == 255 {
 			return []byte("")
 		}
 
-		// Multiply each byte in the output by 58 and encode to base256 while
-		// propagating the carry.
-		for i, b := range output[:index] {
-			val += uint32(b) * 58
-			output[i] = byte(val)
-			val >>= 8
+		for i, ui32 := range out32[:out32Idx] {
+			val += uint64(ui32) * 58
+			out32[i] = uint32(val) // nolint:gosec
+			val >>= 32
 		}
-		for ; val > 0; val >>= 8 {
-			output[index] = byte(val)
-			index++
+		if val > 0 {
+			out32[out32Idx] = uint32(val) // nolint:gosec
+			out32Idx++
 		}
 	}
 
-	// Account for the leading zeros in the input.  They are appended since the
-	// encoding is happening in reverse order.
-	for _, r := range []byte(input) {
-		if r != alphabetIdx0 {
-			break
-		}
+	// Convert uint32 words to bytes.
+	var index int
+	for _, ui32 := range out32[:out32Idx] {
+		output[index] = byte(ui32)
+		output[index+1] = byte(ui32 >> 8)
+		output[index+2] = byte(ui32 >> 16)
+		output[index+3] = byte(ui32 >> 24)
+		index += 4
+	}
 
-		output[index] = 0
-		index++
+	// Adjust the output index to the position of the most significant byte and
+	// to account for the leading zeros in the input.  They come last since the
+	// decoding is happening in reverse order.
+	if out32Idx > 0 {
+		index -= bits.LeadingZeros32(out32[out32Idx-1]) / 8
 	}
+	index += nlz
 
 	// Truncate the output buffer to the actual number of decoded bytes and
 	// reverse it since it was calculated in reverse order.