From e14575a5331f4b95866c3ef2a7c148d80244c17c Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Thu, 11 Jun 2026 22:16:04 -0700
Subject: [PATCH] Update

[ghstack-poisoned]
---
NOTE(review): this copy of the patch had lost its line breaks and every piece
of angle-bracket text (include paths, template arguments, the
<tokenizer.json> usage placeholder, the author e-mail). Line structure and
template arguments are restored below; the project header path and the
placeholder name are reconstructions -- confirm them before applying. The
author e-mail is still missing, and the blob ids on the "index" lines predate
these edits.

 benchmark/.gitignore                      |   1 +
 benchmark/CMakeLists.txt                  |  21 ++++
 benchmark/hf_tokenizer_encode_latency.cpp | 156 ++++++++++++++++++++++
 benchmark/run.sh                          |  15 +++
 4 files changed, 193 insertions(+)
 create mode 100644 benchmark/.gitignore
 create mode 100644 benchmark/CMakeLists.txt
 create mode 100644 benchmark/hf_tokenizer_encode_latency.cpp
 create mode 100755 benchmark/run.sh

diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 00000000..567609b1
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 00000000..7b0cd3ca
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,21 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#
+# Build the encode latency benchmark. Standalone: pulls in the parent
+# tokenizers library via FetchContent (mirrors test/CMakeLists.txt) so it can be
+# configured on its own and always rebuilds the library under test.
+#
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+
+project(TokenizersBenchmark)
+
+include(FetchContent)
+FetchContent_Declare(
+  tokenizers SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/.. BUILD_ALWAYS ON
+)
+set(SUPPORT_REGEX_LOOKAHEAD ON)
+FetchContent_MakeAvailable(tokenizers)
+
+add_executable(hf_tokenizer_encode_latency hf_tokenizer_encode_latency.cpp)
+target_link_libraries(hf_tokenizer_encode_latency PUBLIC tokenizers)
diff --git a/benchmark/hf_tokenizer_encode_latency.cpp b/benchmark/hf_tokenizer_encode_latency.cpp
new file mode 100644
index 00000000..d2549134
--- /dev/null
+++ b/benchmark/hf_tokenizer_encode_latency.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// @lint-ignore-every LICENSELINT
+
+// Latency benchmark for HFTokenizer::encode. Encodes prompt templates
+// (prose/code/dialogue) repeated to growing lengths and prints the mean wall
+// time per encode. Correctness is covered by the unit tests; this only measures
+// performance.
+//
+//   hf_tokenizer_encode_latency <tokenizer.json> [reps]
+//
+// <tokenizer.json> is passed to HFTokenizer::load(); [reps] is the number of
+// timed encodes per vector (default 5). Each vector is encoded once untimed
+// first, both as a warm-up and to obtain the token count for the report.
+
+#include <pytorch/tokenizers/hf_tokenizer.h>
+
+#include <cerrno>
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+using tokenizers::HFTokenizer;
+
+namespace {
+
+// One benchmark input: `base` concatenated `repeats` times.
+struct Vector {
+  std::string name; // label printed in the first output column
+  std::string base; // repeated to reach the target length
+  size_t repeats; // how many copies of `base` make up the encoded text
+};
+
+// Fixed set of benchmark inputs, in the order they are reported.
+std::vector<Vector> vectors() {
+  return {
+      {"prose_0.5k",
+       "The quick brown fox jumps over the lazy dog near the riverbank. ",
+       35},
+      {"code_1k",
+       "for (int i = 0; i < n; ++i) { sum += arr[i] * weights[i]; } // accumulate\n",
+       61},
+      {"dialogue_1.5k",
+       "User: Can you summarize the previous section in two sentences please? "
+       "Assistant: Sure, here is a concise summary of the main points raised. ",
+       48},
+      {"prose_2k",
+       "In the beginning the system parsed every token sequentially, which made "
+       "long prompts increasingly expensive to process as length grew larger. ",
+       63},
+      {"prose_8k",
+       "In the beginning the system parsed every token sequentially, which made "
+       "long prompts increasingly expensive to process as length grew larger. ",
+       348},
+  };
+}
+
+// Builds the text to encode: `v.repeats` back-to-back copies of `v.base`.
+std::string build_text(const Vector& v) {
+  std::string text;
+  text.reserve(v.base.size() * v.repeats);
+  for (size_t i = 0; i < v.repeats; ++i) {
+    text += v.base;
+  }
+  return text;
+}
+
+// Parses `arg` as a strictly positive base-10 integer into `out`. Returns false
+// (leaving `out` untouched) on an empty/non-numeric string, trailing
+// characters, out-of-range input, or a value <= 0.
+bool parse_reps(const char* arg, long& out) {
+  char* end = nullptr;
+  errno = 0; // strtol reports overflow only through errno
+  const long value = std::strtol(arg, &end, 10);
+  if (end == arg || *end != '\0' || errno == ERANGE || value <= 0) {
+    return false;
+  }
+  out = value;
+  return true;
+}
+
+// Prints the encode-failure diagnostic for vector `name` to stderr.
+void report_encode_failure(const std::string& name, int code) {
+  std::fprintf(
+      stderr, "encode failed for %s (error %d)\n", name.c_str(), code);
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  if (argc < 2 || argc > 3) {
+    std::fprintf(stderr, "usage: %s <tokenizer.json> [reps]\n", argv[0]);
+    return 2;
+  }
+  long reps = 5;
+  if (argc > 2 && !parse_reps(argv[2], reps)) {
+    std::fprintf(stderr, "reps must be a positive integer, got '%s'\n", argv[2]);
+    return 2;
+  }
+
+  HFTokenizer tok;
+  if (tok.load(argv[1]) != tokenizers::Error::Ok) {
+    std::fprintf(stderr, "failed to load tokenizer: %s\n", argv[1]);
+    return 1;
+  }
+
+  std::printf(
+      "%-14s %10s %9s %12s %12s\n",
+      "vector",
+      "chars",
+      "ids",
+      "mean_ms",
+      "ns/token");
+  for (const auto& v : vectors()) {
+    const std::string text = build_text(v);
+
+    // Untimed first encode: doubles as a warm-up and yields the token count.
+    auto warm = tok.encode(text, 0, 0);
+    if (!warm.ok()) {
+      report_encode_failure(v.name, static_cast<int>(warm.error()));
+      return 1;
+    }
+    const size_t ids = warm.get().size();
+
+    double total_ms = 0;
+    for (long r = 0; r < reps; ++r) {
+      const auto t0 = std::chrono::steady_clock::now();
+      auto res = tok.encode(text, 0, 0);
+      const auto t1 = std::chrono::steady_clock::now();
+      // Check after stopping the clock so error handling is never timed.
+      if (!res.ok()) {
+        report_encode_failure(v.name, static_cast<int>(res.error()));
+        return 1;
+      }
+      total_ms += std::chrono::duration<double, std::milli>(t1 - t0).count();
+    }
+    const double mean_ms = total_ms / static_cast<double>(reps);
+    std::printf(
+        "%-14s %10zu %9zu %12.3f %12.1f\n",
+        v.name.c_str(),
+        text.size(),
+        ids,
+        mean_ms,
+        // ms -> ns is a factor of 1e6; guard against an empty encoding.
+        ids ? mean_ms * 1e6 / static_cast<double>(ids) : 0.0);
+  }
+  return 0;
+}
diff --git a/benchmark/run.sh b/benchmark/run.sh
new file mode 100755
index 00000000..bb00b81a
--- /dev/null
+++ b/benchmark/run.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Configure (Release) + build + run the encode latency benchmark.
+#
+#   run.sh <tokenizer.json> [reps]
+set -euo pipefail
+
+here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+tokenizer="${1:?usage: run.sh <tokenizer.json> [reps]}"
+shift || true
+
+build_dir="${here}/build"
+cmake -S "${here}" -B "${build_dir}" -DCMAKE_BUILD_TYPE=Release >/dev/null
+cmake --build "${build_dir}" --target hf_tokenizer_encode_latency --parallel
+
+"${build_dir}/hf_tokenizer_encode_latency" "${tokenizer}" "$@"