meta-pytorch · digantdesai · Jun 12, 2026
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -0,0 +1,21 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#
+# Build the encode latency benchmark. Standalone: pulls in the parent
+# tokenizers library via FetchContent (mirrors test/CMakeLists.txt) so it can be
+# configured on its own and always rebuilds the library under test.
+#
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17) # default C++ standard for targets defined below
+# Standalone project, so this directory can be configured by itself.
+project(TokenizersBenchmark)
+# Use the parent directory in place as the `tokenizers` dependency.
+include(FetchContent)
+FetchContent_Declare(
+  tokenizers SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/.. BUILD_ALWAYS ON
+)
+set(SUPPORT_REGEX_LOOKAHEAD ON) # before MakeAvailable; parent option (verify)
+FetchContent_MakeAvailable(tokenizers)
+# One source file, linked against the library target made available above.
+add_executable(hf_tokenizer_encode_latency hf_tokenizer_encode_latency.cpp)
+target_link_libraries(hf_tokenizer_encode_latency PUBLIC tokenizers)
diff --git a/benchmark/hf_tokenizer_encode_latency.cpp b/benchmark/hf_tokenizer_encode_latency.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// @lint-ignore-every LICENSELINT
+
+// Latency benchmark for HFTokenizer::encode. Encodes prompt templates
+// (prose/code/dialogue) repeated to growing lengths and prints the mean wall
+// time per encode. Correctness is covered by the unit tests; this only measures
+// performance.
+//
+//   hf_tokenizer_encode_latency <tokenizer.json> [reps]
+
+#include <pytorch/tokenizers/hf_tokenizer.h>
+
+#include <cerrno>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+using tokenizers::HFTokenizer;
+
+namespace {
+
+struct Vector { // one benchmark input: a seed string expanded by repetition
+  std::string name; // label printed in the "vector" column of the report
+  std::string base; // repeated to reach the target length
+  size_t repeats; // how many copies of `base` are concatenated
+};
+
+// Fixed benchmark inputs, ordered from shortest expanded text to longest.
+std::vector<Vector> vectors() {
+  const char* const long_prose = // seed shared by prose_2k and prose_8k
+      "In the beginning the system parsed every token sequentially, which made "
+      "long prompts increasingly expensive to process as length grew larger. ";
+  std::vector<Vector> cases;
+  cases.push_back(
+      {"prose_0.5k",
+       "The quick brown fox jumps over the lazy dog near the riverbank. ", 35});
+  cases.push_back(
+      {"code_1k",
+       "for (int i = 0; i < n; ++i) { sum += arr[i] * weights[i]; } // accumulate\n",
+       61});
+  cases.push_back(
+      {"dialogue_1.5k",
+       "User: Can you summarize the previous section in two sentences please? "
+       "Assistant: Sure, here is a concise summary of the main points raised. ",
+       48});
+  cases.push_back({"prose_2k", long_prose, 63});
+  cases.push_back({"prose_8k", long_prose, 348});
+  return cases;
+}
+
+} // namespace
+
+// Entry point. Loads the tokenizer at argv[1]; for each benchmark input, runs
+// one untimed encode and then `reps` timed encodes (argv[2], default 5) and
+// prints a table row. Exits 2 on bad usage, 1 on load/encode failure, else 0.
+int main(int argc, char** argv) {
+  using Clock = std::chrono::steady_clock;
+  using Millis = std::chrono::duration<double, std::milli>;
+  const bool arg_count_ok = argc == 2 || argc == 3;
+  if (!arg_count_ok) {
+    std::fprintf(stderr, "usage: %s <tokenizer.json> [reps]\n", argv[0]);
+    return 2;
+  }
+  long reps = 5;
+  if (argc == 3) {
+    char* tail = nullptr;
+    errno = 0; // cleared so a stale ERANGE is not mistaken for overflow
+    reps = std::strtol(argv[2], &tail, 10);
+    // Accept only a fully consumed, in-range, strictly positive number.
+    const bool valid = *tail == '\0' && errno != ERANGE && reps > 0;
+    if (!valid) {
+      std::fprintf(stderr, "reps must be a positive integer, got '%s'\n", argv[2]);
+      return 2;
+    }
+  }
+
+  HFTokenizer tok;
+  if (tok.load(argv[1]) != tokenizers::Error::Ok) {
+    std::fprintf(stderr, "failed to load tokenizer: %s\n", argv[1]);
+    return 1;
+  }
+
+  // Shared reporter for a failed encode; yields the process exit code.
+  const auto encode_failed = [](const std::string& name, auto error) {
+    std::fprintf(
+        stderr, "encode failed for %s (error %d)\n", name.c_str(),
+        static_cast<int>(error));
+    return 1;
+  };
+  std::printf(
+      "%-14s %10s %9s %12s %12s\n",
+      "vector", "chars", "ids", "mean_ms", "ns/token");
+  for (const auto& v : vectors()) {
+    // Expand the seed string to the full benchmark text.
+    std::string text;
+    text.reserve(v.base.size() * v.repeats);
+    for (size_t left = v.repeats; left > 0; --left) {
+      text.append(v.base);
+    }
+
+    // Untimed warm-up pass; its result also supplies the id count.
+    auto first = tok.encode(text, 0, 0);
+    if (!first.ok()) {
+      return encode_failed(v.name, first.error());
+    }
+    const size_t ids = first.get().size();
+
+    double total_ms = 0;
+    for (long done = 0; done < reps; ++done) {
+      const auto start = Clock::now();
+      auto res = tok.encode(text, 0, 0);
+      const auto stop = Clock::now(); // read before any error handling
+      if (!res.ok()) {
+        return encode_failed(v.name, res.error());
+      }
+      total_ms += Millis(stop - start).count();
+    }
+    const double mean_ms = total_ms / reps;
+    std::printf(
+        "%-14s %10zu %9zu %12.3f %12.1f\n",
+        v.name.c_str(), text.size(), ids, mean_ms,
+        ids ? mean_ms * 1e6 / static_cast<double>(ids) : 0.0);
+  }
+  return 0;
+}
diff --git a/benchmark/run.sh b/benchmark/run.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Configure (Release, configure stdout silenced), build, and run the encode
+# latency benchmark; arguments after the tokenizer path are forwarded to it.
+#   run.sh <tokenizer.json> [reps]
+set -euo pipefail
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+tokenizer_json="${1:?usage: run.sh <tokenizer.json> [reps]}"
+shift || true # leave only the optional extra arguments in "$@"
+
+readonly out_dir="${script_dir}/build" target=hf_tokenizer_encode_latency
+cmake -S "${script_dir}" -B "${out_dir}" -DCMAKE_BUILD_TYPE=Release >/dev/null
+cmake --build "${out_dir}" --target "${target}" --parallel
+
+"${out_dir}/${target}" "${tokenizer_json}" "$@"