@@ -241,11 +241,15 @@ export class TikTokenizer {
241241 const piece = match [ 0 ] ;
242242 if ( this . cache . has ( piece ) ) {
243243 let tokens = this . cache . get ( piece ) ;
244- tokenCount += tokens ! . length ;
245- if ( tokenCount <= maxTokenCount ) {
244+ if ( tokenCount + tokens ! . length <= maxTokenCount ) {
245+ tokenCount += tokens ! . length ;
246246 encodeLength += piece . length ;
247247 tokenIds . push ( ...tokens ! ) ;
248248 } else {
249+ let remainingTokens = maxTokenCount - tokenCount ;
250+ tokenCount += remainingTokens ;
251+ encodeLength += piece . length ;
252+ tokenIds . push ( ...tokens ! . slice ( 0 , remainingTokens ) ) ;
249253 break ;
250254 }
251255 } else {
@@ -254,8 +258,8 @@ export class TikTokenizer {
254258 const token = this . encoder ! . get ( uint8ArrayToString ( bytes ) ) ;
255259 if ( token !== undefined ) {
256260 this . cache . set ( piece , [ token ] ) ;
257- tokenCount ++ ;
258- if ( tokenCount <= maxTokenCount ) {
261+ if ( tokenCount + 1 <= maxTokenCount ) {
262+ tokenCount ++ ;
259263 encodeLength += piece . length ;
260264 tokenIds . push ( token ) ;
261265 } else {
@@ -264,11 +268,15 @@ export class TikTokenizer {
264268 } else {
265269 const encodedTokens = bytePairEncode ( bytes , this . encoder ! ) ;
266270 this . cache . set ( piece , encodedTokens ) ;
267- tokenCount += encodedTokens . length ;
268- if ( tokenCount <= maxTokenCount ) {
271+ if ( tokenCount + encodedTokens . length <= maxTokenCount ) {
272+ tokenCount += encodedTokens . length ;
269273 encodeLength += piece . length ;
270274 tokenIds . push ( ...encodedTokens ) ;
271275 } else {
276+ let remainingTokens = maxTokenCount - tokenCount ;
277+ tokenCount += remainingTokens ;
278+ encodeLength += piece . length ;
279+ tokenIds . push ( ...encodedTokens . slice ( 0 , remainingTokens ) ) ;
272280 break ;
273281 }
274282 }
@@ -443,6 +451,16 @@ export class TikTokenizer {
443451 }
444452 }
445453
454+ // Fallback: if the prefix token count exceeds maxTokenCount, re-encode the whole text and keep only its last maxTokenCount tokens
455+ if ( actualPrefixTokenCount > maxTokenCount ) {
456+ const encodedTokens = this . encode ( text , allowedSpecial ) ;
457+ const slicedTokens = encodedTokens . slice ( encodedTokens . length - maxTokenCount ) ;
458+ return {
459+ tokenIds : slicedTokens ,
460+ text : this . decode ( slicedTokens )
461+ } ;
462+ }
463+
446464 return {
447465 tokenIds : tokenIds . slice ( actualPrefixTokenCount ) ,
448466 text : text . slice ( actualPrefixStrLength )
0 commit comments