@@ -341,3 +341,79 @@ def test_invalid_format(self):
def test_invalid_hex_value(self):
    """A byte token containing non-hex digits is rejected with ValueError."""
    # 'G' is not a hexadecimal digit, so the payload cannot be parsed.
    malformed_token = '<0xFG>'
    with self.assertRaisesRegex(ValueError, 'Invalid hex value'):
        local_tokenizer._parse_hex_byte(malformed_token)
344+
345+
class TestLocalTokenizerHuggingFace(unittest.TestCase):
    """Tests for LocalTokenizer running on a mocked HuggingFace tokenizer.

    The model-proto loader and the HuggingFace tokenizer factory in
    ``genai._local_tokenizer_loader`` are both patched, so each test only
    checks how LocalTokenizer drives the tokenizer (``encode`` and
    ``convert_ids_to_tokens``) and how it shapes the results.
    """

    def setUp(self):
        """Patch the loader functions and build the tokenizer under test."""
        # Register the cleanup *before* starting any patch: unittest skips
        # tearDown() when setUp() raises, so a failure further down (for
        # example in the LocalTokenizer constructor) would otherwise leave
        # the patches active for every test that runs afterwards.
        self.addCleanup(patch.stopall)

        self.mock_load_model_proto = patch(
            'genai._local_tokenizer_loader.load_model_proto'
        ).start()
        self.mock_get_huggingface_tokenizer = patch(
            'genai._local_tokenizer_loader.get_huggingface_tokenizer'
        ).start()

        self.mock_load_model_proto.return_value = MagicMock()
        self.mock_tokenizer = MagicMock()
        self.mock_get_huggingface_tokenizer.return_value = self.mock_tokenizer

        # gemini-3.5-flash maps to gemma4 (HuggingFace)
        self.tokenizer = local_tokenizer.LocalTokenizer(
            model_name='gemini-3.5-flash'
        )

    def test_count_tokens_simple_string(self):
        """count_tokens reports how many ids the tokenizer produced."""
        self.mock_tokenizer.encode.return_value = [[1, 2, 3]]

        result = self.tokenizer.count_tokens('Hello world')

        self.assertEqual(result.total_tokens, 3)
        # A bare string is expected to be encoded as a one-element batch.
        self.mock_tokenizer.encode.assert_called_once_with(['Hello world'])

    def test_compute_tokens_simple_string(self):
        """compute_tokens returns ids, byte tokens and the 'user' role."""
        self.mock_tokenizer.encode.return_value = [[1, 2, 3]]
        self.mock_tokenizer.convert_ids_to_tokens.return_value = [
            'He',
            'llo',
            ' world',
        ]

        result = self.tokenizer.compute_tokens('Hello world')

        self.assertEqual(len(result.tokens_info), 1)
        self.assertEqual(result.tokens_info[0].token_ids, [1, 2, 3])
        # Token strings from the tokenizer are expected back as bytes.
        self.assertEqual(
            result.tokens_info[0].tokens, [b'He', b'llo', b' world']
        )
        self.assertEqual(result.tokens_info[0].role, 'user')

        self.mock_tokenizer.encode.assert_called_once_with(['Hello world'])
        self.mock_tokenizer.convert_ids_to_tokens.assert_called_once_with(
            [1, 2, 3]
        )

    def test_compute_tokens_special_characters(self):
        """Checks the bytes produced for tokens with '_' and U+2581 markers."""
        self.mock_tokenizer.encode.return_value = [[1, 2]]
        # Use U+2581 (lower one eighth block) and underscore
        self.mock_tokenizer.convert_ids_to_tokens.return_value = [
            '_world',
            '\u2581 hello',
        ]

        result = self.tokenizer.compute_tokens('dummy')

        self.assertEqual(
            result.tokens_info[0].tokens, [b' world', b' hello']
        )

    def test_compute_tokens_with_chat_history(self):
        """Each Content in a history yields its own entry with its role."""
        self.mock_tokenizer.encode.return_value = [[1], [2, 3]]
        # One convert_ids_to_tokens result per encoded message, in order.
        self.mock_tokenizer.convert_ids_to_tokens.side_effect = [
            ['Hello'],
            ['Hi', ' there!'],
        ]
        history = [
            types.Content(role='user', parts=[types.Part(text='Hello')]),
            types.Content(role='model', parts=[types.Part(text='Hi there!')]),
        ]

        result = self.tokenizer.compute_tokens(history)

        self.assertEqual(len(result.tokens_info), 2)
        self.assertEqual(result.tokens_info[0].token_ids, [1])
        self.assertEqual(result.tokens_info[0].tokens, [b'Hello'])
        self.assertEqual(result.tokens_info[0].role, 'user')
        self.assertEqual(result.tokens_info[1].token_ids, [2, 3])
        self.assertEqual(result.tokens_info[1].tokens, [b'Hi', b' there!'])
        self.assertEqual(result.tokens_info[1].role, 'model')

        # Both message texts are expected in a single batched encode call.
        self.mock_tokenizer.encode.assert_called_once_with(
            ['Hello', 'Hi there!']
        )
        self.mock_tokenizer.convert_ids_to_tokens.assert_has_calls([
            unittest.mock.call([1]),
            unittest.mock.call([2, 3]),
        ])
419+
0 commit comments