This commit is contained in:
kiankyars 2025-11-23 08:11:54 -07:00 committed by GitHub
commit 52c7d23a63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -77,8 +77,7 @@ vocab_size = tokenizer.get_vocab_size()
special_set = set(tokenizer.get_special_tokens()) special_set = set(tokenizer.get_special_tokens())
token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)] token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)]
token_bytes = [] token_bytes = []
for token_id in range(vocab_size): for token_str in token_strings: # the Python string representation of this token
token_str = token_strings[token_id] # the Python string representation of this token
if token_str in special_set: if token_str in special_set:
token_bytes.append(0) # special characters are not counted token_bytes.append(0) # special characters are not counted
else: else: