mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
init commit
This commit is contained in:
14
tests/test_tokenizer.py
Normal file
14
tests/test_tokenizer.py
Normal file
@ -0,0 +1,14 @@
|
||||
from whisper.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
def test_tokenizer():
    """Round-trip a Hangul sentence through both tokenizer variants.

    Builds one tokenizer with ``multilingual=False`` and one with
    ``multilingual=True``, encodes the same sample with each, and checks
    that:

    * decoding each token sequence reproduces the sample exactly, and
    * the non-multilingual tokenizer needs strictly more tokens than the
      multilingual one for this sample.
    """
    sample = "다람쥐 헌 쳇바퀴에 타고파"

    english_only = get_tokenizer(multilingual=False)
    multilingual = get_tokenizer(multilingual=True)

    english_only_ids = english_only.encode(sample)
    multilingual_ids = multilingual.encode(sample)

    # Encoding followed by decoding must be lossless for both variants.
    assert english_only.decode(english_only_ids) == sample
    assert multilingual.decode(multilingual_ids) == sample

    # The test expects the multilingual vocabulary to be the more compact
    # encoding for this non-English text.
    assert len(english_only_ids) > len(multilingual_ids)
|
Reference in New Issue
Block a user