init commit

This commit is contained in:
Max Bain
2022-12-14 18:59:12 +00:00
parent 9791862c45
commit 9f6fa61160
38 changed files with 105726 additions and 2 deletions

14
tests/test_tokenizer.py Normal file
View File

@@ -0,0 +1,14 @@
from whisper.tokenizer import get_tokenizer
def test_tokenizer():
    """Check that both tokenizer variants round-trip a Korean sentence.

    Builds one tokenizer with ``multilingual=False`` and one with
    ``multilingual=True``, encodes the same Hangul text with each, and
    asserts that:

    * decoding each token sequence reproduces the input text exactly, and
    * the multilingual tokenizer yields strictly fewer tokens than the
      non-multilingual one for this text.
    """
    sample = "다람쥐 헌 쳇바퀴에 타고파"

    english_only = get_tokenizer(multilingual=False)
    multilingual = get_tokenizer(multilingual=True)

    english_only_ids = english_only.encode(sample)
    multilingual_ids = multilingual.encode(sample)

    # Encoding followed by decoding must be lossless for both variants.
    assert english_only.decode(english_only_ids) == sample
    assert multilingual.decode(multilingual_ids) == sample

    # The multilingual variant is expected to need fewer tokens here.
    assert len(multilingual_ids) < len(english_only_ids)