414 Commits

Author SHA1 Message Date
036b5b0717 Merge c89b4f898f into d700b56c9c 2025-06-13 15:33:03 +02:00
d700b56c9c docs: add missing torch import to Python usage example in README 2025-06-08 03:34:49 -06:00
bog
b343241253 feat: add diarize_model arg to CLI (#1101) 2025-05-31 13:32:31 +02:00
6fe0a8784a docs: add troubleshooting section for libcudnn dependencies in README 2025-05-31 05:20:06 -06:00
c89b4f898f fix: incorrect type annotation in get_writer return value
The audio_path attribute that the __call__ method of the ResultWriter class takes is a str, not TextIO
2025-05-13 02:45:33 +02:00
5012650d0f chore: update lockfile 2025-05-03 16:25:43 +02:00
108bd0c400 chore: add lockfile check step to CI workflows 2025-05-03 16:25:43 +02:00
b2d50a027b chore: bump version 2025-05-03 11:38:54 +02:00
36d552cad3 fix: remove DiarizationPipeline from public API 2025-05-03 09:25:59 +02:00
7d36b832f9 refactor: update CLI entry point 2025-05-03 09:25:59 +02:00
d2a493e910 refactor: implement lazy loading for module imports in whisperx 2025-05-03 09:25:59 +02:00
f5b40b5366 chore: update version to 3.3.3 in pyproject.toml and uv.lock 2025-05-01 11:08:54 +02:00
ac0c8bd79a feat: add version and Python version arguments to CLI 2025-05-01 11:08:54 +02:00
cd59f21d1a fix: downgrade ctranslate2 dependency version 2025-05-01 11:08:54 +02:00
0aed874589 Remove duplicated item
"lv": "latvian"
2025-04-12 11:08:15 +02:00
f10dbf6ab1 fix: update setuptools configuration to include package discovery for whisperx 2025-03-25 18:49:44 +01:00
a7564c2ad6 docs: update installation instructions 2025-03-25 17:02:41 +01:00
e7712f496e refactor: update import statements to use explicit module paths across multiple files 2025-03-25 16:24:21 +01:00
8e53866704 feat: pass hotwords argument to get_prompt (#1073)
Co-authored-by: Jade Moillic <jade.moillic@radiofrance.com>
2025-03-24 10:47:47 +01:00
3205436d58 Merge pull request #1002 from Barabazs/feat/uv 2025-03-23 12:59:46 +00:00
8c58c54635 Revert "feat: add Basque alignment model (#1074)" (#1077)
This reverts commit 0d9807adc5.
2025-03-05 15:19:23 +01:00
0d9807adc5 feat: add Basque alignment model (#1074) 2025-03-04 14:55:30 +01:00
4db839018c feat: add Tagalog (tl - Filipino) Phoneme-based ASR Model (#1067) 2025-02-23 09:59:48 +01:00
f8d11df727 docs: Update README example commands with generic audio path 2025-02-19 08:24:04 +01:00
d2f0e53f71 chore: remove tmp workflow 2025-02-12 08:23:23 +01:00
7489ebf876 feat: update build and release workflow to use uv for package installation and publishing 2025-02-12 08:23:23 +01:00
90256cc481 feat: use uv recommended setup 2025-02-12 08:23:23 +01:00
b41ebd4871 chore: add numpy to deps 2025-02-12 08:23:23 +01:00
63bc1903c1 feat: update Python compatibility workflow to use uv 2025-02-12 08:23:23 +01:00
272714e07d feat: use uv for building package 2025-02-12 08:23:23 +01:00
44e8bf5bb6 Merge pull request #1024 from philmcmahon/local-files-only-param
Add models_cache_only param
2025-01-27 14:26:19 +00:00
7b3c9ce629 Add models_cache_only param 2025-01-27 12:16:37 +00:00
36d2622e27 feat: add Latvian align model 2025-01-25 09:45:17 +01:00
8bfa12193b Merge pull request #1006 from tan90xx/main
chore: fix variable naming inconsistency from `segments` to `segments_list`
2025-01-20 14:05:34 +00:00
acbeba6057 Update silero.py 2025-01-20 20:01:21 +08:00
fca563a782 Update silero.py 2025-01-20 19:52:37 +08:00
2117909bf6 Merge pull request #1005 from tan90xx/main
chore: handle empty segments_list case in silero
2025-01-19 13:51:34 +00:00
de0d8fe313 chore: handle empty segments_list case in silero
prevent errors
2025-01-19 21:20:56 +08:00
355f8e06f7 Merge pull request #1003 from Barabazs/chore/remove-aws-url
chore: remove deprecated VAD_SEGMENTATION_URL
2025-01-17 15:28:24 +00:00
86e2b3ee74 chore: remove deprecated VAD_SEGMENTATION_URL 2025-01-17 09:12:05 +01:00
70c639cdb5 doc: refer to DEFAULT_ALIGN_MODELS_HF for other langs 2025-01-17 08:47:44 +01:00
235536e28d Update links to language models in README 2025-01-17 08:47:44 +01:00
12604a48ea Merge pull request #986 from bfs18/main
support timestamp for numbers.
2025-01-14 21:03:51 +00:00
ffbc73664c change the docstrings and comments to English 2025-01-13 22:56:48 +08:00
289eadfc76 fix a merge error. 2025-01-13 20:26:27 +08:00
22a93f2932 Merge branch 'main' into main 2025-01-13 19:34:21 +08:00
1027367b79 Merge pull request #995 from winking324/main
fix vad_method is none
2025-01-13 10:10:29 +00:00
5e54b872a9 Merge branch 'main' into main 2025-01-13 10:09:20 +00:00
6be02cccfa Update asr.py 2025-01-13 10:08:09 +00:00
2f93e029c7 feat: add SegmentData type for temporary processing during alignment 2025-01-13 10:45:50 +01:00
024bc8481b refactor: consolidate segment data handling in alignment function 2025-01-13 10:45:50 +01:00
f286e7f3de refactor: improve type hints and clean up imports 2025-01-13 10:45:50 +01:00
73e644559d refactor: remove namespace for consistency 2025-01-13 10:45:50 +01:00
1ec527375a fix vad_method is none 2025-01-13 13:53:35 +08:00
6695426a85 fix new vad paths 2025-01-12 12:50:15 +00:00
7a98456321 Merge pull request #888 from 3manifold/silero-vad
Silero VAD support
2025-01-11 17:15:27 +00:00
aaddb83aa5 switch from case to ifelse 2025-01-11 17:11:21 +00:00
c288f4812a Merge branch 'main' into silero-vad 2025-01-11 17:05:53 +00:00
4ebfb078c5 make no beam consistent with backtrack. 2025-01-09 23:13:11 +08:00
65b2332e13 make align a bit faster. 2025-01-09 19:33:26 +08:00
69281f3a29 support timestamps for numbers. 2025-01-09 15:23:40 +08:00
734084cdf6 bump: update version to 3.3.1 2025-01-08 18:00:34 +01:00
9395b0de18 Update tmp.yml 2025-01-08 17:59:28 +01:00
d57f9dc54c Create tmp.yml 2025-01-08 17:59:28 +01:00
a90bd1ce3f dataclasses replace method 2025-01-08 17:59:13 +01:00
79eb8fa53d Accept alternative VAD methods. Extend to use Silero VAD. 2025-01-06 13:41:46 +01:00
10b05fc43f refactor: replace NamedTuple with TranscriptionOptions in FasterWhisperPipeline 2025-01-05 18:56:19 +01:00
26d9b46888 feat: include speaker information in WriteTXT when diarizing 2025-01-05 18:21:34 +01:00
9a8967f27e refactor: add type hints 2025-01-05 11:48:24 +01:00
0f7f9f9f83 refactor: simplify imports for better type inference 2025-01-05 11:48:24 +01:00
c60594fa3b fix: update import statement for conjunctions module 2025-01-05 11:48:24 +01:00
4916192246 chore: bump whisperX to 3.3.0 2025-01-02 14:09:10 +01:00
cbdac53e87 chore: update ctranslate2 version to restrict <4.5.0 2025-01-02 14:09:10 +01:00
940a223219 fix: add UTF-8 encoding when reading README.md 2025-01-02 12:43:59 +01:00
a0eb31019b chore: update license in setup.py 2025-01-02 08:41:04 +01:00
b08ad67a72 docs: update installation instructions in README 2025-01-02 08:35:45 +01:00
c18f9f979b fix: update README image source and enhance setup.py for long description 2025-01-02 08:30:04 +01:00
948b3e368b chore: update gitignore 2025-01-01 18:47:40 +01:00
e9ac5b63bc chore: clean up MANIFEST.in by removing unnecessary asset inclusions 2025-01-01 18:47:40 +01:00
90b45459d9 feat: add build and release workflow 2025-01-01 18:47:40 +01:00
81c4af96a6 feat: add Python compatibility testing workflow
feat: restrict Python versions to 3.9 - 3.12
2025-01-01 15:29:03 +01:00
1c6d9327bc feat: use model_dir as cache_dir for wav2vec2 (#681) 2025-01-01 13:22:27 +01:00
0fdb55d317 feat: add local_files_only option on whisperx.load_model for offline mode (#867)
Adds the parameter local_files_only (default False for consistency) to whisperx.load_model so that the user can avoid downloading the file and return the path to the local cached file if it exists.

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2025-01-01 13:16:45 +01:00
51da22771f feat: add verbose output (#759)
---------

Co-authored-by: Abhishek Sharma <abhishek@zipteams.com>
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2025-01-01 13:07:52 +01:00
15ad5bf7df feat: update versions for pyannote:3.3.2 and faster-whisper:1.1.0 (#936)
* chore: bump faster-whisper to 1.1.0

* chore: bump pyannote to 3.3.2

* feat: add multilingual option in load_model function

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 10:41:09 +01:00
7fdbd21fe3 feat: add support for faster-whisper 1.0.3 (#875)
---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 10:07:42 +01:00
3ff625c561 feat: update faster-whisper to 1.0.2 (#814)
* Update faster-whisper to 1.0.2 to enable model distil-large-v3

* feat: add hotwords option to default_asr_options

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 09:41:22 +01:00
7307306a9d chore: bump version 2024-12-18 09:03:04 +01:00
3027cc32bc Update MANIFEST.in to include necessary files 2024-12-17 08:11:49 +01:00
9e4b1b4c49 fix: Force ctranslate to version 4.4.0
Force ctranslate to version 4.4.0 due libcudnn_ops_infer.so.8:
https://github.com/SYSTRAN/faster-whisper/issues/729

Co-authored-by: Icaro Bombonato <ibombonatosites@gmail.com>
2024-12-16 13:30:08 +01:00
9b9e03c4cc feat: update Norwegian models (#687)
Updated Norwegian Bokmål and Norwegian Nynorsk models

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 11:08:48 +01:00
19eff8e79a feat: add new align models (#922)
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 11:06:43 +01:00
6f3bc5b7b8 Added Romanian phoneme-based ASR model (#791)
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 08:09:53 +01:00
9809336db6 Fix link in README.md 2024-12-16 08:04:59 +01:00
a898b3ba94 Remove typo in error message 2024-12-16 08:02:42 +01:00
c141074cbd Merge pull request #945 from m-bain/m-bain/local_model
move model to assets
2024-12-14 22:54:56 -06:00
a9e50ef0af move model to assets 2024-12-14 22:53:53 -06:00
161ae1f7ad Merge pull request #944 from m-bain/m-bain/local_model
local vad model
2024-12-14 22:34:38 -06:00
a83ddbdf9b local vad model 2024-12-14 22:16:43 -06:00
9e3a9e0e38 Merge pull request #852 from jan-panoch/main
Update alignment.py - added alignment for  sk and sl languages
2024-08-20 00:05:56 +08:00
3f339f9515 Update alignment.py - remove commented-out alignment modules for hr language 2024-08-09 13:00:12 +02:00
9a9b6171e6 Update alignment.py - trying another hr alignment 2024-08-08 08:37:55 +02:00
59b4d88d1d Update alignment.py - trying another hr alignment file 2024-08-08 08:29:11 +02:00
6f70aa6beb Update alignment.py - added croatian (hr) language 2024-08-08 08:10:55 +02:00
912920c591 Update alignment.py - added alignment for sk and sl languages 2024-08-07 10:05:17 +02:00
58f00339af BSD 2 LICENSE 2024-07-11 13:01:15 +04:00
f2da2f858e Update README.md 2024-03-20 15:47:18 +00:00
78dcfaab51 upgrade faster-whisper 2024-02-23 09:30:12 +00:00
d6562c26da Merge pull request #716 from cococig/fix/faster-whisper-from-pypi
fix: update faster-whisper dependencies
2024-02-22 16:51:06 +00:00
c313f4dd5c fix: update faster-whisper dependencies 2024-02-23 01:42:22 +09:00
bbaa2f0d1a update kwargs 2024-02-22 15:59:14 +00:00
e906be9688 Merge pull request #703 from victor-upmeet/large-v3-demo
Add Replicate large-v3 demo
2024-02-18 15:43:51 +00:00
fbbd07bece Merge pull request #669 from KossaiSbai/ks/supress-numeral-symbol-tokens-message
Get rid of numeral_symbol_tokens variable in printed message
2024-02-18 15:43:23 +00:00
d8c9196346 Add Replicate large-v3 demo 2024-02-18 12:17:11 +01:00
2686f74bc9 Get rid of numeral_symbol_tokens variable in printed message 2024-01-19 22:25:21 +00:00
8227807fa9 Delete build/lib/whisperx directory 2024-01-02 19:36:36 -07:00
59962a70be Merge pull request #646 from santialferez/diarize-patch-1
Update pyannote to v3.1.1 to fix a diarization problem (and diarize.py)
2024-01-03 02:35:53 +00:00
06e30b2a25 Merge pull request #654 from Swami-Abhinav/provide-custom-load-vad
Added option to load Custom VAD model to load model method
2024-01-01 17:38:30 +00:00
6bb2f1cd48 Added Vad custom option 2024-01-01 14:56:51 +05:30
f8cc46c6f7 Merge pull request #648 from canoalberto/main
Fixes --model_dir path
2023-12-28 21:23:42 +00:00
942c336b8f Fixes --model_dir path 2023-12-27 14:03:54 -05:00
8ae6416594 update setup.py to install pyannote.audio==3.1.1, update diarize.py to include num_speakers; to fix Issue #592 2023-12-26 13:01:49 +01:00
8540ff5985 Merge pull request #636 from NbAiLab/peregilk-patch-1
Adding Norwegian Bokmål and Norwegian Nynorsk
2023-12-19 15:55:20 +00:00
5dfbfcbdc0 Adding Norwegian Bokmål and Norwegian Nynorsk
Adding Wav2Vec2-models for Norwegian Bokmål and Norwegian Nynorsk. The models are testet together with WhisperX, and works great. For Bokmål I have added the 1B model, even if I see fairly little difference between that and the 300M model. For Norwegian Nynorsk only a 300M exist.The quality of the Wav2Vec models are also reported here: https://arxiv.org/abs/2307.01672
2023-12-19 08:48:21 +01:00
1c7b1a87da Merge pull request #630 from mlopsengr/patch-1
Update README.md
2023-12-17 15:53:44 +00:00
9f23739f90 Update README.md
Demonstrates use of argument to save model to local path.
2023-12-15 13:46:32 +00:00
19ab91c5a6 Merge pull request #618 from gillens/main
Update README to correct speaker diarization version link
2023-12-10 17:35:42 -06:00
089cd5ab21 Merge pull request #585 from kurianbenoy/ml-asr
Add alignment model for Malayalam
2023-12-10 17:35:14 -06:00
2b7ab95ad6 Update README to Correct Speaker Diarization Version Link
Currently errors if user just accepts terms for README link version
3.0. Version 3.1 introduced in pull request #586
2023-12-07 12:48:21 -08:00
4553e0d4ed Merge pull request #617 from MahmoudAshraf97/main 2023-12-04 16:15:48 +00:00
f865dfe710 fix typo 2023-12-04 17:38:50 +03:00
4acbdd75be add "yue" to supported languages that was added along with Large-V3 2023-12-04 17:27:54 +03:00
e9c507ce5d Merge pull request #605 from M0HID/patch-1
fix link
2023-11-28 11:56:29 +00:00
a5dca2cc65 Merge pull request #603 from spbisc97/patch-1
pip compliance for git+ installs
2023-11-28 01:24:35 +00:00
8a8eeb33ee Update README.md 2023-11-27 17:15:28 +00:00
b4d7b1a422 pip compliance for git+ installs
Minimal change to let pip install requirements
2023-11-26 18:37:04 +01:00
5a16e59217 Merge pull request #599 from MahmoudAshraf97/main
support for `large-v3`
2023-11-26 12:34:16 +00:00
b4e4143e3b install faster-whisper using git as pypi is not updated anymore 2023-11-25 17:42:36 +00:00
4b05198eed bump faster-whisper to 0.10 2023-11-25 12:11:08 +00:00
71a5281bde support for large-v3 2023-11-25 12:09:00 +00:00
d97cdb7bcf Merge pull request #586 from remic33/main 2023-11-17 10:48:57 +00:00
20161935a1 feat: pass model to 3.1 in code 2023-11-17 11:12:16 +01:00
1d7f8ccbf1 feat: get rid of pyannote versioning and go to 3.1 2023-11-17 11:03:23 +01:00
5756b0fb13 Update alignment.py 2023-11-17 05:21:23 +05:30
aaaa3de810 Update alignment.py 2023-11-17 05:18:19 +05:30
ba30365344 Merge pull request #584 from DougTrajano/patch-1
Move load_model after WhisperModel
2023-11-16 12:09:21 +00:00
bd3aa03b6f Move load_model after WhisperModel 2023-11-16 08:59:28 -03:00
f5c544ff90 Merge pull request #581 from davidmartinrius/catalan_align_model
Add align model for catalan language.
2023-11-16 10:54:24 +00:00
7c2a9a8b7b Merge pull request #580 from kaka1909/main
Update asr.py and make the model parameter be used
2023-11-16 10:54:02 +00:00
9f41c49fe5 Add align model for catalan language. 2023-11-16 11:43:36 +01:00
48d651e5ea Update asr.py and make the model parameter be used 2023-11-16 15:29:24 +08:00
4ece2369d7 Merge pull request #556 from sorgfresser/remove-space-segment-align
no align based on space
2023-11-11 02:03:56 +00:00
52fbe5c26f Merge pull request #570 from hidenori-endo/main
Drop ffmpeg-python dependency and call ffmpeg directly.
2023-11-09 18:39:53 +00:00
6703d2774b Drop ffmpeg-python dependency 2023-11-10 03:26:47 +09:00
a2af569838 Merge pull request #554 from sorgfresser/fix-binarize-unbound
fix unboundlocalerror
2023-11-07 10:54:24 +00:00
0c7f32f55c no align based on space 2023-11-03 19:47:00 +01:00
6936dd6991 default t 2023-11-03 18:50:15 +01:00
6b1100a919 Merge pull request #549 from amolinasalazar/minor_fixes
Minor fixes for word options and subtitles
2023-10-31 12:26:47 -07:00
d4a600b568 REMOVE duplicated code 2023-10-31 18:55:50 +01:00
afd5ef1d58 FIX warnings for word options 2023-10-31 18:55:35 +01:00
dbeb8617f2 Merge pull request #521 from kaihe-stori/update-readme
Add a special note about Speaker-Diarization-3.0 in readme
2023-10-25 11:18:47 -07:00
c6fe379d9e Merge pull request #517 from jkukul/support-language-names-as-parameters
Support language names in `--language` parameter.
2023-10-25 11:16:30 -07:00
e9a6385d3c Merge pull request #541 from justinwlin/main
Update setup.py to download pyannote depending on platform
2023-10-25 11:14:11 -07:00
b522133340 Update setup.py to be adaptive to platform 2023-10-24 18:42:14 -04:00
49e0130e4e Merge pull request #531 from accessful-ai/main 2023-10-17 06:54:22 -07:00
d4ac9531d9 Update setup.py 2023-10-17 15:23:38 +02:00
66808f6147 Merge pull request #529 from MahmoudAshraf97/main 2023-10-16 10:53:18 -07:00
b69956d725 . 2023-10-16 20:43:37 +03:00
a150df4310 Merge pull request #527 from jkukul/pass-beam-size-to-fast-whisper 2023-10-15 07:15:13 -07:00
02c0323777 fix 2023-10-15 16:25:15 +03:00
14a7cab8eb Pass patience and beam_size to faster-whisper. 2023-10-14 13:51:29 +02:00
acf31b754f update readme 2023-10-11 22:56:38 -04:00
4cdce3b927 Merge pull request #518 from characat0/main
fix(diarize): key error on empty track
2023-10-10 12:54:35 -07:00
a5356509b6 fix(diarize): key error on empty track 2023-10-10 14:50:41 -05:00
1001a055db Support language names in --language. 2023-10-10 13:55:47 +02:00
051047bb25 Merge pull request #510 from MahmoudAshraf97/main
fix minimum input length for torch wav2vec2 models
2023-10-05 15:31:08 -07:00
c1b821a08d fix list markdown 2023-10-05 15:14:29 -07:00
78e20a16a8 update links 2023-10-05 15:14:03 -07:00
be07c13f75 read does actually work... 2023-10-05 14:48:39 -07:00
8049dba2f7 fix minimum input length for torch wav2vec2 models 2023-10-06 00:41:23 +03:00
d077abdbdf Merge pull request #509 from valentt/patch-1
Update README.md
2023-10-05 14:13:20 -07:00
84423ca517 Update README.md
Added info that Hugging Face token has to be write token because read token doesn't work.
2023-10-05 19:14:28 +02:00
a22b8b009b Merge pull request #507 from compasspathways/fix/pass-vad-options
Fix: Allow vad options to be configurable by passing to FasterWhisperPipeline and merge_chunks.
2023-10-05 07:48:19 -07:00
79801167ac Fix: Allow vad options to be configurable by correctly passing down to FasterWhisperPipeline. 2023-10-05 10:06:34 -04:00
07fafa37b3 Merge pull request #494 from mvoggu/main
fix: ZeroDivisionError when --print_progress True
2023-09-27 07:46:06 -07:00
a0b6459c8b fix: ZeroDivisionError when --print_progress True 2023-09-27 20:10:43 +05:30
2a11ce3ef0 Merge pull request #487 from piuy11/main
Update alignment.py
2023-09-26 14:17:46 -07:00
18abcf46ee Merge pull request #492 from remic33/pyannote3
Pyannote3
2023-09-26 14:16:57 -07:00
652aa24919 change pyannote version 2023-09-26 23:04:28 +02:00
b17908473d correct 3.0 pyannote weights 2023-09-26 17:18:20 +02:00
f137f31de6 Update alignment.py 2023-09-25 15:33:06 +09:00
e94b904308 Merge pull request #474 from sorgfresser/pin-faster-whisper 2023-09-19 16:53:42 -07:00
ffd6167b26 Merge pull request #473 from sorgfresser/fix-faster-whisper-threads 2023-09-19 16:53:34 -07:00
4c7ce14fed pin faster whisper 2023-09-14 13:19:11 +02:00
0ae0d49d1d add faster whisper threading 2023-09-14 11:47:51 +02:00
b1a98b78c9 Merge pull request #472 from darwintree/main
chore(writer): improve text display(ja etc) in json file
2023-09-10 08:37:39 -06:00
c6d9e6cb67 chore(writer): improve text display(ja etc) in json file 2023-09-10 22:02:47 +08:00
31f5233949 Merge pull request #459 from awerks/main
A solution to long subitles and words without timestamps
2023-09-06 10:09:27 -06:00
2ca99ce909 A solution to long subitles
Example usage: 
subtitles_proccessor = SubtitlesProcessor(output["segments"], detected_language, max_line_length = 50, min_char_length_splitter = 35)
subtitles_proccessor.save("subtitles.srt", advanced_splitting = True)
2023-09-04 21:49:34 +02:00
15d9e08d3e Merge pull request #458 from remic33/correct_default_asr_options
fix: correct defaut_asr_options with new options (patch 0.8)
2023-09-04 09:22:16 -06:00
15451d0f1c fix: correct defaut_asr_options with new options (patch 0.8) 2023-09-04 17:08:19 +02:00
8c4a21b66d Merge pull request #440 from jim60105/main
chore(writer): Join words without spaces for ja, zh
2023-08-29 11:22:30 -06:00
5223de2a41 fix: UnboundLocalError: local variable 'align_language' referenced before assignment 2023-08-30 01:11:09 +08:00
f505702dc7 chore(writer): Join words without spaces for ja, zh
fix #248, fix #310
2023-08-30 01:11:09 +08:00
adf455a97c Merge pull request #445 from jim60105/add-merge-chunk-size-as-argument
feat: Add merge chunks chunk_size as arguments.
2023-08-29 10:05:14 -06:00
9647f60fca Merge branch 'main' into add-merge-chunk-size-as-argument 2023-08-29 10:05:05 -06:00
a8bfac6bef Merge pull request #427 from awerks/main
Update alignment.py
2023-08-29 10:03:46 -06:00
6d414e20e2 Merge pull request #438 from invisprints/fix-speaker-missing
fix missing speaker prefix
2023-08-29 10:03:06 -06:00
3c7b03935b Merge pull request #430 from dotgrid/dotgrid-docs-patch
Document --compute_type command line option
2023-08-29 10:02:51 -06:00
eb771cf56d feat: Add merge chunks chunk_size as arguments.
Suggest from https://github.com/m-bain/whisperX/issues/200#issuecomment-1666507780
2023-08-29 23:09:02 +08:00
cc81ab7db7 fix missing prefix
Fixed missing the speaker part when enable --highlight_words
2023-08-25 12:08:16 +08:00
ef965a03ed Merge pull request #431 from CaRniFeXeR/main
adds link to whisperX medium on replicate.com
2023-08-21 17:25:15 +01:00
6f2ff16aad Merge pull request #1 from CaRniFeXeR/CaRniFeXeR-replicate-models
adds link to whisperX medium on replicate and updates replicate bades…
2023-08-21 08:20:25 +08:00
81b12af321 adds link to whisperX medium on replicate and updates replicate bades in README.md 2023-08-21 08:16:46 +08:00
c1197c490e Document --compute_type command line option 2023-08-19 08:19:49 +01:00
4e28492dbd Update alignment.py 2023-08-17 14:57:53 +02:00
6cb7267dc2 Update alignment.py 2023-08-17 14:56:54 +02:00
abbb66b58e Update alignment.py 2023-08-17 14:53:53 +02:00
ea7bb91a56 Update asr.py 2023-08-17 14:49:57 +02:00
d2d840f06c Update utils.py 2023-08-17 14:45:23 +02:00
0a1137e41c Merge pull request #429 from sorgfresser/no-segments-writer
fix writer fail on segments 0
2023-08-17 13:20:38 +01:00
0767597bff fix writer fail on segments 0 2023-08-17 14:18:16 +02:00
cb3ed4ab9d Update transcribe.py 2023-08-16 16:22:29 +02:00
65688208c9 Update alignment.py 2023-08-16 16:18:00 +02:00
72685d0398 Update asr.py 2023-08-16 16:15:24 +02:00
1bb4839b0f Update alignment.py 2023-08-16 16:13:28 +02:00
4acb5b3abc Update asr.py 2023-08-16 16:11:46 +02:00
14e593f60b Update alignment.py 2023-08-16 16:08:25 +02:00
66da4b3eb7 Merge pull request #418 from Ayushi-Desynova/main-1
Update alignment.py
2023-08-10 12:14:08 +01:00
18d5fdc995 Add telugu language to alignment.py 2023-08-10 12:13:52 +01:00
423667f00b Update alignment.py 2023-08-09 17:08:56 +05:30
1b092de19a Merge pull request #395 from Joemgu7/main
Fix repeat transcription on different languages and proper suppress_numerals use
2023-08-02 13:44:27 +01:00
69a52b00c7 Merge pull request #400 from davidas1/fast-diarize
make diarization faster
2023-08-02 13:43:20 +01:00
9e3145cead more 2023-08-02 10:36:56 +03:00
577db33430 more 2023-08-02 10:35:20 +03:00
da6ed83dc9 more 2023-08-02 10:34:42 +03:00
7eb9692cb9 more 2023-08-02 10:32:02 +03:00
8de0e2af51 make diarization faster 2023-08-02 10:11:43 +03:00
225f6b4d69 fix suppress_numerals 2023-07-29 19:34:51 +02:00
864976af23 fix issue by resetting tokenizer 2023-07-29 18:56:33 +02:00
9d736dca1c add some warning if languages do not match 2023-07-29 18:20:59 +02:00
d87f6268d0 fix preset language 2023-07-29 18:13:36 +02:00
d80b98601b Merge pull request #255 from tijszwinkels/cuda-11.8
Suggest using pytorch-cuda 11.8 instead of 11.7
2023-07-25 00:29:08 +01:00
aa37509362 Merge branch 'main' into cuda-11.8 2023-07-25 00:28:53 +01:00
15b4c558c2 Merge pull request #352 from daanelson/replicate-demo
adding link to Replicate demo
2023-07-24 10:48:24 +01:00
54504a2be8 Merge pull request #374 from abCods/main
Add Urdu model support for alignment
2023-07-24 10:47:52 +01:00
8c0fee90d3 Update alignment.py 2023-07-24 10:47:41 +01:00
016f0293cd Merge pull request #378 from baer/patch-1
Remove torchvision from README
2023-07-24 10:47:14 +01:00
44daf50501 Merge pull request #382 from mabergerx/patch-1
Update transcribe.py -> small change in `batch_size` description
2023-07-24 10:46:55 +01:00
48e7caad77 Update transcribe.py -> small change in batch_size description
Changed the description of the `batch_size` parameter.
2023-07-24 11:45:38 +02:00
8673064658 Remove torchvision from README 2023-07-20 17:02:34 -07:00
e6ecbaa68f Remove spacing 2023-07-20 03:20:47 +05:00
e92325b7eb Remove the fix 2023-07-20 03:19:37 +05:00
eb712f3999 Rectify refernce to the word 2023-07-20 02:54:06 +05:00
30eff5a01f Replace double quotes to single for JSON parsing 2023-07-20 02:32:37 +05:00
734ecc2844 Add Urdu model support for alignment 2023-07-17 19:29:41 +05:00
512ab1acf9 adding Replicate demo 2023-06-30 18:22:10 -07:00
befe2b242e torch 2+ 2023-06-07 22:43:29 +01:00
f9c5ff9f08 Merge pull request #309 from Ca-ressemble-a-du-fake/patch-1
Add Audacity export
2023-06-07 11:50:05 +01:00
d39c1b2319 add "aud" to output_format 2023-06-07 11:48:49 +01:00
b13778fefd make aud optional 2023-06-07 11:47:49 +01:00
076ff96eb2 Add Audacity export
This exports the transcript to a text file that can be directly imported in Audacity as label file. This is useful to quickly check the transcript-audio alignment.
2023-06-07 05:49:49 +02:00
0c84c26d92 Merge pull request #303 from m-bain/v3
Suppress numerals
2023-06-05 15:46:26 +01:00
d7f1d16f19 suppress numerals change logic 2023-06-05 15:44:17 +01:00
74a00eecd7 suppress numerals fix 2023-06-05 15:33:04 +01:00
b026407fd9 Merge branch 'v3' of https://github.com/m-bain/whisperX into v3
Conflicts:
	whisperx/asr.py
2023-06-05 15:30:02 +01:00
a323cff654 --suppress_numerals option, ensures non-numerical words, for wav2vec2 alignment 2023-06-05 15:27:42 +01:00
93ed6cfa93 interspeech 2023-06-01 16:54:16 +01:00
9797a67391 Merge pull request #294 from SohaibAnwaar/fix/typehint-bug-fix
fix: Bug  in type  hinting
2023-05-30 11:13:22 +01:00
5a4382ae4d fix: Bug in type hinting 2023-05-30 15:11:07 +05:00
ec6a110cdf Merge pull request #290 from m-bain/main
push contributions from main
2023-05-29 12:55:24 +01:00
8d8c027a92 Merge pull request #278 from Mr-Turtleeeee/add_align_for_vi
Add war2vec model for Vietnamese
2023-05-29 12:54:37 +01:00
4cbd3030cc no sentence split on mr. mrs. dr... 2023-05-29 12:48:14 +01:00
1c528d1a3c Merge pull request #284 from prameshbajra/main 2023-05-27 11:19:13 +01:00
c65e7ba9b4 Merge pull request #280 from Thebys/patch-1 2023-05-27 11:18:27 +01:00
5a47f458ac Added download path parameter. 2023-05-27 11:38:54 +02:00
f1032bb40a VAD unequal stack size, remove debug change 2023-05-26 20:39:19 +01:00
bc8a03881a Merge pull request #281 from m-bain/v3
fix Unequal Stack Size VAD error
2023-05-26 20:37:57 +01:00
42b4909bc0 fix Unequal Stack Size VAD error 2023-05-26 20:36:03 +01:00
bb15d6b68e Add Czech alignment model
This PR adds the following Czech alignment model: https://huggingface.co/comodoro/wav2vec2-xls-r-300m-cs-250.

I have successfully tested this with several Czech audio recordings with length of up to 3 hours, and the results are satisfactory.

However, I have received the following warnings and I am not sure how relevant it is:
```
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file C:\Users\Thebys\.cache\torch\whisperx-vad-segmentation.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
```
2023-05-26 21:17:01 +02:00
23d405e1cf Merge branch 'main' into add_align_for_vi 2023-05-26 17:14:09 +01:00
17e2f7f859 Merge pull request #277 from Boulaouaney/add-Korean-alignment-model
added Korean wav2vec2 model
2023-05-26 17:12:47 +01:00
1d9d630fb9 added Korean wav2vec2 model 2023-05-26 20:33:16 +09:00
9c042c2d28 Add war2vec model for Vietnamese 2023-05-26 16:46:55 +07:00
a23f2aa3f7 Merge pull request #269 from sorgfresser/transcribe_keywords
Add transcribe keywords
2023-05-21 12:08:44 +01:00
7c5468116f Merge branch 'm-bain:main' into transcribe_keywords 2023-05-20 16:03:40 +02:00
a1c705b3a7 fix tokenizer is None 2023-05-20 15:52:45 +02:00
29a5e0b236 Merge pull request #266 from sorgfresser/main
Add device_index option
2023-05-20 14:45:34 +01:00
715435db42 add tokenizer is None case 2023-05-20 15:42:21 +02:00
1fc965bc1a add task, language keyword to transcribe 2023-05-20 15:30:25 +02:00
74b98ebfaa ensure device_index not None 2023-05-20 13:11:30 +02:00
53396adb21 add device_index 2023-05-20 13:02:46 +02:00
63fb5fc46f Suggest using pytorch-cuda 11.8 instead of 11.7
This prevents CuFFT errors on newer cards such as the RTX 4090 and RTX 6000 Ada.

fixes #254
2023-05-16 12:07:09 +02:00
d8a2b4ffc9 Merge pull request #246 from m-bain/v3
V3
2023-05-13 12:18:09 +01:00
9ffb7e7a23 Merge branch 'v3' of https://github.com/m-bain/whisperX into v3
Conflicts:
	setup.py
2023-05-13 12:16:33 +01:00
fd8f1003cf add translate, fix word_timestamp error 2023-05-13 12:14:06 +01:00
46b416296f Merge pull request #123 from koldbrandt/danish_alignment
Danish alignment model
2023-05-09 23:10:24 +01:00
7642390d0a Merge branch 'main' into danish_alignment 2023-05-09 23:10:13 +01:00
8b05ad4dae Merge pull request #235 from sorgfresser/main
Add custom typing for results
2023-05-09 23:05:02 +01:00
5421f1d7ca remove v3 tag on pip install 2023-05-09 13:42:50 +01:00
91e959ec4f Merge branch 'm-bain:main' into main 2023-05-08 20:46:25 +02:00
eabf35dff0 Custom result types 2023-05-08 20:45:34 +02:00
4919ad21fc Merge pull request #233 from sorgfresser/main
Fix tuple unpacking
2023-05-08 19:05:47 +01:00
b50aafb17b Fix tuple unpacking 2023-05-08 20:03:42 +02:00
2efa136114 update python usage example 2023-05-08 17:20:38 +01:00
0b839f3f01 Update README.md 2023-05-07 20:36:08 +01:00
1caddfb564 Merge pull request #225 from m-bain/v3
V3
2023-05-07 20:31:16 +01:00
7ad554c64f Merge branch 'main' into v3 2023-05-07 20:30:57 +01:00
4603f010a5 update readme, setup, add option to return char_timestamps 2023-05-07 20:28:33 +01:00
24008aa1ed fix long segments, break into sentences using nltk, improve align logic, improve diarize (sentence-based) 2023-05-07 15:32:58 +01:00
07361ba1d7 add device to dia pipeline @sorgfresser 2023-05-05 11:53:51 +01:00
4e2ac4e4e9 torch2.0, remove compile for now, round to times to 3 decimal 2023-05-04 20:38:13 +01:00
d2116b98ca Merge pull request #210 from sorgfresser/v3
Update pyannote and torch version
2023-05-04 20:32:06 +01:00
d8f0ef4a19 Set diarization device manually 2023-05-04 16:25:34 +02:00
1b62c61c71 Merge pull request #216 from aramlang/blank_id-fix
Enable Hebrew support
2023-05-04 01:13:23 +01:00
2d59eb9726 Add torch compile to log mel spectrogram 2023-05-03 23:17:44 +02:00
cb53661070 Enable Hebrew support 2023-05-03 11:26:12 -05:00
2a6830492c Fix pyannote to specific commit 2023-05-02 20:25:56 +02:00
da3aabe181 Merge branch 'm-bain:v3' into v3 2023-05-02 18:55:43 +02:00
067189248f Use pyannote develop branch and torch version 2 2023-05-02 18:44:43 +02:00
b666523004 add v3 pre-release comment, and v4 progress update 2023-05-02 15:10:40 +01:00
69e038cbc4 Merge pull request #209 from SohaibAnwaar/feat-dockerfile
feat: adding the docker file
2023-05-02 14:55:30 +01:00
9fb51412c0 Merge pull request #208 from arnavmehta7/patch-1 2023-05-02 10:55:13 +01:00
a693a779fa feat: adding the docker file 2023-05-02 13:28:20 +05:00
64ca208cc8 Fixed the word_start variable not initialized bug. 2023-05-02 13:13:02 +05:30
5becc99e56 Version bump pyannote, pytorch 2023-05-01 13:47:41 +02:00
e24ca9e0a2 Merge pull request #205 from prashanthellina/v3-fix-diarization 2023-04-30 21:08:45 +01:00
601c91140f references #202, attempt to fix speaker diarization failing in v3 2023-04-30 17:33:24 +00:00
31a9ec7466 Merge pull request #204 from sorgfresser/v3 2023-04-30 18:29:46 +01:00
b9c8c5072b Pad language detection if audio is too short 2023-04-30 18:34:18 +02:00
a903e57cf1 Merge pull request #199 from thomasmol/v3 2023-04-29 23:35:42 +01:00
cb176a186e added num_workers to fix pickling error 2023-04-29 19:51:05 +02:00
5b85c5433f Update setup.py 2023-04-28 16:47:04 +01:00
cc7e168d2b add checkout command 2023-04-25 12:14:23 +01:00
db97f29678 update pip install 2023-04-25 11:19:23 +01:00
25be8210e5 add v3 tag for install 2023-04-25 10:07:34 +01:00
0efad26066 pass compute_type 2023-04-24 21:26:44 +01:00
2a29f0ec6a add compute types 2023-04-24 21:24:22 +01:00
558d980535 v3 init 2023-04-24 21:08:43 +01:00
da458863d7 allow custom model_dir for torchaudio models 2023-04-14 21:40:36 +01:00
cf252a8592 allow custom path for vad model 2023-04-14 15:02:58 +01:00
6a72b61564 clamp end_timestamp to prevent infinite loop 2023-04-11 20:15:37 +01:00
48ed89834e Merge pull request #169 from invisprints/v2-opt-load-model
Optimize the inference process and reduce the memory usage
2023-04-09 13:39:13 +01:00
bb15c9428f opti the inference loop 2023-04-09 15:58:55 +08:00
9482d324d0 Merge pull request #162 from dev-nomi/cli_argument_type
Added vad_filter type
2023-04-05 13:40:04 -07:00
4146e56d5b Added vad_filter type 2023-04-05 17:11:29 +05:00
118e7deedb Merge pull request #161 from diasks2/fix_typo
Fix typo in utils.py
2023-04-04 19:00:18 -07:00
70a4a0a25c Fix typo 2023-04-05 10:50:49 +09:00
40948a3d00 fix whisper version to 20230314 for no breaking 2023-04-04 12:42:34 -07:00
c8be6ac94d update python example 2023-04-03 12:18:31 -07:00
a582a59493 mkdir for torch cache in case it doesnt exist 2023-04-01 13:05:40 -07:00
861379edc3 Merge pull request #157 from Ryan5453/fix/whisper-req
Fix Requirements
2023-03-31 16:40:19 -07:00
4af345434a Update requirements.txt 2023-03-31 19:36:38 -04:00
634799b3be hf token only for diarization 2023-03-31 16:15:40 -07:00
189aeac83e v2 lets goo 2023-04-01 00:10:45 +01:00
bc2776017e v2 lets go 2023-04-01 00:09:29 +01:00
11a78d7ced handle tmp wav file better 2023-04-01 00:06:40 +01:00
b9ca701d69 .wav conversion, handle audio with no detected speech 2023-03-31 23:02:38 +01:00
d0fa028045 fix tfile naming 2023-03-30 19:24:42 +01:00
ae4a9de307 add vad model external dl 2023-03-30 18:57:55 +01:00
18b63d46e2 skeleton v2 2023-03-30 05:31:57 +01:00
1e7c2c337b Merge pull request #148 from FernanOrtega/main
Update decoding.py
2023-03-24 07:57:43 -07:00
33dd3b9bcd Update decoding.py
Changes from https://github.com/openai/whisper/pull/914/
2023-03-24 11:56:41 +01:00
d1b4ff8228 Merge pull request #114 from mshakirDr/patch-1
Fix hugging face error
2023-03-23 15:12:09 -07:00
d31f6e0b8a Merge branch 'm-bain:main' into danish_alignment 2023-03-06 10:52:47 +01:00
809700e286 remove soundfile version constraint 2023-03-06 00:20:31 +00:00
cea42ca470 Fix hugging face error
Model should be loaded with an id to avoid this error:
huggingface_hub.utils._validators.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: 'pyannote\segmentation'.
2023-03-04 19:12:13 +01:00
c8404d9805 added a danish alignment model 2023-03-04 13:20:40 +01:00
d1d420e70c Merge pull request #111 from Barabazs/patch-1
fix: force soundfile version update for mp3 support
2023-03-04 11:46:57 +00:00
844eb30710 fix: force soundfile version update for mp3 support 2023-03-04 11:01:26 +01:00
31e6fe7e36 Merge pull request #107 from JCGoran/fix/python3.7_compatibility
Added Python 3.7 compatibility
2023-03-02 15:31:36 +00:00
cfcede41f6 Added Python 3.7 compatibility
- removed use of walrus operator in favor of `np.cumsum`
2023-03-02 15:46:07 +01:00
186b06e032 paper drop 2023-03-02 12:04:16 +00:00
847a3cd85b Merge pull request #96 from smly/fix-batch-processing
FIX: Assertion error in batch processing
2023-02-22 12:11:01 +00:00
2b1ffa12b8 Merge pull request #97 from smly/gpu-vad-filter
GPU acceleration when using VAD filters
2023-02-21 18:57:14 +00:00
57f5957e0e Pass device to pyannote.audio.Inference 2023-02-22 03:48:20 +09:00
27fe502344 Fix assertion error in batch processing 2023-02-22 02:45:13 +09:00
f7093e60d3 Merge pull request #90 from Pikauba/translation_starting_point_improvement
Improvement to transcription starting point with VAD
2023-02-18 21:59:57 +00:00
a1d2229416 Improvement to transcription starting point with VAD 2023-02-18 11:12:23 -05:00
4cb167a225 Merge pull request #74 from Camb-ai/level-bug-fix
added if clause for checking 'level-1'
2023-02-14 19:22:22 +00:00
2e307814dd added if clause for checking 2023-02-10 14:48:51 +05:30
d687cf3358 Merge pull request #58 from MahmoudAshraf97/main
added turkish wav2vec2 model
2023-02-01 22:11:51 +00:00
0a3fd11562 update readme 2023-02-01 22:09:11 +00:00
29e95b746b Merge pull request #57 from TengdaHan/main
support batch processing
2023-02-01 20:37:54 +00:00
039af89a86 support batch processing 2023-02-01 19:41:20 +00:00
9f26112d5c added turkish wav2vec2 model 2023-02-01 21:38:50 +02:00
fd2a093754 Merge pull request #55 from jonatasgrosman/main
FIX: Error when loading Hugging Face's models with embedded LM
2023-02-01 10:27:45 +00:00
31f069752f Merge pull request #53 from MahmoudAshraf97/main
Add more languages to models list
2023-02-01 10:27:25 +00:00
4cdf7ef856 Merge pull request #48 from Barabazs/main
doc: format checklist
2023-02-01 10:26:58 +00:00
d294e29ad9 fix: error when loading huggingface model with embedded language model 2023-01-31 23:24:26 -03:00
0eae9e1f50 added several wav2vec2 models by jonatasgrosman
since his models were used in other languages before and I tested the arabic model myself, I assumed it's safe to include all the available models
2023-02-01 03:02:10 +02:00
1b08661e42 change arabic model to jonatasgrosman 2023-01-31 19:32:31 +02:00
a49799294b add arabic wav2vec2 model form elgeish 2023-01-31 19:07:48 +02:00
d83c74a79f doc: format checklist 2023-01-29 16:07:58 +01:00
acaefa09a1 Merge pull request #46 from Barabazs/main
Add sponsor link to sidebar
2023-01-28 19:05:36 +00:00
76f79f600a fix short seg timestamps bug 2023-01-28 19:04:19 +00:00
33073f9bba Create FUNDING.yml 2023-01-28 19:43:27 +01:00
50f3965fdb fix tsv file ext 2023-01-28 17:39:07 +00:00
df2b1b70cb increase vad cut default 2023-01-28 14:49:53 +00:00
c19cf407d8 handle non-alignable whole segments 2023-01-28 13:53:03 +00:00
8081ef2dcd add custom vad binarization for vad cut 2023-01-28 00:22:33 +00:00
c6dbac76c8 cut up vad segments when too long to prevent OOM 2023-01-28 00:01:39 +00:00
69673eb39b buy-me-a-coffee 2023-01-27 15:12:49 +00:00
5b8c8a7bd3 pandas fix 2023-01-27 15:05:08 +00:00
7f2159a953 Merge branch 'main' of https://github.com/m-bain/whisperX into main 2023-01-26 10:46:36 +00:00
16d24b1c96 only pad timestamps if not using VAD 2023-01-26 10:46:13 +00:00
d20a2a4ea2 typo in --diarize flag 2023-01-26 10:28:54 +00:00
312f1cc50c Merge pull request #40 from MahmoudAshraf97/main
Added arguments and instructions to enable the usage VAD and Diarization
2023-01-26 00:34:03 +00:00
99b6e79fbf Update README.md
added additional instructions to use PyAnnote modules
2023-01-26 00:56:10 +02:00
e7773358a3 Update transcribe.py
added the ability to include HF access token in order to use PyAnnote models
2023-01-26 00:42:35 +02:00
6b2aa4ff3e Merge pull request #1 from MahmoudAshraf97/patch-1
Update README.md
2023-01-26 00:37:38 +02:00
c3de5e9580 Update README.md
fixed model name
2023-01-26 00:36:29 +02:00
58d7191949 add diarize 2023-01-25 19:40:41 +00:00
286a2f2c14 clean up logic, use pandas where possibl 2023-01-25 18:42:52 +00:00
72 changed files with 5927 additions and 107372 deletions

1
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1 @@
custom: https://www.buymeacoffee.com/maxhbain

34
.github/workflows/build-and-release.yml vendored Normal file
View File

@ -0,0 +1,34 @@
name: Build and release
on:
release:
types: [published]
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "0.5.14"
python-version: "3.9"
- name: Check if lockfile is up to date
run: uv lock --check
- name: Build package
run: uv build
- name: Release to Github
uses: softprops/action-gh-release@v2
with:
files: dist/*.whl
- name: Publish package to PyPi
run: uv publish
env:
UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}

View File

@ -0,0 +1,34 @@
name: Python Compatibility Test
on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch: # Allows manual triggering from GitHub UI
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "0.5.14"
python-version: ${{ matrix.python-version }}
- name: Check if lockfile is up to date
run: uv lock --check
- name: Install the project
run: uv sync --all-extras
- name: Test import
run: |
uv run python -c "import whisperx; print('Successfully imported whisperx')"

173
.gitignore vendored
View File

@ -1,2 +1,171 @@
whisperx.egg-info/
**/__pycache__/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# PyPI configuration file
.pypirc

View File

@ -2,7 +2,7 @@
## Other Languages
For non-english ASR, it is best to use the `large` whisper model. Alignment models are automatically picked by the chosen language from the default [lists](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22).
For non-english ASR, it is best to use the `large` whisper model. Alignment models are automatically picked by the chosen language from the default [lists](https://github.com/m-bain/whisperX/blob/main/whisperx/alignment.py#L18).
Currently support default models tested for {en, fr, de, es, it, ja, zh, nl}

39
LICENSE
View File

@ -1,27 +1,24 @@
Copyright (c) 2022, Max Bain
All rights reserved.
BSD 2-Clause License
Copyright (c) 2024, Max Bain
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. All advertising materials mentioning features or use of this software
must display the following acknowledgement:
This product includes software developed by Max Bain.
4. Neither the name of Max Bain nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,4 +1,3 @@
include whisperx/assets/*
include whisperx/assets/gpt2/*
include whisperx/assets/multilingual/*
include whisperx/normalizers/english.json
include LICENSE
include requirements.txt

316
README.md
View File

@ -13,81 +13,125 @@
<img src="https://img.shields.io/github/license/m-bain/whisperX.svg"
alt="GitHub license">
</a>
<a href="https://arxiv.org/abs/2303.00747">
<img src="http://img.shields.io/badge/Arxiv-2303.00747-B31B1B.svg"
alt="ArXiv paper">
</a>
<a href="https://twitter.com/intent/tweet?text=&url=https%3A%2F%2Fgithub.com%2Fm-bain%2FwhisperX">
<img src="https://img.shields.io/twitter/url/https/github.com/m-bain/whisperX.svg?style=social" alt="Twitter">
</a>
</p>
<p align="center">
<a href="#what-is-it">What is it</a>
<a href="#setup">Setup</a>
<a href="#example">Usage</a>
<a href="#other-languages">Multilingual</a>
<a href="#contribute">Contribute</a>
<a href="EXAMPLES.md">More examples</a>
</p>
<img width="1216" align="center" alt="whisperx-arch" src="https://raw.githubusercontent.com/m-bain/whisperX/refs/heads/main/figures/pipeline.png">
<h6 align="center">Made by Max Bain • :globe_with_meridians: <a href="https://www.maxbain.com">https://www.maxbain.com</a></h6>
<!-- <p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy + quality via forced phoneme alignment and voice-activity based batching for fast inference.</p> -->
<img width="1216" align="center" alt="whisperx-arch" src="https://user-images.githubusercontent.com/36994049/211200186-8b779e26-0bfd-4127-aee2-5a9238b95e1f.png">
<!-- <h2 align="left", id="what-is-it">What is it 🔎</h2> -->
This repository provides fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.
<p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy using forced alignment.
- ⚡️ Batched inference for 70x realtime transcription using whisper large-v2
- 🪶 [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend, requires <8GB gpu memory for large-v2 with beam_size=5
- 🎯 Accurate word-level timestamps using wav2vec2 alignment
- 👯 Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (speaker ID labels)
- 🗣 VAD preprocessing, reduces hallucination & batching with no WER degradation
</p>
<h2 align="left", id="what-is-it">What is it 🔎</h2>
This repository refines the timestamps of openAI's Whisper model via forced aligment with phoneme-based ASR models (e.g. wav2vec2.0), multilingual use-case.
**Whisper** is an ASR model [developed by OpenAI](https://github.com/openai/whisper), trained on a large dataset of diverse audio. Whilst it does produces highly accurate transcriptions, the corresponding timestamps are at the utterance-level, not per word, and can be inaccurate by several seconds.
**Whisper** is an ASR model [developed by OpenAI](https://github.com/openai/whisper), trained on a large dataset of diverse audio. Whilst it does produces highly accurate transcriptions, the corresponding timestamps are at the utterance-level, not per word, and can be inaccurate by several seconds. OpenAI's whisper does not natively support batching.
**Phoneme-Based ASR** A suite of models finetuned to recognise the smallest unit of speech distinguishing one word from another, e.g. the element p in "tap". A popular example model is [wav2vec2.0](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
**Forced Alignment** refers to the process by which orthographic transcriptions are aligned to audio recordings to automatically generate phone level segmentation.
**Voice Activity Detection (VAD)** is the detection of the presence or absence of human speech.
**Speaker Diarization** is the process of partitioning an audio stream containing human speech into homogeneous segments according to the identity of each speaker.
<h2 align="left", id="highlights">New🚨</h2>
- VAD filtering: Voice Activity Detection (VAD) from [Pyannote.audio](https://huggingface.co/pyannote/voice-activity-detection) is used as a preprocessing step to remove reliance on whisper timestamps and only transcribe audio segments containing speech. add `--vad_filter` flag, increases timestamp accuracy and robustness (requires more GPU mem due to 30s inputs in wav2vec2)
- Character level timestamps (see `*.char.ass` file output)
- Diarization (still in beta, add `--diarization`)
- 1st place at [Ego4d transcription challenge](https://eval.ai/web/challenges/challenge-page/1637/leaderboard/3931/WER) 🏆
- _WhisperX_ accepted at INTERSPEECH 2023
- v3 transcript segment-per-sentence: using nltk sent_tokenize for better subtitlting & better diarization
- v3 released, 70x speed-up open-sourced. Using batched whisper with [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend!
- v2 released, code cleanup, imports whisper library VAD filtering is now turned on by default, as in the paper.
- Paper drop🎓👨🏫! Please see our [ArxiV preprint](https://arxiv.org/abs/2303.00747) for benchmarking and details of WhisperX. We also introduce more efficient batch inference resulting in large-v2 with \*60-70x REAL TIME speed.
<h2 align="left" id="setup">Setup ⚙️</h2>
Install this package using
`pip install git+https://github.com/m-bain/whisperx.git`
### 1. Simple Installation (Recommended)
If already installed, update package to most recent commit
The easiest way to install WhisperX is through PyPi:
`pip install git+https://github.com/m-bain/whisperx.git --upgrade`
If wishing to modify this package, clone and install in editable mode:
```
$ git clone https://github.com/m-bain/whisperX.git
$ cd whisperX
$ pip install -e .
```bash
pip install whisperx
```
Or if using [uvx](https://docs.astral.sh/uv/guides/tools/#running-tools):
```bash
uvx whisperx
```
### 2. Advanced Installation Options
These installation methods are for developers or users with specific needs. If you're not sure, stick with the simple installation above.
#### Option A: Install from GitHub
To install directly from the GitHub repository:
```bash
uvx git+https://github.com/m-bain/whisperX.git
```
#### Option B: Developer Installation
If you want to modify the code or contribute to the project:
```bash
git clone https://github.com/m-bain/whisperX.git
cd whisperX
uv sync --all-extras --dev
```
> **Note**: The development version may contain experimental features and bugs. Use the stable PyPI release for production environments.
You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
### Common Issues & Troubleshooting 🔧
#### libcudnn Dependencies (GPU Users)
If you're using WhisperX with GPU support and encounter errors like:
- `Could not load library libcudnn_ops_infer.so.8`
- `Unable to load any of {libcudnn_cnn.so.9.1.0, libcudnn_cnn.so.9.1, libcudnn_cnn.so.9, libcudnn_cnn.so}`
- `libcudnn_ops_infer.so.8: cannot open shared object file: No such file or directory`
This means your system is missing the CUDA Deep Neural Network library (cuDNN). This library is needed for GPU acceleration but isn't always installed by default.
**Install cuDNN (example for apt based systems):**
```bash
sudo apt update
sudo apt install libcudnn8 libcudnn8-dev -y
```
### Speaker Diarization
To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)
> **Note**<br>
> As of Oct 11, 2023, there is a known issue regarding slow performance with pyannote/Speaker-Diarization-3.0 in whisperX. It is due to dependency conflicts between faster-whisper and pyannote-audio 3.0.0. Please see [this issue](https://github.com/m-bain/whisperX/issues/499) for more details and potential workarounds.
<h2 align="left" id="example">Usage 💬 (command line)</h2>
### English
Run whisper on example segment (using default params)
Run whisper on example segment (using default params, whisper small) add `--highlight_words True` to visualise word timings in the .srt file.
whisperx examples/sample01.wav
whisperx path/to/audio.wav
For increased timestamp accuracy, at the cost of higher gpu mem, use bigger models and VAD filtering e.g.
whisperx examples/sample01.wav --model large.en --vad_filter --align_model WAV2VEC2_ASR_LARGE_LV60K_960H
Result using *WhisperX* with forced alignment to wav2vec2.0 large:
Result using _WhisperX_ with forced alignment to wav2vec2.0 large:
https://user-images.githubusercontent.com/36994049/208253969-7e35fe2a-7541-434a-ae91-8e919540555d.mp4
@ -95,131 +139,179 @@ Compare this to original whisper out the box, where many transcriptions are out
https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov
For increased timestamp accuracy, at the cost of higher gpu mem, use bigger models (bigger alignment model not found to be that helpful, see paper) e.g.
whisperx path/to/audio.wav --model large-v2 --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --batch_size 4
To label the transcript with speaker ID's (set number of speakers if known e.g. `--min_speakers 2` `--max_speakers 2`):
whisperx path/to/audio.wav --model large-v2 --diarize --highlight_words True
To run on CPU instead of GPU (and for running on Mac OS X):
whisperx path/to/audio.wav --compute_type int8
### Other languages
The phoneme ASR alignment model is *language-specific*, for tested languages these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22).
The phoneme ASR alignment model is _language-specific_, for tested languages these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/f2da2f858e99e4211fe4f64b5f2938b007827e17/whisperx/alignment.py#L24-L58).
Just pass in the `--language` code, and use the whisper `--model large`.
Currently default models provided for `{en, fr, de, es, it, ja, zh, nl, uk, pt}`. If the detected language is not in this list, you need to find a phoneme-based ASR model from [huggingface model hub](https://huggingface.co/models) and test it on your data.
Currently default models provided for `{en, fr, de, es, it}` via torchaudio pipelines and many other languages via Hugging Face. Please find the list of currently supported languages under `DEFAULT_ALIGN_MODELS_HF` on [alignment.py](https://github.com/m-bain/whisperX/blob/main/whisperx/alignment.py). If the detected language is not in this list, you need to find a phoneme-based ASR model from [huggingface model hub](https://huggingface.co/models) and test it on your data.
#### E.g. German
whisperx --model large --language de examples/sample_de_01.wav
whisperx --model large-v2 --language de path/to/audio.wav
https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov
See more examples in other languages [here](EXAMPLES.md).
## Python usage 🐍
## Python usage 🐍
```python
import whisperx
import gc
import torch  # needed for torch.cuda.empty_cache() in the cleanup steps below

device = "cuda"
audio_file = "audio.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs
```
## Demos 🚀
<h2 align="left" id="whisper-mod">Whisper Modifications</h2>
[![Replicate (large-v3)](https://img.shields.io/static/v1?label=Replicate+WhisperX+large-v3&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/victor-upmeet/whisperx)
[![Replicate (large-v2)](https://img.shields.io/static/v1?label=Replicate+WhisperX+large-v2&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/daanelson/whisperx)
[![Replicate (medium)](https://img.shields.io/static/v1?label=Replicate+WhisperX+medium&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/carnifexer/whisperx)
In addition to forced alignment, the following two modifications have been made to the whisper transcription method:
If you don't have access to your own GPUs, use the links above to try out WhisperX.
1. `--condition_on_prev_text` is set to `False` by default (reduces hallucination)
<h2 align="left" id="whisper-mod">Technical Details 👷‍♂️</h2>
2. Clamping segment `end_time` to be at least 0.02s (one time precision) later than `start_time` (prevents segments with negative duration)
For specific details on the batching and alignment, the effect of VAD, as well as the chosen alignment model, see the preprint [paper](https://www.robots.ox.ac.uk/~vgg/publications/2023/Bain23/bain23.pdf).
To reduce GPU memory requirements, try any of the following (2. & 3. can affect quality):
1. reduce batch size, e.g. `--batch_size 4`
2. use a smaller ASR model `--model base`
3. Use lighter compute type `--compute_type int8`
Transcription differences from openai's whisper:
1. Transcription without timestamps. To enable single pass batching, whisper inference is performed `--without_timestamps True`, this ensures 1 forward pass per sample in the batch. However, this can cause discrepancies with the default whisper output.
2. VAD-based segment transcription, unlike the buffered transcription of openai's. In the WhisperX paper we show this reduces WER, and enables accurate batched inference
3. `--condition_on_prev_text` is set to `False` by default (reduces hallucination)
<h2 align="left" id="limitations">Limitations ⚠️</h2>
- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know the results on your data
- Whisper normalises spoken numbers e.g. "fifty seven" to arabic numerals "57". Need to perform this normalization after alignment, so the phonemes can be aligned. Currently just ignores numbers.
- Assumes the initial whisper timestamps are accurate to some degree (within margin of 2 seconds, adjust if needed -- bigger margins more prone to alignment errors)
- Hacked this up quite quickly, there might be some errors, please raise an issue if you encounter any.
- Transcript words which do not contain characters in the alignment models dictionary e.g. "2014." or "£13.60" cannot be aligned and therefore are not given a timing.
- Overlapping speech is not handled particularly well by whisper nor whisperx
- Diarization is far from perfect
- Language specific wav2vec2 model is needed
<h2 align="left" id="contribute">Contribute 🧑‍🏫</h2>
If you are multilingual, a major way you can contribute to this project is to find phoneme models on huggingface (or train your own) and test them on speech for the target language. If the results look good send a merge request and some examples showing its success.
If you are multilingual, a major way you can contribute to this project is to find phoneme models on huggingface (or train your own) and test them on speech for the target language. If the results look good send a pull request and some examples showing its success.
The next major upgrade we are working on is whisper with speaker diarization, so if you have any experience on this please share.
Bug finding and pull requests are also highly appreciated to keep this project going, since it's already diverging from the original research scope.
<h2 align="left" id="coming-soon">Coming Soon 🗓</h2>
<h2 align="left" id="coming-soon">TODO 🗓</h2>
[x] ~~Multilingual init~~ done
- [x] Multilingual init
[x] ~~Subtitle .ass output~~ done
- [x] Automatic align model selection based on language detection
[x] ~~Automatic align model selection based on language detection~~ done
- [x] Python usage
[x] ~~Python usage~~ done
- [x] Incorporating speaker diarization
[x] ~~Character level timestamps~~
- [x] Model flush, for low gpu mem resources
[x] ~~Incorporating speaker diarization~~
- [x] Faster-whisper backend
[ ] Improve diarization (word level)
- [x] Add max-line etc. see (openai's whisper utils.py)
[ ] Inference speedup with batch processing
- [x] Sentence-level segments (nltk toolbox)
<h2 align="left" id="contact">Contact 📇</h2>
- [x] Improve alignment logic
Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk for business things.
- [ ] update examples with diarization and word highlighting
- [ ] Subtitle .ass output <- bring this back (removed in v3)
- [ ] Add benchmarking code (TEDLIUM for spd/WER & word segmentation)
- [x] Allow silero-vad as alternative VAD option
- [ ] Improve diarization (word level). _Harder than first thought..._
<h2 align="left" id="contact">Contact/Support 📇</h2>
Contact maxhbain@gmail.com for queries.
<a href="https://www.buymeacoffee.com/maxhbain" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
<h2 align="left" id="acks">Acknowledgements 🙏</h2>
Of course, this is mostly just a modification to [openAI's whisper](https://github.com/openai/whisper).
As well as accreditation to this [PyTorch tutorial on forced alignment](https://pytorch.org/tutorials/intermediate/forced_alignment_with_torchaudio_tutorial.html)
This work, and my PhD, is supported by the [VGG (Visual Geometry Group)](https://www.robots.ox.ac.uk/~vgg/) and the University of Oxford.
Of course, this builds on [openAI's whisper](https://github.com/openai/whisper).
Borrows important alignment code from [PyTorch tutorial on forced alignment](https://pytorch.org/tutorials/intermediate/forced_alignment_with_torchaudio_tutorial.html)
And uses the wonderful pyannote VAD / Diarization https://github.com/pyannote/pyannote-audio
Valuable VAD & Diarization Models from:
- [pyannote audio](https://github.com/pyannote/pyannote-audio)
- [silero vad](https://github.com/snakers4/silero-vad)
Great backend from [faster-whisper](https://github.com/guillaumekln/faster-whisper) and [CTranslate2](https://github.com/OpenNMT/CTranslate2)
Those who have [supported this work financially](https://www.buymeacoffee.com/maxhbain) 🙏
Finally, thanks to the OS [contributors](https://github.com/m-bain/whisperX/graphs/contributors) of this project, keeping it going and identifying bugs.
<h2 align="left" id="cite">Citation</h2>
If you use this in your research, just cite the repo,
If you use this in your research, please cite the paper:
```bibtex
@misc{bain2022whisperx,
author = {Bain, Max},
title = {WhisperX},
year = {2022},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/m-bain/whisperX}},
}
```
as well as the whisper paper,
```bibtex
@article{radford2022robust,
title={Robust speech recognition via large-scale weak supervision},
author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
journal={arXiv preprint arXiv:2212.04356},
year={2022}
}
```
and any alignment model used, e.g. wav2vec2.0.
```bibtex
@article{baevski2020wav2vec,
title={wav2vec 2.0: A framework for self-supervised learning of speech representations},
author={Baevski, Alexei and Zhou, Yuhao and Mohamed, Abdelrahman and Auli, Michael},
journal={Advances in Neural Information Processing Systems},
volume={33},
pages={12449--12460},
year={2020}
}
@article{bain2022whisperx,
title={WhisperX: Time-Accurate Speech Transcription of Long-Form Audio},
author={Bain, Max and Huh, Jaesung and Han, Tengda and Zisserman, Andrew},
journal={INTERSPEECH 2023},
year={2023}
}
```

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,140 +0,0 @@
1
00:00:00,000 --> 00:00:03,000
Bella, Gloria, love.
2
00:00:03,000 --> 00:00:04,000
Oh.
3
00:00:04,000 --> 00:00:05,000
How are you?
4
00:00:05,000 --> 00:00:07,000
Oh, I'm OK.
5
00:00:07,000 --> 00:00:08,000
I will be.
6
00:00:08,000 --> 00:00:09,000
I said she could stay with us tomorrow
7
00:00:09,000 --> 00:00:10,000
just until she feels better.
8
00:00:10,000 --> 00:00:11,000
Yeah.
9
00:00:11,000 --> 00:00:12,000
Of course she can.
10
00:00:12,000 --> 00:00:14,000
No, things won't be for long.
11
00:00:14,000 --> 00:00:16,000
Well, you can stay as long as you want, my love.
12
00:00:16,000 --> 00:00:18,000
I've really missed you.
13
00:00:18,000 --> 00:00:19,000
Pops.
14
00:00:19,000 --> 00:00:20,000
Great to see you, love.
15
00:00:20,000 --> 00:00:22,000
Oh.
16
00:00:22,000 --> 00:00:23,000
All right, shall we get you off to bed then?
17
00:00:23,000 --> 00:00:25,000
You should have given me some warm.
18
00:00:25,000 --> 00:00:26,000
I know.
19
00:00:26,000 --> 00:00:27,000
I'll have to put the electric blanket on.
20
00:00:27,000 --> 00:00:28,000
I'm sorry.
21
00:00:28,000 --> 00:00:29,000
All right, Bella.
22
00:00:29,000 --> 00:00:31,000
Freezing up there.
23
00:00:31,000 --> 00:00:34,000
In a bedroom, Peter unpacks her suitcase.
24
00:00:34,000 --> 00:00:38,000
The middle-aged woman opens her green case.
25
00:00:38,000 --> 00:00:39,000
Do you want your PJs?
26
00:00:39,000 --> 00:00:40,000
Yeah.
27
00:00:40,000 --> 00:00:42,000
Yeah.
28
00:00:42,000 --> 00:00:45,000
Lifting a bundle of pajamas, Peter finds a sheet of paper
29
00:00:45,000 --> 00:00:50,000
labeled Lancaster North Hospital discharge sheet.
30
00:00:50,000 --> 00:00:52,000
He closes the suitcase and brings Gloria the pajamas.
31
00:00:52,000 --> 00:00:54,000
There you go.
32
00:00:54,000 --> 00:00:55,000
Thank you.
33
00:00:55,000 --> 00:00:57,000
He picks up the locket.
34
00:00:57,000 --> 00:00:59,000
He kept it.
35
00:00:59,000 --> 00:01:28,000
Oh, cool.

View File

@ -1,92 +0,0 @@
1
00:00:00,000 --> 00:00:01,240
Lâchez, c'est bon.
2
00:00:01,240 --> 00:00:02,240
Ça va?
3
00:00:02,240 --> 00:00:03,240
Oui.
4
00:00:03,240 --> 00:00:04,240
Merci beaucoup.
5
00:00:04,240 --> 00:00:05,240
Chèque ou espèce?
6
00:00:05,240 --> 00:00:08,640
J'ai un chèque sur la commode, il est signé.
7
00:00:08,640 --> 00:00:09,640
Je vais le repirer.
8
00:00:09,640 --> 00:00:10,640
Ok.
9
00:00:10,640 --> 00:00:11,640
Ouh là!
10
00:00:11,640 --> 00:00:12,640
Venez.
11
00:00:12,640 --> 00:00:13,640
Merci.
12
00:00:13,640 --> 00:00:14,640
Ah! C'est qui?
13
00:00:14,640 --> 00:00:21,640
C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
14
00:00:21,640 --> 00:00:26,640
Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
15
00:00:26,640 --> 00:00:27,640
Ça va?
16
00:00:27,640 --> 00:00:44,200
Attendez, tout à l'heure là, c'était vous? Vous? Pas lui? Vous?
17
00:00:44,200 --> 00:00:48,360
Vous avez tout à fait raison, M. Xanaquis, Malek est à l'interne brillant qui apprend
18
00:00:48,360 --> 00:00:49,360
le métier avec moi.
19
00:00:49,360 --> 00:00:50,360
Ah!
20
00:00:50,360 --> 00:00:51,360
Bien.
21
00:00:51,360 --> 00:00:55,520
Justement, il y a la famille Boboune qui m'attend pour une consultation.
22
00:00:55,520 --> 00:00:56,520
Qui?
23
00:00:56,520 --> 00:00:57,760
Faisons pas attendre les Boboune, allez.

View File

@ -1,23 +0,0 @@
Lâchez, c'est bon.
Ça va?
Oui.
Merci beaucoup.
Chèque ou espèce?
J'ai un chèque sur la commode, il est signé.
Je vais le repirer.
Ok.
Ouh là!
Venez.
Merci.
Ah! C'est qui?
C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Ça va?
Attendez, tout à l'heure là, c'était vous? Vous? Pas lui? Vous?
Vous avez tout à fait raison, M. Xanaquis, Malek est à l'interne brillant qui apprend
le métier avec moi.
Ah!
Bien.
Justement, il y a la famille Boboune qui m'attend pour une consultation.
Qui?
Faisons pas attendre les Boboune, allez.

View File

@ -1,71 +0,0 @@
WEBVTT
00:00.000 --> 00:01.240
Lâchez, c'est bon.
00:01.240 --> 00:02.240
Ça va?
00:02.240 --> 00:03.240
Oui.
00:03.240 --> 00:04.240
Merci beaucoup.
00:04.240 --> 00:05.240
Chèque ou espèce?
00:05.240 --> 00:08.640
J'ai un chèque sur la commode, il est signé.
00:08.640 --> 00:09.640
Je vais le repirer.
00:09.640 --> 00:10.640
Ok.
00:10.640 --> 00:11.640
Ouh là!
00:11.640 --> 00:12.640
Venez.
00:12.640 --> 00:13.640
Merci.
00:13.640 --> 00:14.640
Ah! C'est qui?
00:14.640 --> 00:21.640
C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
00:21.640 --> 00:26.640
Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
00:26.640 --> 00:27.640
Ça va?
00:27.640 --> 00:44.200
Attendez, tout à l'heure là, c'était vous? Vous? Pas lui? Vous?
00:44.200 --> 00:48.360
Vous avez tout à fait raison, M. Xanaquis, Malek est à l'interne brillant qui apprend
00:48.360 --> 00:49.360
le métier avec moi.
00:49.360 --> 00:50.360
Ah!
00:50.360 --> 00:51.360
Bien.
00:51.360 --> 00:55.520
Justement, il y a la famille Boboune qui m'attend pour une consultation.
00:55.520 --> 00:56.520
Qui?
00:56.520 --> 00:57.760
Faisons pas attendre les Boboune, allez.

Binary file not shown.

View File

@ -1,290 +0,0 @@
[Script Info]
ScriptType: v4.00+
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,24,&Hffffff,&Hffffff,&H0,&H0,0,0,0,0,100,100,0,0,1,1,0,2,10,10,10,0
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:1.18,0:00:1.67,Default,,0,0,0,,{\1c&HFF00&\u1}Bella,{\r} Gloria, love.
Dialogue: 0,0:00:1.67,0:00:2.65,Default,,0,0,0,,Bella, Gloria, love.
Dialogue: 0,0:00:2.65,0:00:3.05,Default,,0,0,0,,Bella, {\1c&HFF00&\u1}Gloria,{\r} love.
Dialogue: 0,0:00:3.05,0:00:3.07,Default,,0,0,0,,Bella, Gloria, love.
Dialogue: 0,0:00:3.07,0:00:3.27,Default,,0,0,0,,Bella, Gloria, {\1c&HFF00&\u1}love.{\r}
Dialogue: 0,0:00:3.75,0:00:3.85,Default,,0,0,0,,{\1c&HFF00&\u1}Oh.{\r}
Dialogue: 0,0:00:4.50,0:00:4.72,Default,,0,0,0,,{\1c&HFF00&\u1}How{\r} are you?
Dialogue: 0,0:00:4.72,0:00:5.78,Default,,0,0,0,,How are you?
Dialogue: 0,0:00:5.78,0:00:5.90,Default,,0,0,0,,How {\1c&HFF00&\u1}are{\r} you?
Dialogue: 0,0:00:5.90,0:00:5.94,Default,,0,0,0,,How are you?
Dialogue: 0,0:00:5.94,0:00:6.22,Default,,0,0,0,,How are {\1c&HFF00&\u1}you?{\r}
Dialogue: 0,0:00:6.72,0:00:6.80,Default,,0,0,0,,{\1c&HFF00&\u1}Oh,{\r} I'm OK.
Dialogue: 0,0:00:6.80,0:00:6.88,Default,,0,0,0,,Oh, I'm OK.
Dialogue: 0,0:00:6.88,0:00:7.04,Default,,0,0,0,,Oh, {\1c&HFF00&\u1}I'm{\r} OK.
Dialogue: 0,0:00:7.04,0:00:7.09,Default,,0,0,0,,Oh, I'm OK.
Dialogue: 0,0:00:7.09,0:00:7.13,Default,,0,0,0,,Oh, I'm {\1c&HFF00&\u1}OK.{\r}
Dialogue: 0,0:00:8.41,0:00:8.45,Default,,0,0,0,,{\1c&HFF00&\u1}I{\r} will be.
Dialogue: 0,0:00:8.45,0:00:8.49,Default,,0,0,0,,I will be.
Dialogue: 0,0:00:8.49,0:00:8.73,Default,,0,0,0,,I {\1c&HFF00&\u1}will{\r} be.
Dialogue: 0,0:00:8.73,0:00:8.77,Default,,0,0,0,,I will be.
Dialogue: 0,0:00:8.77,0:00:8.91,Default,,0,0,0,,I will {\1c&HFF00&\u1}be.{\r}
Dialogue: 0,0:00:9.22,0:00:9.30,Default,,0,0,0,,{\1c&HFF00&\u1}I{\r} said she could stay with us tomorrow
Dialogue: 0,0:00:9.30,0:00:9.34,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:9.34,0:00:9.48,Default,,0,0,0,,I {\1c&HFF00&\u1}said{\r} she could stay with us tomorrow
Dialogue: 0,0:00:9.48,0:00:9.52,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:9.52,0:00:9.60,Default,,0,0,0,,I said {\1c&HFF00&\u1}she{\r} could stay with us tomorrow
Dialogue: 0,0:00:9.60,0:00:9.62,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:9.62,0:00:9.76,Default,,0,0,0,,I said she {\1c&HFF00&\u1}could{\r} stay with us tomorrow
Dialogue: 0,0:00:9.76,0:00:9.78,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:9.78,0:00:9.94,Default,,0,0,0,,I said she could {\1c&HFF00&\u1}stay{\r} with us tomorrow
Dialogue: 0,0:00:9.94,0:00:9.96,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:9.96,0:00:10.08,Default,,0,0,0,,I said she could stay {\1c&HFF00&\u1}with{\r} us tomorrow
Dialogue: 0,0:00:10.08,0:00:10.10,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:10.10,0:00:10.14,Default,,0,0,0,,I said she could stay with {\1c&HFF00&\u1}us{\r} tomorrow
Dialogue: 0,0:00:10.14,0:00:10.16,Default,,0,0,0,,I said she could stay with us tomorrow
Dialogue: 0,0:00:10.16,0:00:10.44,Default,,0,0,0,,I said she could stay with us {\1c&HFF00&\u1}tomorrow{\r}
Dialogue: 0,0:00:10.46,0:00:10.54,Default,,0,0,0,,{\1c&HFF00&\u1}just{\r} until she feels better.
Dialogue: 0,0:00:10.54,0:00:10.56,Default,,0,0,0,,just until she feels better.
Dialogue: 0,0:00:10.56,0:00:10.68,Default,,0,0,0,,just {\1c&HFF00&\u1}until{\r} she feels better.
Dialogue: 0,0:00:10.68,0:00:10.70,Default,,0,0,0,,just until she feels better.
Dialogue: 0,0:00:10.70,0:00:10.80,Default,,0,0,0,,just until {\1c&HFF00&\u1}she{\r} feels better.
Dialogue: 0,0:00:10.80,0:00:10.82,Default,,0,0,0,,just until she feels better.
Dialogue: 0,0:00:10.82,0:00:11.05,Default,,0,0,0,,just until she {\1c&HFF00&\u1}feels{\r} better.
Dialogue: 0,0:00:11.05,0:00:11.09,Default,,0,0,0,,just until she feels better.
Dialogue: 0,0:00:11.09,0:00:11.35,Default,,0,0,0,,just until she feels {\1c&HFF00&\u1}better.{\r}
Dialogue: 0,0:00:11.73,0:00:11.95,Default,,0,0,0,,{\1c&HFF00&\u1}Yeah.{\r}
Dialogue: 0,0:00:12.09,0:00:12.17,Default,,0,0,0,,{\1c&HFF00&\u1}Of{\r} course she can.
Dialogue: 0,0:00:12.17,0:00:12.20,Default,,0,0,0,,Of course she can.
Dialogue: 0,0:00:12.20,0:00:12.32,Default,,0,0,0,,Of {\1c&HFF00&\u1}course{\r} she can.
Dialogue: 0,0:00:12.32,0:00:12.38,Default,,0,0,0,,Of course she can.
Dialogue: 0,0:00:12.38,0:00:12.64,Default,,0,0,0,,Of course {\1c&HFF00&\u1}she{\r} can.
Dialogue: 0,0:00:12.64,0:00:12.72,Default,,0,0,0,,Of course she can.
Dialogue: 0,0:00:12.72,0:00:13.24,Default,,0,0,0,,Of course she {\1c&HFF00&\u1}can.{\r}
Dialogue: 0,0:00:13.36,0:00:13.70,Default,,0,0,0,,{\1c&HFF00&\u1}No,{\r} things won't be for long.
Dialogue: 0,0:00:13.70,0:00:13.82,Default,,0,0,0,,No, things won't be for long.
Dialogue: 0,0:00:13.82,0:00:14.12,Default,,0,0,0,,No, {\1c&HFF00&\u1}things{\r} won't be for long.
Dialogue: 0,0:00:14.12,0:00:14.19,Default,,0,0,0,,No, things won't be for long.
Dialogue: 0,0:00:14.19,0:00:14.39,Default,,0,0,0,,No, things {\1c&HFF00&\u1}won't{\r} be for long.
Dialogue: 0,0:00:14.39,0:00:14.43,Default,,0,0,0,,No, things won't be for long.
Dialogue: 0,0:00:14.43,0:00:14.53,Default,,0,0,0,,No, things won't {\1c&HFF00&\u1}be{\r} for long.
Dialogue: 0,0:00:14.53,0:00:14.59,Default,,0,0,0,,No, things won't be for long.
Dialogue: 0,0:00:14.59,0:00:14.73,Default,,0,0,0,,No, things won't be {\1c&HFF00&\u1}for{\r} long.
Dialogue: 0,0:00:14.73,0:00:14.81,Default,,0,0,0,,No, things won't be for long.
Dialogue: 0,0:00:14.81,0:00:15.01,Default,,0,0,0,,No, things won't be for {\1c&HFF00&\u1}long.{\r}
Dialogue: 0,0:00:15.17,0:00:15.41,Default,,0,0,0,,{\1c&HFF00&\u1}Well,{\r} you can stay as long as you want, my love.
Dialogue: 0,0:00:15.41,0:00:15.43,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:15.43,0:00:15.51,Default,,0,0,0,,Well, {\1c&HFF00&\u1}you{\r} can stay as long as you want, my love.
Dialogue: 0,0:00:15.51,0:00:15.55,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:15.55,0:00:15.69,Default,,0,0,0,,Well, you {\1c&HFF00&\u1}can{\r} stay as long as you want, my love.
Dialogue: 0,0:00:15.69,0:00:15.75,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:15.75,0:00:15.89,Default,,0,0,0,,Well, you can {\1c&HFF00&\u1}stay{\r} as long as you want, my love.
Dialogue: 0,0:00:15.89,0:00:15.95,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:15.95,0:00:16.01,Default,,0,0,0,,Well, you can stay {\1c&HFF00&\u1}as{\r} long as you want, my love.
Dialogue: 0,0:00:16.01,0:00:16.05,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.05,0:00:16.20,Default,,0,0,0,,Well, you can stay as {\1c&HFF00&\u1}long{\r} as you want, my love.
Dialogue: 0,0:00:16.20,0:00:16.24,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.24,0:00:16.28,Default,,0,0,0,,Well, you can stay as long {\1c&HFF00&\u1}as{\r} you want, my love.
Dialogue: 0,0:00:16.28,0:00:16.30,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.30,0:00:16.42,Default,,0,0,0,,Well, you can stay as long as {\1c&HFF00&\u1}you{\r} want, my love.
Dialogue: 0,0:00:16.42,0:00:16.46,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.46,0:00:16.66,Default,,0,0,0,,Well, you can stay as long as you {\1c&HFF00&\u1}want,{\r} my love.
Dialogue: 0,0:00:16.66,0:00:16.72,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.72,0:00:16.92,Default,,0,0,0,,Well, you can stay as long as you want, {\1c&HFF00&\u1}my{\r} love.
Dialogue: 0,0:00:16.92,0:00:16.96,Default,,0,0,0,,Well, you can stay as long as you want, my love.
Dialogue: 0,0:00:16.96,0:00:17.34,Default,,0,0,0,,Well, you can stay as long as you want, my {\1c&HFF00&\u1}love.{\r}
Dialogue: 0,0:00:17.62,0:00:17.86,Default,,0,0,0,,{\1c&HFF00&\u1}I've{\r} really missed you.
Dialogue: 0,0:00:17.86,0:00:17.88,Default,,0,0,0,,I've really missed you.
Dialogue: 0,0:00:17.88,0:00:18.14,Default,,0,0,0,,I've {\1c&HFF00&\u1}really{\r} missed you.
Dialogue: 0,0:00:18.14,0:00:18.19,Default,,0,0,0,,I've really missed you.
Dialogue: 0,0:00:18.19,0:00:18.59,Default,,0,0,0,,I've really {\1c&HFF00&\u1}missed{\r} you.
Dialogue: 0,0:00:18.59,0:00:18.63,Default,,0,0,0,,I've really missed you.
Dialogue: 0,0:00:18.63,0:00:18.81,Default,,0,0,0,,I've really missed {\1c&HFF00&\u1}you.{\r}
Dialogue: 0,0:00:19.49,0:00:19.79,Default,,0,0,0,,{\1c&HFF00&\u1}Pops.{\r}
Dialogue: 0,0:00:20.40,0:00:20.64,Default,,0,0,0,,{\1c&HFF00&\u1}Great{\r} to see you, love.
Dialogue: 0,0:00:20.64,0:00:20.66,Default,,0,0,0,,Great to see you, love.
Dialogue: 0,0:00:20.66,0:00:20.78,Default,,0,0,0,,Great {\1c&HFF00&\u1}to{\r} see you, love.
Dialogue: 0,0:00:20.78,0:00:20.82,Default,,0,0,0,,Great to see you, love.
Dialogue: 0,0:00:20.82,0:00:21.14,Default,,0,0,0,,Great to {\1c&HFF00&\u1}see{\r} you, love.
Dialogue: 0,0:00:21.14,0:00:21.16,Default,,0,0,0,,Great to see you, love.
Dialogue: 0,0:00:21.16,0:00:21.28,Default,,0,0,0,,Great to see {\1c&HFF00&\u1}you,{\r} love.
Dialogue: 0,0:00:21.28,0:00:21.32,Default,,0,0,0,,Great to see you, love.
Dialogue: 0,0:00:21.32,0:00:21.68,Default,,0,0,0,,Great to see you, {\1c&HFF00&\u1}love.{\r}
Dialogue: 0,0:00:21.90,0:00:23.21,Default,,0,0,0,,{\1c&HFF00&\u1}Oh.{\r}
Dialogue: 0,0:00:23.23,0:00:23.29,Default,,0,0,0,,{\1c&HFF00&\u1}All{\r} right, shall we get you off to bed then?
Dialogue: 0,0:00:23.29,0:00:23.31,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.31,0:00:23.41,Default,,0,0,0,,All {\1c&HFF00&\u1}right,{\r} shall we get you off to bed then?
Dialogue: 0,0:00:23.41,0:00:23.43,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.43,0:00:23.55,Default,,0,0,0,,All right, {\1c&HFF00&\u1}shall{\r} we get you off to bed then?
Dialogue: 0,0:00:23.55,0:00:23.57,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.57,0:00:23.65,Default,,0,0,0,,All right, shall {\1c&HFF00&\u1}we{\r} get you off to bed then?
Dialogue: 0,0:00:23.65,0:00:23.67,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.67,0:00:23.74,Default,,0,0,0,,All right, shall we {\1c&HFF00&\u1}get{\r} you off to bed then?
Dialogue: 0,0:00:23.74,0:00:23.76,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.76,0:00:23.82,Default,,0,0,0,,All right, shall we get {\1c&HFF00&\u1}you{\r} off to bed then?
Dialogue: 0,0:00:23.82,0:00:23.84,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.84,0:00:23.94,Default,,0,0,0,,All right, shall we get you {\1c&HFF00&\u1}off{\r} to bed then?
Dialogue: 0,0:00:23.94,0:00:23.96,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:23.96,0:00:24.04,Default,,0,0,0,,All right, shall we get you off {\1c&HFF00&\u1}to{\r} bed then?
Dialogue: 0,0:00:24.04,0:00:24.06,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:24.06,0:00:24.22,Default,,0,0,0,,All right, shall we get you off to {\1c&HFF00&\u1}bed{\r} then?
Dialogue: 0,0:00:24.22,0:00:24.24,Default,,0,0,0,,All right, shall we get you off to bed then?
Dialogue: 0,0:00:24.24,0:00:24.38,Default,,0,0,0,,All right, shall we get you off to bed {\1c&HFF00&\u1}then?{\r}
Dialogue: 0,0:00:24.58,0:00:24.72,Default,,0,0,0,,{\1c&HFF00&\u1}You{\r} should have given me some warm.
Dialogue: 0,0:00:24.72,0:00:24.78,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:24.78,0:00:24.98,Default,,0,0,0,,You {\1c&HFF00&\u1}should{\r} have given me some warm.
Dialogue: 0,0:00:24.98,0:00:25.02,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:25.02,0:00:25.12,Default,,0,0,0,,You should {\1c&HFF00&\u1}have{\r} given me some warm.
Dialogue: 0,0:00:25.12,0:00:25.16,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:25.16,0:00:25.29,Default,,0,0,0,,You should have {\1c&HFF00&\u1}given{\r} me some warm.
Dialogue: 0,0:00:25.29,0:00:25.35,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:25.35,0:00:25.45,Default,,0,0,0,,You should have given {\1c&HFF00&\u1}me{\r} some warm.
Dialogue: 0,0:00:25.45,0:00:25.49,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:25.49,0:00:25.67,Default,,0,0,0,,You should have given me {\1c&HFF00&\u1}some{\r} warm.
Dialogue: 0,0:00:25.67,0:00:25.81,Default,,0,0,0,,You should have given me some warm.
Dialogue: 0,0:00:25.81,0:00:26.05,Default,,0,0,0,,You should have given me some {\1c&HFF00&\u1}warm.{\r}
Dialogue: 0,0:00:26.31,0:00:26.37,Default,,0,0,0,,{\1c&HFF00&\u1}I{\r} know.
Dialogue: 0,0:00:26.37,0:00:26.39,Default,,0,0,0,,I know.
Dialogue: 0,0:00:26.39,0:00:26.49,Default,,0,0,0,,I {\1c&HFF00&\u1}know.{\r}
Dialogue: 0,0:00:26.61,0:00:26.69,Default,,0,0,0,,{\1c&HFF00&\u1}I'll{\r} have to put the electric blanket on.
Dialogue: 0,0:00:26.69,0:00:26.71,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:26.71,0:00:26.81,Default,,0,0,0,,I'll {\1c&HFF00&\u1}have{\r} to put the electric blanket on.
Dialogue: 0,0:00:26.81,0:00:26.83,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:26.83,0:00:27.02,Default,,0,0,0,,I'll have {\1c&HFF00&\u1}to{\r} put the electric blanket on.
Dialogue: 0,0:00:27.02,0:00:27.06,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:27.06,0:00:27.42,Default,,0,0,0,,I'll have to {\1c&HFF00&\u1}put{\r} the electric blanket on.
Dialogue: 0,0:00:27.42,0:00:27.48,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:27.48,0:00:27.56,Default,,0,0,0,,I'll have to put {\1c&HFF00&\u1}the{\r} electric blanket on.
Dialogue: 0,0:00:27.56,0:00:27.70,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:27.70,0:00:28.08,Default,,0,0,0,,I'll have to put the {\1c&HFF00&\u1}electric{\r} blanket on.
Dialogue: 0,0:00:28.08,0:00:28.62,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:28.62,0:00:28.84,Default,,0,0,0,,I'll have to put the electric {\1c&HFF00&\u1}blanket{\r} on.
Dialogue: 0,0:00:28.84,0:00:28.90,Default,,0,0,0,,I'll have to put the electric blanket on.
Dialogue: 0,0:00:28.90,0:00:28.94,Default,,0,0,0,,I'll have to put the electric blanket {\1c&HFF00&\u1}on.{\r}
Dialogue: 0,0:00:29.49,0:00:29.55,Default,,0,0,0,,{\1c&HFF00&\u1}I'm{\r} sorry.
Dialogue: 0,0:00:29.55,0:00:29.57,Default,,0,0,0,,I'm sorry.
Dialogue: 0,0:00:29.57,0:00:29.82,Default,,0,0,0,,I'm {\1c&HFF00&\u1}sorry.{\r}
Dialogue: 0,0:00:29.98,0:00:30.08,Default,,0,0,0,,{\1c&HFF00&\u1}All{\r} right, Bella.
Dialogue: 0,0:00:30.08,0:00:30.10,Default,,0,0,0,,All right, Bella.
Dialogue: 0,0:00:30.10,0:00:30.29,Default,,0,0,0,,All {\1c&HFF00&\u1}right,{\r} Bella.
Dialogue: 0,0:00:30.29,0:00:30.43,Default,,0,0,0,,All right, Bella.
Dialogue: 0,0:00:30.43,0:00:30.63,Default,,0,0,0,,All right, {\1c&HFF00&\u1}Bella.{\r}
Dialogue: 0,0:00:31.37,0:00:31.58,Default,,0,0,0,,{\1c&HFF00&\u1}Freezing{\r} up there.
Dialogue: 0,0:00:31.58,0:00:31.62,Default,,0,0,0,,Freezing up there.
Dialogue: 0,0:00:31.62,0:00:31.66,Default,,0,0,0,,Freezing {\1c&HFF00&\u1}up{\r} there.
Dialogue: 0,0:00:31.66,0:00:31.68,Default,,0,0,0,,Freezing up there.
Dialogue: 0,0:00:31.68,0:00:31.90,Default,,0,0,0,,Freezing up {\1c&HFF00&\u1}there.{\r}
Dialogue: 0,0:00:31.90,0:00:31.94,Default,,0,0,0,,{\1c&HFF00&\u1}In{\r} a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:31.94,0:00:31.96,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:31.96,0:00:31.98,Default,,0,0,0,,In {\1c&HFF00&\u1}a{\r} bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:31.98,0:00:32.00,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:32.00,0:00:32.14,Default,,0,0,0,,In a {\1c&HFF00&\u1}bedroom,{\r} Peter unpacks her suitcase.
Dialogue: 0,0:00:32.14,0:00:32.20,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:32.20,0:00:32.50,Default,,0,0,0,,In a bedroom, {\1c&HFF00&\u1}Peter{\r} unpacks her suitcase.
Dialogue: 0,0:00:32.50,0:00:32.58,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:32.58,0:00:32.98,Default,,0,0,0,,In a bedroom, Peter {\1c&HFF00&\u1}unpacks{\r} her suitcase.
Dialogue: 0,0:00:32.98,0:00:33.00,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:33.00,0:00:33.10,Default,,0,0,0,,In a bedroom, Peter unpacks {\1c&HFF00&\u1}her{\r} suitcase.
Dialogue: 0,0:00:33.10,0:00:33.16,Default,,0,0,0,,In a bedroom, Peter unpacks her suitcase.
Dialogue: 0,0:00:33.16,0:00:33.65,Default,,0,0,0,,In a bedroom, Peter unpacks her {\1c&HFF00&\u1}suitcase.{\r}
Dialogue: 0,0:00:34.27,0:00:34.35,Default,,0,0,0,,{\1c&HFF00&\u1}The{\r} middle-aged woman opens her green case.
Dialogue: 0,0:00:34.35,0:00:34.39,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:34.39,0:00:34.91,Default,,0,0,0,,The {\1c&HFF00&\u1}middle-aged{\r} woman opens her green case.
Dialogue: 0,0:00:34.91,0:00:34.99,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:34.99,0:00:35.27,Default,,0,0,0,,The middle-aged {\1c&HFF00&\u1}woman{\r} opens her green case.
Dialogue: 0,0:00:35.27,0:00:35.39,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:35.39,0:00:35.67,Default,,0,0,0,,The middle-aged woman {\1c&HFF00&\u1}opens{\r} her green case.
Dialogue: 0,0:00:35.67,0:00:35.71,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:35.71,0:00:35.81,Default,,0,0,0,,The middle-aged woman opens {\1c&HFF00&\u1}her{\r} green case.
Dialogue: 0,0:00:35.81,0:00:35.85,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:35.85,0:00:36.19,Default,,0,0,0,,The middle-aged woman opens her {\1c&HFF00&\u1}green{\r} case.
Dialogue: 0,0:00:36.19,0:00:36.23,Default,,0,0,0,,The middle-aged woman opens her green case.
Dialogue: 0,0:00:36.23,0:00:36.53,Default,,0,0,0,,The middle-aged woman opens her green {\1c&HFF00&\u1}case.{\r}
Dialogue: 0,0:00:38.13,0:00:38.25,Default,,0,0,0,,{\1c&HFF00&\u1}Do{\r} you want your PJs?
Dialogue: 0,0:00:38.25,0:00:38.28,Default,,0,0,0,,Do you want your PJs?
Dialogue: 0,0:00:38.28,0:00:38.36,Default,,0,0,0,,Do {\1c&HFF00&\u1}you{\r} want your PJs?
Dialogue: 0,0:00:38.36,0:00:38.38,Default,,0,0,0,,Do you want your PJs?
Dialogue: 0,0:00:38.38,0:00:38.54,Default,,0,0,0,,Do you {\1c&HFF00&\u1}want{\r} your PJs?
Dialogue: 0,0:00:38.54,0:00:38.56,Default,,0,0,0,,Do you want your PJs?
Dialogue: 0,0:00:38.56,0:00:38.74,Default,,0,0,0,,Do you want {\1c&HFF00&\u1}your{\r} PJs?
Dialogue: 0,0:00:38.74,0:00:38.88,Default,,0,0,0,,Do you want your PJs?
Dialogue: 0,0:00:38.88,0:00:39.30,Default,,0,0,0,,Do you want your {\1c&HFF00&\u1}PJs?{\r}
Dialogue: 0,0:00:39.88,0:00:40.18,Default,,0,0,0,,{\1c&HFF00&\u1}Yeah.{\r}
Dialogue: 0,0:00:42.39,0:00:42.69,Default,,0,0,0,,{\1c&HFF00&\u1}Lifting{\r} a bundle of pajamas,
Dialogue: 0,0:00:42.69,0:00:42.73,Default,,0,0,0,,Lifting a bundle of pajamas,
Dialogue: 0,0:00:42.73,0:00:42.75,Default,,0,0,0,,Lifting {\1c&HFF00&\u1}a{\r} bundle of pajamas,
Dialogue: 0,0:00:42.75,0:00:42.81,Default,,0,0,0,,Lifting a bundle of pajamas,
Dialogue: 0,0:00:42.81,0:00:43.11,Default,,0,0,0,,Lifting a {\1c&HFF00&\u1}bundle{\r} of pajamas,
Dialogue: 0,0:00:43.11,0:00:43.13,Default,,0,0,0,,Lifting a bundle of pajamas,
Dialogue: 0,0:00:43.13,0:00:43.19,Default,,0,0,0,,Lifting a bundle {\1c&HFF00&\u1}of{\r} pajamas,
Dialogue: 0,0:00:43.19,0:00:43.25,Default,,0,0,0,,Lifting a bundle of pajamas,
Dialogue: 0,0:00:43.25,0:00:43.77,Default,,0,0,0,,Lifting a bundle of {\1c&HFF00&\u1}pajamas,{\r}
Dialogue: 0,0:00:44.07,0:00:44.31,Default,,0,0,0,,{\1c&HFF00&\u1}Peter{\r} finds a sheet of paper labeled
Dialogue: 0,0:00:44.31,0:00:44.37,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:44.37,0:00:44.63,Default,,0,0,0,,Peter {\1c&HFF00&\u1}finds{\r} a sheet of paper labeled
Dialogue: 0,0:00:44.63,0:00:44.67,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:44.67,0:00:44.69,Default,,0,0,0,,Peter finds {\1c&HFF00&\u1}a{\r} sheet of paper labeled
Dialogue: 0,0:00:44.69,0:00:44.75,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:44.75,0:00:44.95,Default,,0,0,0,,Peter finds a {\1c&HFF00&\u1}sheet{\r} of paper labeled
Dialogue: 0,0:00:44.95,0:00:44.99,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:44.99,0:00:45.05,Default,,0,0,0,,Peter finds a sheet {\1c&HFF00&\u1}of{\r} paper labeled
Dialogue: 0,0:00:45.05,0:00:45.11,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:45.11,0:00:45.46,Default,,0,0,0,,Peter finds a sheet of {\1c&HFF00&\u1}paper{\r} labeled
Dialogue: 0,0:00:45.46,0:00:45.54,Default,,0,0,0,,Peter finds a sheet of paper labeled
Dialogue: 0,0:00:45.54,0:00:45.88,Default,,0,0,0,,Peter finds a sheet of paper {\1c&HFF00&\u1}labeled{\r}
Dialogue: 0,0:00:46.34,0:00:47.04,Default,,0,0,0,,{\1c&HFF00&\u1}Lancaster{\r} North Hospital discharge sheet.
Dialogue: 0,0:00:47.04,0:00:47.12,Default,,0,0,0,,Lancaster North Hospital discharge sheet.
Dialogue: 0,0:00:47.12,0:00:47.38,Default,,0,0,0,,Lancaster {\1c&HFF00&\u1}North{\r} Hospital discharge sheet.
Dialogue: 0,0:00:47.38,0:00:47.44,Default,,0,0,0,,Lancaster North Hospital discharge sheet.
Dialogue: 0,0:00:47.44,0:00:47.94,Default,,0,0,0,,Lancaster North {\1c&HFF00&\u1}Hospital{\r} discharge sheet.
Dialogue: 0,0:00:47.94,0:00:48.27,Default,,0,0,0,,Lancaster North Hospital discharge sheet.
Dialogue: 0,0:00:48.27,0:00:48.93,Default,,0,0,0,,Lancaster North Hospital {\1c&HFF00&\u1}discharge{\r} sheet.
Dialogue: 0,0:00:48.93,0:00:49.03,Default,,0,0,0,,Lancaster North Hospital discharge sheet.
Dialogue: 0,0:00:49.03,0:00:49.25,Default,,0,0,0,,Lancaster North Hospital discharge {\1c&HFF00&\u1}sheet.{\r}
Dialogue: 0,0:00:50.29,0:00:50.37,Default,,0,0,0,,{\1c&HFF00&\u1}He{\r} closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.37,0:00:50.41,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.41,0:00:50.77,Default,,0,0,0,,He {\1c&HFF00&\u1}closes{\r} the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.77,0:00:50.81,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.81,0:00:50.91,Default,,0,0,0,,He closes {\1c&HFF00&\u1}the{\r} suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.91,0:00:50.95,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:50.95,0:00:51.39,Default,,0,0,0,,He closes the {\1c&HFF00&\u1}suitcase{\r} and brings Gloria the pajamas.
Dialogue: 0,0:00:51.39,0:00:51.43,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:51.43,0:00:51.51,Default,,0,0,0,,He closes the suitcase {\1c&HFF00&\u1}and{\r} brings Gloria the pajamas.
Dialogue: 0,0:00:51.51,0:00:51.53,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:51.53,0:00:51.79,Default,,0,0,0,,He closes the suitcase and {\1c&HFF00&\u1}brings{\r} Gloria the pajamas.
Dialogue: 0,0:00:51.79,0:00:51.83,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:51.83,0:00:52.23,Default,,0,0,0,,He closes the suitcase and brings {\1c&HFF00&\u1}Gloria{\r} the pajamas.
Dialogue: 0,0:00:52.23,0:00:52.25,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:52.25,0:00:52.32,Default,,0,0,0,,He closes the suitcase and brings Gloria {\1c&HFF00&\u1}the{\r} pajamas.
Dialogue: 0,0:00:52.32,0:00:52.36,Default,,0,0,0,,He closes the suitcase and brings Gloria the pajamas.
Dialogue: 0,0:00:52.36,0:00:52.86,Default,,0,0,0,,He closes the suitcase and brings Gloria the {\1c&HFF00&\u1}pajamas.{\r}
Dialogue: 0,0:00:54.19,0:00:54.49,Default,,0,0,0,,{\1c&HFF00&\u1}There{\r} you go.
Dialogue: 0,0:00:54.49,0:00:54.55,Default,,0,0,0,,There you go.
Dialogue: 0,0:00:54.55,0:00:54.77,Default,,0,0,0,,There {\1c&HFF00&\u1}you{\r} go.
Dialogue: 0,0:00:54.77,0:00:54.79,Default,,0,0,0,,There you go.
Dialogue: 0,0:00:54.79,0:00:54.83,Default,,0,0,0,,There you {\1c&HFF00&\u1}go.{\r}
Dialogue: 0,0:00:55.65,0:00:55.77,Default,,0,0,0,,{\1c&HFF00&\u1}Thank{\r} you.
Dialogue: 0,0:00:55.77,0:00:55.80,Default,,0,0,0,,Thank you.
Dialogue: 0,0:00:55.80,0:00:55.90,Default,,0,0,0,,Thank {\1c&HFF00&\u1}you.{\r}
Dialogue: 0,0:00:55.90,0:00:55.94,Default,,0,0,0,,{\1c&HFF00&\u1}He{\r} picks up the locket.
Dialogue: 0,0:00:55.94,0:00:55.96,Default,,0,0,0,,He picks up the locket.
Dialogue: 0,0:00:55.96,0:00:56.10,Default,,0,0,0,,He {\1c&HFF00&\u1}picks{\r} up the locket.
Dialogue: 0,0:00:56.10,0:00:56.12,Default,,0,0,0,,He picks up the locket.
Dialogue: 0,0:00:56.12,0:00:56.20,Default,,0,0,0,,He picks {\1c&HFF00&\u1}up{\r} the locket.
Dialogue: 0,0:00:56.20,0:00:56.22,Default,,0,0,0,,He picks up the locket.
Dialogue: 0,0:00:56.22,0:00:56.32,Default,,0,0,0,,He picks up {\1c&HFF00&\u1}the{\r} locket.
Dialogue: 0,0:00:56.32,0:00:56.36,Default,,0,0,0,,He picks up the locket.
Dialogue: 0,0:00:56.36,0:00:56.74,Default,,0,0,0,,He picks up the {\1c&HFF00&\u1}locket.{\r}
Dialogue: 0,0:00:57.12,0:00:57.22,Default,,0,0,0,,{\1c&HFF00&\u1}You{\r} kept it.
Dialogue: 0,0:00:57.22,0:00:57.27,Default,,0,0,0,,You kept it.
Dialogue: 0,0:00:57.27,0:00:57.47,Default,,0,0,0,,You {\1c&HFF00&\u1}kept{\r} it.
Dialogue: 0,0:00:57.47,0:00:57.55,Default,,0,0,0,,You kept it.
Dialogue: 0,0:00:57.55,0:00:57.63,Default,,0,0,0,,You kept {\1c&HFF00&\u1}it.{\r}
Dialogue: 0,0:00:58.87,0:00:58.99,Default,,0,0,0,,{\1c&HFF00&\u1}Oh,{\r} of course.
Dialogue: 0,0:00:58.99,0:00:59.28,Default,,0,0,0,,Oh, of course.
Dialogue: 0,0:00:59.28,0:00:59.58,Default,,0,0,0,,Oh, {\1c&HFF00&\u1}of{\r} course.
Dialogue: 0,0:00:59.58,0:00:59.68,Default,,0,0,0,,Oh, of course.
Dialogue: 0,0:00:59.68,0:00:59.96,Default,,0,0,0,,Oh, of {\1c&HFF00&\u1}course.{\r}

View File

@ -1,140 +0,0 @@
1
00:00:01,185 --> 00:00:03,273
Bella, Gloria, love.
2
00:00:03,754 --> 00:00:03,855
Oh.
3
00:00:04,496 --> 00:00:06,219
How are you?
4
00:00:06,723 --> 00:00:07,126
Oh, I'm OK.
5
00:00:08,412 --> 00:00:08,915
I will be.
6
00:00:09,215 --> 00:00:10,439
I said she could stay with us tomorrow
7
00:00:10,459 --> 00:00:11,351
just until she feels better.
8
00:00:11,733 --> 00:00:11,954
Yeah.
9
00:00:12,095 --> 00:00:13,238
Of course she can.
10
00:00:13,359 --> 00:00:15,012
No, things won't be for long.
11
00:00:15,173 --> 00:00:17,338
Well, you can stay as long as you want, my love.
12
00:00:17,621 --> 00:00:18,810
I've really missed you.
13
00:00:19,493 --> 00:00:19,795
Pops.
14
00:00:20,396 --> 00:00:21,679
Great to see you, love.
15
00:00:21,901 --> 00:00:23,213
Oh.
16
00:00:23,233 --> 00:00:24,378
All right, shall we get you off to bed then?
17
00:00:24,579 --> 00:00:26,052
You should have given me some warm.
18
00:00:26,313 --> 00:00:26,494
I know.
19
00:00:26,614 --> 00:00:28,940
I'll have to put the electric blanket on.
20
00:00:29,490 --> 00:00:29,817
I'm sorry.
21
00:00:29,980 --> 00:00:30,633
All right, Bella.
22
00:00:31,375 --> 00:00:31,897
Freezing up there.
23
00:00:31,897 --> 00:00:33,647
In a bedroom, Peter unpacks her suitcase.
24
00:00:34,268 --> 00:00:36,533
The middle-aged woman opens her green case.
25
00:00:38,135 --> 00:00:39,296
Do you want your PJs?
26
00:00:39,879 --> 00:00:40,181
Yeah.
27
00:00:42,388 --> 00:00:43,773
Lifting a bundle of pajamas,
28
00:00:44,073 --> 00:00:45,876
Peter finds a sheet of paper labeled
29
00:00:46,338 --> 00:00:49,249
Lancaster North Hospital discharge sheet.
30
00:00:50,291 --> 00:00:52,856
He closes the suitcase and brings Gloria the pajamas.
31
00:00:54,186 --> 00:00:54,831
There you go.
32
00:00:55,654 --> 00:00:55,895
Thank you.
33
00:00:55,895 --> 00:00:56,742
He picks up the locket.
34
00:00:57,124 --> 00:00:57,627
You kept it.
35
00:00:58,874 --> 00:00:59,960
Oh, of course.

View File

@ -1,624 +0,0 @@
1
00:00:01,185 --> 00:00:01,667
Bella,
2
00:00:02,651 --> 00:00:03,052
Gloria,
3
00:00:03,072 --> 00:00:03,273
love.
4
00:00:03,754 --> 00:00:03,855
Oh.
5
00:00:04,496 --> 00:00:04,716
How
6
00:00:05,778 --> 00:00:05,898
are
7
00:00:05,938 --> 00:00:06,219
you?
8
00:00:06,723 --> 00:00:06,803
Oh,
9
00:00:06,884 --> 00:00:07,045
I'm
10
00:00:07,085 --> 00:00:07,126
OK.
11
00:00:08,412 --> 00:00:08,452
I
12
00:00:08,492 --> 00:00:08,734
will
13
00:00:08,774 --> 00:00:08,915
be.
14
00:00:09,215 --> 00:00:09,296
I
15
00:00:09,336 --> 00:00:09,476
said
16
00:00:09,516 --> 00:00:09,596
she
17
00:00:09,616 --> 00:00:09,757
could
18
00:00:09,777 --> 00:00:09,937
stay
19
00:00:09,957 --> 00:00:10,078
with
20
00:00:10,098 --> 00:00:10,138
us
21
00:00:10,158 --> 00:00:10,439
tomorrow
22
00:00:10,459 --> 00:00:10,540
just
23
00:00:10,560 --> 00:00:10,682
until
24
00:00:10,702 --> 00:00:10,804
she
25
00:00:10,824 --> 00:00:11,047
feels
26
00:00:11,087 --> 00:00:11,351
better.
27
00:00:11,733 --> 00:00:11,954
Yeah.
28
00:00:12,095 --> 00:00:12,175
Of
29
00:00:12,195 --> 00:00:12,315
course
30
00:00:12,376 --> 00:00:12,636
she
31
00:00:12,716 --> 00:00:13,238
can.
32
00:00:13,359 --> 00:00:13,702
No,
33
00:00:13,823 --> 00:00:14,125
things
34
00:00:14,185 --> 00:00:14,387
won't
35
00:00:14,427 --> 00:00:14,528
be
36
00:00:14,589 --> 00:00:14,730
for
37
00:00:14,810 --> 00:00:15,012
long.
38
00:00:15,173 --> 00:00:15,413
Well,
39
00:00:15,433 --> 00:00:15,513
you
40
00:00:15,554 --> 00:00:15,694
can
41
00:00:15,754 --> 00:00:15,894
stay
42
00:00:15,955 --> 00:00:16,015
as
43
00:00:16,055 --> 00:00:16,195
long
44
00:00:16,235 --> 00:00:16,275
as
45
00:00:16,295 --> 00:00:16,416
you
46
00:00:16,456 --> 00:00:16,656
want,
47
00:00:16,717 --> 00:00:16,917
my
48
00:00:16,957 --> 00:00:17,338
love.
49
00:00:17,621 --> 00:00:17,863
I've
50
00:00:17,883 --> 00:00:18,145
really
51
00:00:18,185 --> 00:00:18,588
missed
52
00:00:18,629 --> 00:00:18,810
you.
53
00:00:19,493 --> 00:00:19,795
Pops.
54
00:00:20,396 --> 00:00:20,637
Great
55
00:00:20,657 --> 00:00:20,777
to
56
00:00:20,817 --> 00:00:21,138
see
57
00:00:21,158 --> 00:00:21,278
you,
58
00:00:21,318 --> 00:00:21,679
love.
59
00:00:21,901 --> 00:00:23,213
Oh.
60
00:00:23,233 --> 00:00:23,293
All
61
00:00:23,313 --> 00:00:23,414
right,
62
00:00:23,434 --> 00:00:23,554
shall
63
00:00:23,574 --> 00:00:23,655
we
64
00:00:23,675 --> 00:00:23,735
get
65
00:00:23,755 --> 00:00:23,815
you
66
00:00:23,835 --> 00:00:23,936
off
67
00:00:23,956 --> 00:00:24,036
to
68
00:00:24,056 --> 00:00:24,217
bed
69
00:00:24,237 --> 00:00:24,378
then?
70
00:00:24,579 --> 00:00:24,720
You
71
00:00:24,781 --> 00:00:24,983
should
72
00:00:25,023 --> 00:00:25,124
have
73
00:00:25,164 --> 00:00:25,285
given
74
00:00:25,346 --> 00:00:25,447
me
75
00:00:25,487 --> 00:00:25,669
some
76
00:00:25,810 --> 00:00:26,052
warm.
77
00:00:26,313 --> 00:00:26,373
I
78
00:00:26,393 --> 00:00:26,494
know.
79
00:00:26,614 --> 00:00:26,694
I'll
80
00:00:26,714 --> 00:00:26,815
have
81
00:00:26,835 --> 00:00:27,015
to
82
00:00:27,055 --> 00:00:27,416
put
83
00:00:27,476 --> 00:00:27,556
the
84
00:00:27,697 --> 00:00:28,078
electric
85
00:00:28,619 --> 00:00:28,840
blanket
86
00:00:28,900 --> 00:00:28,940
on.
87
00:00:29,490 --> 00:00:29,551
I'm
88
00:00:29,572 --> 00:00:29,817
sorry.
89
00:00:29,980 --> 00:00:30,082
All
90
00:00:30,102 --> 00:00:30,286
right,
91
00:00:30,429 --> 00:00:30,633
Bella.
92
00:00:31,375 --> 00:00:31,576
Freezing
93
00:00:31,616 --> 00:00:31,656
up
94
00:00:31,676 --> 00:00:31,897
there.
95
00:00:31,897 --> 00:00:31,937
In
96
00:00:31,957 --> 00:00:31,977
a
97
00:00:31,997 --> 00:00:32,138
bedroom,
98
00:00:32,198 --> 00:00:32,500
Peter
99
00:00:32,581 --> 00:00:32,983
unpacks
100
00:00:33,003 --> 00:00:33,103
her
101
00:00:33,164 --> 00:00:33,647
suitcase.
102
00:00:34,268 --> 00:00:34,348
The
103
00:00:34,388 --> 00:00:34,909
middle-aged
104
00:00:34,989 --> 00:00:35,270
woman
105
00:00:35,390 --> 00:00:35,671
opens
106
00:00:35,711 --> 00:00:35,811
her
107
00:00:35,851 --> 00:00:36,192
green
108
00:00:36,232 --> 00:00:36,533
case.
109
00:00:38,135 --> 00:00:38,255
Do
110
00:00:38,275 --> 00:00:38,355
you
111
00:00:38,375 --> 00:00:38,535
want
112
00:00:38,555 --> 00:00:38,736
your
113
00:00:38,876 --> 00:00:39,296
PJs?
114
00:00:39,879 --> 00:00:40,181
Yeah.
115
00:00:42,388 --> 00:00:42,689
Lifting
116
00:00:42,729 --> 00:00:42,749
a
117
00:00:42,809 --> 00:00:43,110
bundle
118
00:00:43,131 --> 00:00:43,191
of
119
00:00:43,251 --> 00:00:43,773
pajamas,
120
00:00:44,073 --> 00:00:44,314
Peter
121
00:00:44,374 --> 00:00:44,634
finds
122
00:00:44,674 --> 00:00:44,694
a
123
00:00:44,754 --> 00:00:44,955
sheet
124
00:00:44,995 --> 00:00:45,055
of
125
00:00:45,115 --> 00:00:45,456
paper
126
00:00:45,536 --> 00:00:45,876
labeled
127
00:00:46,338 --> 00:00:47,041
Lancaster
128
00:00:47,121 --> 00:00:47,382
North
129
00:00:47,442 --> 00:00:47,944
Hospital
130
00:00:48,266 --> 00:00:48,928
discharge
131
00:00:49,029 --> 00:00:49,249
sheet.
132
00:00:50,291 --> 00:00:50,371
He
133
00:00:50,412 --> 00:00:50,772
closes
134
00:00:50,812 --> 00:00:50,912
the
135
00:00:50,953 --> 00:00:51,393
suitcase
136
00:00:51,433 --> 00:00:51,514
and
137
00:00:51,534 --> 00:00:51,794
brings
138
00:00:51,834 --> 00:00:52,235
Gloria
139
00:00:52,255 --> 00:00:52,315
the
140
00:00:52,355 --> 00:00:52,856
pajamas.
141
00:00:54,186 --> 00:00:54,488
There
142
00:00:54,549 --> 00:00:54,771
you
143
00:00:54,791 --> 00:00:54,831
go.
144
00:00:55,654 --> 00:00:55,775
Thank
145
00:00:55,795 --> 00:00:55,895
you.
146
00:00:55,895 --> 00:00:55,936
He
147
00:00:55,956 --> 00:00:56,097
picks
148
00:00:56,117 --> 00:00:56,198
up
149
00:00:56,218 --> 00:00:56,319
the
150
00:00:56,359 --> 00:00:56,742
locket.
151
00:00:57,124 --> 00:00:57,225
You
152
00:00:57,265 --> 00:00:57,466
kept
153
00:00:57,547 --> 00:00:57,627
it.
154
00:00:58,874 --> 00:00:58,994
Oh,
155
00:00:59,276 --> 00:00:59,578
of
156
00:00:59,678 --> 00:00:59,960
course.

View File

@ -1,184 +0,0 @@
[Script Info]
ScriptType: v4.00+
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,24,&Hffffff,&Hffffff,&H0,&H0,0,0,0,0,100,100,0,0,1,1,0,2,10,10,10,0
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:0.56,0:00:0.96,Default,,0,0,0,,{\1c&HFF00&\u1}Weinlein{\r} von Hammersmann
Dialogue: 0,0:00:0.96,0:00:0.98,Default,,0,0,0,,Weinlein von Hammersmann
Dialogue: 0,0:00:0.98,0:00:1.21,Default,,0,0,0,,Weinlein {\1c&HFF00&\u1}von{\r} Hammersmann
Dialogue: 0,0:00:1.21,0:00:1.23,Default,,0,0,0,,Weinlein von Hammersmann
Dialogue: 0,0:00:1.23,0:00:1.87,Default,,0,0,0,,Weinlein von {\1c&HFF00&\u1}Hammersmann{\r}
Dialogue: 0,0:00:6.19,0:00:6.61,Default,,0,0,0,,{\1c&HFF00&\u1}Oberst{\r} Lande, es ist lange her
Dialogue: 0,0:00:6.61,0:00:6.63,Default,,0,0,0,,Oberst Lande, es ist lange her
Dialogue: 0,0:00:6.63,0:00:6.97,Default,,0,0,0,,Oberst {\1c&HFF00&\u1}Lande,{\r} es ist lange her
Dialogue: 0,0:00:6.97,0:00:7.01,Default,,0,0,0,,Oberst Lande, es ist lange her
Dialogue: 0,0:00:7.01,0:00:7.17,Default,,0,0,0,,Oberst Lande, {\1c&HFF00&\u1}es{\r} ist lange her
Dialogue: 0,0:00:7.17,0:00:7.21,Default,,0,0,0,,Oberst Lande, es ist lange her
Dialogue: 0,0:00:7.21,0:00:7.47,Default,,0,0,0,,Oberst Lande, es {\1c&HFF00&\u1}ist{\r} lange her
Dialogue: 0,0:00:7.47,0:00:7.49,Default,,0,0,0,,Oberst Lande, es ist lange her
Dialogue: 0,0:00:7.49,0:00:7.77,Default,,0,0,0,,Oberst Lande, es ist {\1c&HFF00&\u1}lange{\r} her
Dialogue: 0,0:00:7.77,0:00:7.83,Default,,0,0,0,,Oberst Lande, es ist lange her
Dialogue: 0,0:00:7.83,0:00:8.01,Default,,0,0,0,,Oberst Lande, es ist lange {\1c&HFF00&\u1}her{\r}
Dialogue: 0,0:00:8.01,0:00:8.17,Default,,0,0,0,,{\1c&HFF00&\u1}Schneide{\r} ich wie eh und je
Dialogue: 0,0:00:8.17,0:00:8.19,Default,,0,0,0,,Schneide ich wie eh und je
Dialogue: 0,0:00:8.19,0:00:8.25,Default,,0,0,0,,Schneide {\1c&HFF00&\u1}ich{\r} wie eh und je
Dialogue: 0,0:00:8.25,0:00:8.27,Default,,0,0,0,,Schneide ich wie eh und je
Dialogue: 0,0:00:8.27,0:00:8.33,Default,,0,0,0,,Schneide ich {\1c&HFF00&\u1}wie{\r} eh und je
Dialogue: 0,0:00:8.33,0:00:8.35,Default,,0,0,0,,Schneide ich wie eh und je
Dialogue: 0,0:00:8.35,0:00:8.39,Default,,0,0,0,,Schneide ich wie {\1c&HFF00&\u1}eh{\r} und je
Dialogue: 0,0:00:8.39,0:00:8.41,Default,,0,0,0,,Schneide ich wie eh und je
Dialogue: 0,0:00:8.41,0:00:8.47,Default,,0,0,0,,Schneide ich wie eh {\1c&HFF00&\u1}und{\r} je
Dialogue: 0,0:00:8.47,0:00:8.49,Default,,0,0,0,,Schneide ich wie eh und je
Dialogue: 0,0:00:8.49,0:00:8.53,Default,,0,0,0,,Schneide ich wie eh und {\1c&HFF00&\u1}je{\r}
Dialogue: 0,0:00:13.99,0:00:14.17,Default,,0,0,0,,{\1c&HFF00&\u1}Also{\r} was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.17,0:00:14.21,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.21,0:00:14.45,Default,,0,0,0,,Also {\1c&HFF00&\u1}was{\r} ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.45,0:00:14.47,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.47,0:00:14.59,Default,,0,0,0,,Also was {\1c&HFF00&\u1}ist{\r} mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.59,0:00:14.61,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.61,0:00:14.71,Default,,0,0,0,,Also was ist {\1c&HFF00&\u1}mit{\r} Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.71,0:00:14.75,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.75,0:00:14.91,Default,,0,0,0,,Also was ist mit {\1c&HFF00&\u1}Ihrem{\r} wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.91,0:00:14.93,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:14.93,0:00:15.55,Default,,0,0,0,,Also was ist mit Ihrem {\1c&HFF00&\u1}wunderschönen{\r} Bein geschehen?
Dialogue: 0,0:00:15.55,0:00:15.59,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:15.59,0:00:15.81,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen {\1c&HFF00&\u1}Bein{\r} geschehen?
Dialogue: 0,0:00:15.81,0:00:15.85,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein geschehen?
Dialogue: 0,0:00:15.85,0:00:16.23,Default,,0,0,0,,Also was ist mit Ihrem wunderschönen Bein {\1c&HFF00&\u1}geschehen?{\r}
Dialogue: 0,0:00:17.03,0:00:17.15,Default,,0,0,0,,{\1c&HFF00&\u1}Ein{\r} Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.15,0:00:17.17,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.17,0:00:17.74,Default,,0,0,0,,Ein {\1c&HFF00&\u1}Nebenprodukt{\r} der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.74,0:00:17.78,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.78,0:00:17.92,Default,,0,0,0,,Ein Nebenprodukt {\1c&HFF00&\u1}der{\r} Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.92,0:00:17.96,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:17.96,0:00:18.60,Default,,0,0,0,,Ein Nebenprodukt der {\1c&HFF00&\u1}Arschtritte,{\r} die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.60,0:00:18.64,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.64,0:00:18.70,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, {\1c&HFF00&\u1}die{\r} Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.70,0:00:18.74,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.74,0:00:18.82,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die {\1c&HFF00&\u1}Sie{\r} in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.82,0:00:18.86,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.86,0:00:18.92,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie {\1c&HFF00&\u1}in{\r} der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.92,0:00:18.94,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:18.94,0:00:19.02,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in {\1c&HFF00&\u1}der{\r} deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:19.02,0:00:19.04,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:19.04,0:00:19.32,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der {\1c&HFF00&\u1}deutschen{\r} Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:19.32,0:00:19.36,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:19.36,0:00:19.88,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen {\1c&HFF00&\u1}Filmwelt{\r} ausleihen, zweifelsohne
Dialogue: 0,0:00:19.88,0:00:19.94,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:19.94,0:00:20.48,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt {\1c&HFF00&\u1}ausleihen,{\r} zweifelsohne
Dialogue: 0,0:00:20.48,0:00:20.52,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
Dialogue: 0,0:00:20.52,0:00:21.22,Default,,0,0,0,,Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, {\1c&HFF00&\u1}zweifelsohne{\r}
Dialogue: 0,0:00:22.10,0:00:22.36,Default,,0,0,0,,{\1c&HFF00&\u1}Sparen{\r} Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.36,0:00:22.38,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.38,0:00:22.48,Default,,0,0,0,,Sparen {\1c&HFF00&\u1}Sie{\r} sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.48,0:00:22.50,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.50,0:00:22.62,Default,,0,0,0,,Sparen Sie {\1c&HFF00&\u1}sich{\r} Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.62,0:00:22.64,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.64,0:00:22.83,Default,,0,0,0,,Sparen Sie sich {\1c&HFF00&\u1}Ihre{\r} Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.83,0:00:22.87,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:22.87,0:00:23.43,Default,,0,0,0,,Sparen Sie sich Ihre {\1c&HFF00&\u1}Komplimente,{\r} Sie alter Hund
Dialogue: 0,0:00:23.43,0:00:23.45,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:23.45,0:00:23.55,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, {\1c&HFF00&\u1}Sie{\r} alter Hund
Dialogue: 0,0:00:23.55,0:00:23.59,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:23.59,0:00:23.89,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie {\1c&HFF00&\u1}alter{\r} Hund
Dialogue: 0,0:00:23.89,0:00:23.93,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter Hund
Dialogue: 0,0:00:23.93,0:00:24.05,Default,,0,0,0,,Sparen Sie sich Ihre Komplimente, Sie alter {\1c&HFF00&\u1}Hund{\r}
Dialogue: 0,0:00:24.57,0:00:24.65,Default,,0,0,0,,{\1c&HFF00&\u1}Ich{\r} kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:24.65,0:00:24.67,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:24.67,0:00:24.97,Default,,0,0,0,,Ich {\1c&HFF00&\u1}kenne{\r} zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:24.97,0:00:25.01,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.01,0:00:25.19,Default,,0,0,0,,Ich kenne {\1c&HFF00&\u1}zu{\r} viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.19,0:00:25.23,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.23,0:00:25.49,Default,,0,0,0,,Ich kenne zu {\1c&HFF00&\u1}viele{\r} von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.49,0:00:25.51,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.51,0:00:25.65,Default,,0,0,0,,Ich kenne zu viele {\1c&HFF00&\u1}von{\r} ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.65,0:00:25.69,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.69,0:00:25.95,Default,,0,0,0,,Ich kenne zu viele von {\1c&HFF00&\u1}ihren{\r} früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.95,0:00:25.99,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:25.99,0:00:26.41,Default,,0,0,0,,Ich kenne zu viele von ihren {\1c&HFF00&\u1}früheren{\r} Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:26.41,0:00:26.45,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:26.45,0:00:27.01,Default,,0,0,0,,Ich kenne zu viele von ihren früheren {\1c&HFF00&\u1}Eroberungen,{\r} als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.01,0:00:27.06,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.06,0:00:27.24,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, {\1c&HFF00&\u1}als{\r} dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.24,0:00:27.26,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.26,0:00:27.46,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als {\1c&HFF00&\u1}dass{\r} ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.46,0:00:27.50,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.50,0:00:27.60,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass {\1c&HFF00&\u1}ich{\r} in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.60,0:00:27.64,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.64,0:00:27.74,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich {\1c&HFF00&\u1}in{\r} ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.74,0:00:27.78,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:27.78,0:00:28.00,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in {\1c&HFF00&\u1}ihren{\r} Honigtopf treten könnte.
Dialogue: 0,0:00:28.00,0:00:28.04,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:28.04,0:00:28.64,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren {\1c&HFF00&\u1}Honigtopf{\r} treten könnte.
Dialogue: 0,0:00:28.64,0:00:28.68,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:28.68,0:00:28.96,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf {\1c&HFF00&\u1}treten{\r} könnte.
Dialogue: 0,0:00:28.96,0:00:29.00,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
Dialogue: 0,0:00:29.00,0:00:29.24,Default,,0,0,0,,Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten {\1c&HFF00&\u1}könnte.{\r}
Dialogue: 0,0:00:29.24,0:00:29.30,Default,,0,0,0,,{\1c&HFF00&\u1}Na{\r} im Ernst, was ist passiert?
Dialogue: 0,0:00:29.30,0:00:29.32,Default,,0,0,0,,Na im Ernst, was ist passiert?
Dialogue: 0,0:00:29.32,0:00:29.36,Default,,0,0,0,,Na {\1c&HFF00&\u1}im{\r} Ernst, was ist passiert?
Dialogue: 0,0:00:29.36,0:00:29.38,Default,,0,0,0,,Na im Ernst, was ist passiert?
Dialogue: 0,0:00:29.38,0:00:29.48,Default,,0,0,0,,Na im {\1c&HFF00&\u1}Ernst,{\r} was ist passiert?
Dialogue: 0,0:00:29.48,0:00:29.50,Default,,0,0,0,,Na im Ernst, was ist passiert?
Dialogue: 0,0:00:29.50,0:00:29.56,Default,,0,0,0,,Na im Ernst, {\1c&HFF00&\u1}was{\r} ist passiert?
Dialogue: 0,0:00:29.56,0:00:29.58,Default,,0,0,0,,Na im Ernst, was ist passiert?
Dialogue: 0,0:00:29.58,0:00:29.64,Default,,0,0,0,,Na im Ernst, was {\1c&HFF00&\u1}ist{\r} passiert?
Dialogue: 0,0:00:29.64,0:00:29.66,Default,,0,0,0,,Na im Ernst, was ist passiert?
Dialogue: 0,0:00:29.66,0:00:29.82,Default,,0,0,0,,Na im Ernst, was ist {\1c&HFF00&\u1}passiert?{\r}
Dialogue: 0,0:00:30.78,0:00:32.27,Default,,0,0,0,,{\1c&HFF00&\u1}Tja,{\r} ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.27,0:00:32.33,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.33,0:00:32.49,Default,,0,0,0,,Tja, {\1c&HFF00&\u1}ich{\r} habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.49,0:00:32.53,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.53,0:00:32.79,Default,,0,0,0,,Tja, ich {\1c&HFF00&\u1}habe{\r} mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.79,0:00:32.83,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:32.83,0:00:33.83,Default,,0,0,0,,Tja, ich habe {\1c&HFF00&\u1}mich,{\r} dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:33.83,0:00:33.85,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:33.85,0:00:34.57,Default,,0,0,0,,Tja, ich habe mich, {\1c&HFF00&\u1}dummerweise{\r} muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.57,0:00:34.59,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.59,0:00:34.73,Default,,0,0,0,,Tja, ich habe mich, dummerweise {\1c&HFF00&\u1}muss{\r} ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.73,0:00:34.77,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.77,0:00:34.89,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss {\1c&HFF00&\u1}ich{\r} eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.89,0:00:34.93,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:34.93,0:00:36.83,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich {\1c&HFF00&\u1}eingestehen,{\r} im Bergsteigen versucht.
Dialogue: 0,0:00:36.83,0:00:36.87,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:36.87,0:00:36.99,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, {\1c&HFF00&\u1}im{\r} Bergsteigen versucht.
Dialogue: 0,0:00:36.99,0:00:37.03,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:37.03,0:00:37.76,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im {\1c&HFF00&\u1}Bergsteigen{\r} versucht.
Dialogue: 0,0:00:37.76,0:00:37.78,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
Dialogue: 0,0:00:37.78,0:00:38.22,Default,,0,0,0,,Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen {\1c&HFF00&\u1}versucht.{\r}
Dialogue: 0,0:00:41.23,0:00:41.85,Default,,0,0,0,,{\1c&HFF00&\u1}Bergsteigen?{\r} Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:41.85,0:00:41.87,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:41.87,0:00:42.13,Default,,0,0,0,,Bergsteigen? {\1c&HFF00&\u1}Dabei{\r} haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.13,0:00:42.15,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.15,0:00:42.33,Default,,0,0,0,,Bergsteigen? Dabei {\1c&HFF00&\u1}haben{\r} sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.33,0:00:42.37,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.37,0:00:42.51,Default,,0,0,0,,Bergsteigen? Dabei haben {\1c&HFF00&\u1}sie{\r} ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.51,0:00:42.55,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.55,0:00:42.61,Default,,0,0,0,,Bergsteigen? Dabei haben sie {\1c&HFF00&\u1}ihr{\r} Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.61,0:00:42.63,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.63,0:00:42.77,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr {\1c&HFF00&\u1}Bein{\r} verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.77,0:00:42.81,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:42.81,0:00:43.23,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein {\1c&HFF00&\u1}verletzt{\r} beim Bergsteigen?
Dialogue: 0,0:00:43.23,0:00:43.27,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:43.27,0:00:43.39,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt {\1c&HFF00&\u1}beim{\r} Bergsteigen?
Dialogue: 0,0:00:43.39,0:00:43.45,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
Dialogue: 0,0:00:43.45,0:00:44.13,Default,,0,0,0,,Bergsteigen? Dabei haben sie ihr Bein verletzt beim {\1c&HFF00&\u1}Bergsteigen?{\r}
Dialogue: 0,0:00:44.54,0:00:44.62,Default,,0,0,0,,{\1c&HFF00&\u1}Ob{\r} sie es glauben oder nicht.
Dialogue: 0,0:00:44.62,0:00:44.66,Default,,0,0,0,,Ob sie es glauben oder nicht.
Dialogue: 0,0:00:44.66,0:00:44.72,Default,,0,0,0,,Ob {\1c&HFF00&\u1}sie{\r} es glauben oder nicht.
Dialogue: 0,0:00:44.72,0:00:44.74,Default,,0,0,0,,Ob sie es glauben oder nicht.
Dialogue: 0,0:00:44.74,0:00:44.80,Default,,0,0,0,,Ob sie {\1c&HFF00&\u1}es{\r} glauben oder nicht.
Dialogue: 0,0:00:44.80,0:00:44.82,Default,,0,0,0,,Ob sie es glauben oder nicht.
Dialogue: 0,0:00:44.82,0:00:45.14,Default,,0,0,0,,Ob sie es {\1c&HFF00&\u1}glauben{\r} oder nicht.
Dialogue: 0,0:00:45.14,0:00:45.16,Default,,0,0,0,,Ob sie es glauben oder nicht.
Dialogue: 0,0:00:45.16,0:00:45.46,Default,,0,0,0,,Ob sie es glauben {\1c&HFF00&\u1}oder{\r} nicht.
Dialogue: 0,0:00:45.46,0:00:45.50,Default,,0,0,0,,Ob sie es glauben oder nicht.
Dialogue: 0,0:00:45.50,0:00:45.66,Default,,0,0,0,,Ob sie es glauben oder {\1c&HFF00&\u1}nicht.{\r}

View File

@ -1,44 +0,0 @@
1
00:00:00,563 --> 00:00:01,869
Weinlein von Hammersmann
2
00:00:06,187 --> 00:00:08,013
Oberst Lande, es ist lange her
3
00:00:08,013 --> 00:00:08,534
Schneide ich wie eh und je
4
00:00:13,987 --> 00:00:16,234
Also was ist mit Ihrem wunderschönen Bein geschehen?
5
00:00:17,035 --> 00:00:21,218
Ein Nebenprodukt der Arschtritte, die Sie in der deutschen Filmwelt ausleihen, zweifelsohne
6
00:00:22,102 --> 00:00:24,051
Sparen Sie sich Ihre Komplimente, Sie alter Hund
7
00:00:24,572 --> 00:00:29,238
Ich kenne zu viele von ihren früheren Eroberungen, als dass ich in ihren Honigtopf treten könnte.
8
00:00:29,238 --> 00:00:29,821
Na im Ernst, was ist passiert?
9
00:00:30,783 --> 00:00:38,217
Tja, ich habe mich, dummerweise muss ich eingestehen, im Bergsteigen versucht.
10
00:00:41,226 --> 00:00:44,135
Bergsteigen? Dabei haben sie ihr Bein verletzt beim Bergsteigen?
11
00:00:44,535 --> 00:00:45,657
Ob sie es glauben oder nicht.

View File

@ -1,199 +0,0 @@
[Script Info]
ScriptType: v4.00+
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,24,&Hffffff,&Hffffff,&H0,&H0,0,0,0,0,100,100,0,0,1,1,0,2,10,10,10,0
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:0.77,0:00:1.07,Default,,0,0,0,,{\1c&HFF00&\u1}Lâchez,{\r} c'est bon.
Dialogue: 0,0:00:1.07,0:00:1.11,Default,,0,0,0,,Lâchez, c'est bon.
Dialogue: 0,0:00:1.11,0:00:1.21,Default,,0,0,0,,Lâchez, {\1c&HFF00&\u1}c'est{\r} bon.
Dialogue: 0,0:00:1.21,0:00:1.23,Default,,0,0,0,,Lâchez, c'est bon.
Dialogue: 0,0:00:1.23,0:00:1.31,Default,,0,0,0,,Lâchez, c'est {\1c&HFF00&\u1}bon.{\r}
Dialogue: 0,0:00:1.31,0:00:1.81,Default,,0,0,0,,{\1c&HFF00&\u1}Ça{\r} va?
Dialogue: 0,0:00:1.81,0:00:1.83,Default,,0,0,0,,Ça va?
Dialogue: 0,0:00:1.83,0:00:1.89,Default,,0,0,0,,Ça {\1c&HFF00&\u1}va?{\r}
Dialogue: 0,0:00:1.91,0:00:1.97,Default,,0,0,0,,{\1c&HFF00&\u1}Oui.{\r}
Dialogue: 0,0:00:1.97,0:00:3.27,Default,,0,0,0,,{\1c&HFF00&\u1}Merci{\r} beaucoup.
Dialogue: 0,0:00:3.27,0:00:3.29,Default,,0,0,0,,Merci beaucoup.
Dialogue: 0,0:00:3.29,0:00:3.49,Default,,0,0,0,,Merci {\1c&HFF00&\u1}beaucoup.{\r}
Dialogue: 0,0:00:4.36,0:00:4.58,Default,,0,0,0,,{\1c&HFF00&\u1}Chèque{\r} ou espèce?
Dialogue: 0,0:00:4.58,0:00:4.64,Default,,0,0,0,,Chèque ou espèce?
Dialogue: 0,0:00:4.64,0:00:4.72,Default,,0,0,0,,Chèque {\1c&HFF00&\u1}ou{\r} espèce?
Dialogue: 0,0:00:4.72,0:00:4.78,Default,,0,0,0,,Chèque ou espèce?
Dialogue: 0,0:00:4.78,0:00:5.04,Default,,0,0,0,,Chèque ou {\1c&HFF00&\u1}espèce?{\r}
Dialogue: 0,0:00:6.54,0:00:6.70,Default,,0,0,0,,{\1c&HFF00&\u1}J'ai{\r} laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:6.70,0:00:6.74,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:6.74,0:00:6.99,Default,,0,0,0,,J'ai {\1c&HFF00&\u1}laissé{\r} un chèque sur la commode, il est signé.
Dialogue: 0,0:00:6.99,0:00:7.03,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.03,0:00:7.09,Default,,0,0,0,,J'ai laissé {\1c&HFF00&\u1}un{\r} chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.09,0:00:7.13,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.13,0:00:7.33,Default,,0,0,0,,J'ai laissé un {\1c&HFF00&\u1}chèque{\r} sur la commode, il est signé.
Dialogue: 0,0:00:7.33,0:00:7.35,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.35,0:00:7.49,Default,,0,0,0,,J'ai laissé un chèque {\1c&HFF00&\u1}sur{\r} la commode, il est signé.
Dialogue: 0,0:00:7.49,0:00:7.51,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.51,0:00:7.59,Default,,0,0,0,,J'ai laissé un chèque sur {\1c&HFF00&\u1}la{\r} commode, il est signé.
Dialogue: 0,0:00:7.59,0:00:7.63,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.63,0:00:7.91,Default,,0,0,0,,J'ai laissé un chèque sur la {\1c&HFF00&\u1}commode,{\r} il est signé.
Dialogue: 0,0:00:7.91,0:00:7.99,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:7.99,0:00:8.03,Default,,0,0,0,,J'ai laissé un chèque sur la commode, {\1c&HFF00&\u1}il{\r} est signé.
Dialogue: 0,0:00:8.03,0:00:8.09,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:8.09,0:00:8.19,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il {\1c&HFF00&\u1}est{\r} signé.
Dialogue: 0,0:00:8.19,0:00:8.21,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est signé.
Dialogue: 0,0:00:8.21,0:00:8.39,Default,,0,0,0,,J'ai laissé un chèque sur la commode, il est {\1c&HFF00&\u1}signé.{\r}
Dialogue: 0,0:00:8.39,0:00:8.81,Default,,0,0,0,,{\1c&HFF00&\u1}Vous{\r} le remplirez.
Dialogue: 0,0:00:8.81,0:00:8.83,Default,,0,0,0,,Vous le remplirez.
Dialogue: 0,0:00:8.83,0:00:8.95,Default,,0,0,0,,Vous {\1c&HFF00&\u1}le{\r} remplirez.
Dialogue: 0,0:00:8.95,0:00:8.97,Default,,0,0,0,,Vous le remplirez.
Dialogue: 0,0:00:8.97,0:00:9.26,Default,,0,0,0,,Vous le {\1c&HFF00&\u1}remplirez.{\r}
Dialogue: 0,0:00:9.28,0:00:9.34,Default,,0,0,0,,{\1c&HFF00&\u1}OK.{\r}
Dialogue: 0,0:00:9.36,0:00:9.40,Default,,0,0,0,,{\1c&HFF00&\u1}Oh!{\r}
Dialogue: 0,0:00:12.41,0:00:12.51,Default,,0,0,0,,{\1c&HFF00&\u1}Ouh{\r} là!
Dialogue: 0,0:00:12.51,0:00:12.53,Default,,0,0,0,,Ouh là!
Dialogue: 0,0:00:12.53,0:00:12.59,Default,,0,0,0,,Ouh {\1c&HFF00&\u1}là!{\r}
Dialogue: 0,0:00:12.59,0:00:12.73,Default,,0,0,0,,{\1c&HFF00&\u1}Venez.{\r}
Dialogue: 0,0:00:14.45,0:00:14.63,Default,,0,0,0,,{\1c&HFF00&\u1}Merci.{\r}
Dialogue: 0,0:00:14.65,0:00:14.76,Default,,0,0,0,,{\1c&HFF00&\u1}Ah!{\r}
Dialogue: 0,0:00:15.64,0:00:16.51,Default,,0,0,0,,{\1c&HFF00&\u1}C'est{\r} qui?
Dialogue: 0,0:00:16.51,0:00:16.53,Default,,0,0,0,,C'est qui?
Dialogue: 0,0:00:16.53,0:00:16.63,Default,,0,0,0,,C'est {\1c&HFF00&\u1}qui?{\r}
Dialogue: 0,0:00:20.00,0:00:22.85,Default,,0,0,0,,{\1c&HFF00&\u1}C'est{\r} pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:22.85,0:00:22.87,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:22.87,0:00:22.93,Default,,0,0,0,,C'est {\1c&HFF00&\u1}pas{\r} vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:22.93,0:00:22.95,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:22.95,0:00:23.15,Default,,0,0,0,,C'est pas {\1c&HFF00&\u1}vrai,{\r} qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.15,0:00:23.17,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.17,0:00:23.35,Default,,0,0,0,,C'est pas vrai, {\1c&HFF00&\u1}qu'est-ce{\r} qu'il fout ici, ce con?
Dialogue: 0,0:00:23.35,0:00:23.37,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.37,0:00:23.51,Default,,0,0,0,,C'est pas vrai, qu'est-ce {\1c&HFF00&\u1}qu'il{\r} fout ici, ce con?
Dialogue: 0,0:00:23.51,0:00:23.53,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.53,0:00:23.67,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il {\1c&HFF00&\u1}fout{\r} ici, ce con?
Dialogue: 0,0:00:23.67,0:00:23.73,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.73,0:00:23.95,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout {\1c&HFF00&\u1}ici,{\r} ce con?
Dialogue: 0,0:00:23.95,0:00:23.99,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:23.99,0:00:24.11,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, {\1c&HFF00&\u1}ce{\r} con?
Dialogue: 0,0:00:24.11,0:00:24.15,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
Dialogue: 0,0:00:24.15,0:00:24.23,Default,,0,0,0,,C'est pas vrai, qu'est-ce qu'il fout ici, ce {\1c&HFF00&\u1}con?{\r}
Dialogue: 0,0:00:24.51,0:00:24.96,Default,,0,0,0,,{\1c&HFF00&\u1}Excusez-moi,{\r} mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:24.96,0:00:24.98,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:24.98,0:00:25.06,Default,,0,0,0,,Excusez-moi, {\1c&HFF00&\u1}mais{\r} je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.06,0:00:25.08,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.08,0:00:25.12,Default,,0,0,0,,Excusez-moi, mais {\1c&HFF00&\u1}je{\r} crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.12,0:00:25.14,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.14,0:00:25.26,Default,,0,0,0,,Excusez-moi, mais je {\1c&HFF00&\u1}crois{\r} que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.26,0:00:25.28,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.28,0:00:25.34,Default,,0,0,0,,Excusez-moi, mais je crois {\1c&HFF00&\u1}que{\r} j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.34,0:00:25.36,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.36,0:00:25.42,Default,,0,0,0,,Excusez-moi, mais je crois que {\1c&HFF00&\u1}j'ai{\r} oublié mon sac chez vous.
Dialogue: 0,0:00:25.42,0:00:25.44,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.44,0:00:25.60,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai {\1c&HFF00&\u1}oublié{\r} mon sac chez vous.
Dialogue: 0,0:00:25.60,0:00:25.62,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.62,0:00:25.76,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié {\1c&HFF00&\u1}mon{\r} sac chez vous.
Dialogue: 0,0:00:25.76,0:00:25.78,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.78,0:00:25.94,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon {\1c&HFF00&\u1}sac{\r} chez vous.
Dialogue: 0,0:00:25.94,0:00:25.96,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:25.96,0:00:26.04,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac {\1c&HFF00&\u1}chez{\r} vous.
Dialogue: 0,0:00:26.04,0:00:26.06,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
Dialogue: 0,0:00:26.06,0:00:26.18,Default,,0,0,0,,Excusez-moi, mais je crois que j'ai oublié mon sac chez {\1c&HFF00&\u1}vous.{\r}
Dialogue: 0,0:00:26.18,0:00:26.30,Default,,0,0,0,,{\1c&HFF00&\u1}Ça{\r} va?
Dialogue: 0,0:00:26.30,0:00:26.32,Default,,0,0,0,,Ça va?
Dialogue: 0,0:00:26.32,0:00:26.36,Default,,0,0,0,,Ça {\1c&HFF00&\u1}va?{\r}
Dialogue: 0,0:00:31.04,0:00:31.24,Default,,0,0,0,,{\1c&HFF00&\u1}Attendez.{\r}
Dialogue: 0,0:00:36.81,0:00:36.97,Default,,0,0,0,,{\1c&HFF00&\u1}Tout{\r} à l'heure, là, c'était...
Dialogue: 0,0:00:36.97,0:00:37.01,Default,,0,0,0,,Tout à l'heure, là, c'était...
Dialogue: 0,0:00:37.01,0:00:37.05,Default,,0,0,0,,Tout {\1c&HFF00&\u1}à{\r} l'heure, là, c'était...
Dialogue: 0,0:00:37.05,0:00:37.09,Default,,0,0,0,,Tout à l'heure, là, c'était...
Dialogue: 0,0:00:37.09,0:00:37.35,Default,,0,0,0,,Tout à {\1c&HFF00&\u1}l'heure,{\r} là, c'était...
Dialogue: 0,0:00:37.35,0:00:37.39,Default,,0,0,0,,Tout à l'heure, là, c'était...
Dialogue: 0,0:00:37.39,0:00:37.98,Default,,0,0,0,,Tout à l'heure, {\1c&HFF00&\u1}là,{\r} c'était...
Dialogue: 0,0:00:37.98,0:00:38.02,Default,,0,0,0,,Tout à l'heure, là, c'était...
Dialogue: 0,0:00:38.02,0:00:38.28,Default,,0,0,0,,Tout à l'heure, là, {\1c&HFF00&\u1}c'était...{\r}
Dialogue: 0,0:00:38.28,0:00:38.36,Default,,0,0,0,,{\1c&HFF00&\u1}Vous?{\r}
Dialogue: 0,0:00:39.12,0:00:39.24,Default,,0,0,0,,{\1c&HFF00&\u1}Vous?{\r} Pas...
Dialogue: 0,0:00:39.24,0:00:39.30,Default,,0,0,0,,Vous? Pas...
Dialogue: 0,0:00:39.30,0:00:39.42,Default,,0,0,0,,Vous? {\1c&HFF00&\u1}Pas...{\r}
Dialogue: 0,0:00:39.42,0:00:39.53,Default,,0,0,0,,{\1c&HFF00&\u1}Pas{\r} lui? Vous?
Dialogue: 0,0:00:39.53,0:00:39.55,Default,,0,0,0,,Pas lui? Vous?
Dialogue: 0,0:00:39.55,0:00:39.61,Default,,0,0,0,,Pas {\1c&HFF00&\u1}lui?{\r} Vous?
Dialogue: 0,0:00:39.61,0:00:39.63,Default,,0,0,0,,Pas lui? Vous?
Dialogue: 0,0:00:39.63,0:00:39.71,Default,,0,0,0,,Pas lui? {\1c&HFF00&\u1}Vous?{\r}
Dialogue: 0,0:00:44.19,0:00:44.35,Default,,0,0,0,,{\1c&HFF00&\u1}Vous{\r} avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.35,0:00:44.39,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.39,0:00:44.62,Default,,0,0,0,,Vous {\1c&HFF00&\u1}avez{\r} tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.62,0:00:44.64,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.64,0:00:44.80,Default,,0,0,0,,Vous avez {\1c&HFF00&\u1}tout{\r} à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.80,0:00:44.82,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.82,0:00:44.84,Default,,0,0,0,,Vous avez tout {\1c&HFF00&\u1}à{\r} fait raison, M. Xenakis.
Dialogue: 0,0:00:44.84,0:00:44.90,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:44.90,0:00:45.04,Default,,0,0,0,,Vous avez tout à {\1c&HFF00&\u1}fait{\r} raison, M. Xenakis.
Dialogue: 0,0:00:45.04,0:00:45.08,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:45.08,0:00:45.38,Default,,0,0,0,,Vous avez tout à fait {\1c&HFF00&\u1}raison,{\r} M. Xenakis.
Dialogue: 0,0:00:45.38,0:00:45.42,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:45.42,0:00:45.62,Default,,0,0,0,,Vous avez tout à fait raison, {\1c&HFF00&\u1}M.{\r} Xenakis.
Dialogue: 0,0:00:45.62,0:00:45.68,Default,,0,0,0,,Vous avez tout à fait raison, M. Xenakis.
Dialogue: 0,0:00:45.68,0:00:45.98,Default,,0,0,0,,Vous avez tout à fait raison, M. {\1c&HFF00&\u1}Xenakis.{\r}
Dialogue: 0,0:00:46.75,0:00:47.13,Default,,0,0,0,,{\1c&HFF00&\u1}Malek{\r} est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.13,0:00:47.15,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.15,0:00:47.27,Default,,0,0,0,,Malek {\1c&HFF00&\u1}est{\r} à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.27,0:00:47.31,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.31,0:00:47.37,Default,,0,0,0,,Malek est {\1c&HFF00&\u1}à{\r} l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.37,0:00:47.39,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.39,0:00:47.75,Default,,0,0,0,,Malek est à {\1c&HFF00&\u1}l'interne{\r} brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.75,0:00:47.79,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:47.79,0:00:48.07,Default,,0,0,0,,Malek est à l'interne {\1c&HFF00&\u1}brillant,{\r} qui apprend le métier avec moi.
Dialogue: 0,0:00:48.07,0:00:48.11,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.11,0:00:48.19,Default,,0,0,0,,Malek est à l'interne brillant, {\1c&HFF00&\u1}qui{\r} apprend le métier avec moi.
Dialogue: 0,0:00:48.19,0:00:48.23,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.23,0:00:48.44,Default,,0,0,0,,Malek est à l'interne brillant, qui {\1c&HFF00&\u1}apprend{\r} le métier avec moi.
Dialogue: 0,0:00:48.44,0:00:48.46,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.46,0:00:48.52,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend {\1c&HFF00&\u1}le{\r} métier avec moi.
Dialogue: 0,0:00:48.52,0:00:48.54,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.54,0:00:48.74,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le {\1c&HFF00&\u1}métier{\r} avec moi.
Dialogue: 0,0:00:48.74,0:00:48.76,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.76,0:00:48.88,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier {\1c&HFF00&\u1}avec{\r} moi.
Dialogue: 0,0:00:48.88,0:00:48.90,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec moi.
Dialogue: 0,0:00:48.90,0:00:49.00,Default,,0,0,0,,Malek est à l'interne brillant, qui apprend le métier avec {\1c&HFF00&\u1}moi.{\r}
Dialogue: 0,0:00:49.02,0:00:49.06,Default,,0,0,0,,{\1c&HFF00&\u1}Ah!{\r}
Dialogue: 0,0:00:49.06,0:00:49.20,Default,,0,0,0,,{\1c&HFF00&\u1}C'est{\r} vrai.
Dialogue: 0,0:00:49.20,0:00:49.22,Default,,0,0,0,,C'est vrai.
Dialogue: 0,0:00:49.22,0:00:49.30,Default,,0,0,0,,C'est {\1c&HFF00&\u1}vrai.{\r}
Dialogue: 0,0:00:49.30,0:00:49.44,Default,,0,0,0,,{\1c&HFF00&\u1}Bien.{\r}
Dialogue: 0,0:00:52.93,0:00:53.21,Default,,0,0,0,,{\1c&HFF00&\u1}Justement,{\r} y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.21,0:00:53.33,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.33,0:00:53.35,Default,,0,0,0,,Justement, {\1c&HFF00&\u1}y{\r} a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.35,0:00:53.37,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.37,0:00:53.39,Default,,0,0,0,,Justement, y {\1c&HFF00&\u1}a{\r} la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.39,0:00:53.43,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.43,0:00:53.49,Default,,0,0,0,,Justement, y a {\1c&HFF00&\u1}la{\r} famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.49,0:00:53.51,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.51,0:00:53.85,Default,,0,0,0,,Justement, y a la {\1c&HFF00&\u1}famille{\r} Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.85,0:00:53.89,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:53.89,0:00:54.16,Default,,0,0,0,,Justement, y a la famille {\1c&HFF00&\u1}Boboun{\r} qui m'attend pour une consultation.
Dialogue: 0,0:00:54.16,0:00:54.18,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:54.18,0:00:54.26,Default,,0,0,0,,Justement, y a la famille Boboun {\1c&HFF00&\u1}qui{\r} m'attend pour une consultation.
Dialogue: 0,0:00:54.26,0:00:54.30,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:54.30,0:00:54.52,Default,,0,0,0,,Justement, y a la famille Boboun qui {\1c&HFF00&\u1}m'attend{\r} pour une consultation.
Dialogue: 0,0:00:54.52,0:00:54.54,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:54.54,0:00:54.64,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend {\1c&HFF00&\u1}pour{\r} une consultation.
Dialogue: 0,0:00:54.64,0:00:54.68,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:54.68,0:00:54.90,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour {\1c&HFF00&\u1}une{\r} consultation.
Dialogue: 0,0:00:54.90,0:00:54.94,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une consultation.
Dialogue: 0,0:00:54.94,0:00:55.34,Default,,0,0,0,,Justement, y a la famille Boboun qui m'attend pour une {\1c&HFF00&\u1}consultation.{\r}
Dialogue: 0,0:00:55.58,0:00:55.64,Default,,0,0,0,,{\1c&HFF00&\u1}Qui?{\r}
Dialogue: 0,0:00:56.53,0:00:56.79,Default,,0,0,0,,{\1c&HFF00&\u1}Faisons{\r} pas attendre les bobounes, allez.
Dialogue: 0,0:00:56.79,0:00:56.81,Default,,0,0,0,,Faisons pas attendre les bobounes, allez.
Dialogue: 0,0:00:56.81,0:00:56.91,Default,,0,0,0,,Faisons {\1c&HFF00&\u1}pas{\r} attendre les bobounes, allez.
Dialogue: 0,0:00:56.91,0:00:56.93,Default,,0,0,0,,Faisons pas attendre les bobounes, allez.
Dialogue: 0,0:00:56.93,0:00:57.15,Default,,0,0,0,,Faisons pas {\1c&HFF00&\u1}attendre{\r} les bobounes, allez.
Dialogue: 0,0:00:57.15,0:00:57.19,Default,,0,0,0,,Faisons pas attendre les bobounes, allez.
Dialogue: 0,0:00:57.19,0:00:57.25,Default,,0,0,0,,Faisons pas attendre {\1c&HFF00&\u1}les{\r} bobounes, allez.
Dialogue: 0,0:00:57.25,0:00:57.27,Default,,0,0,0,,Faisons pas attendre les bobounes, allez.
Dialogue: 0,0:00:57.27,0:00:57.59,Default,,0,0,0,,Faisons pas attendre les {\1c&HFF00&\u1}bobounes,{\r} allez.
Dialogue: 0,0:00:57.59,0:00:57.61,Default,,0,0,0,,Faisons pas attendre les bobounes, allez.
Dialogue: 0,0:00:57.61,0:00:57.75,Default,,0,0,0,,Faisons pas attendre les bobounes, {\1c&HFF00&\u1}allez.{\r}

View File

@ -1,120 +0,0 @@
1
00:00:00,765 --> 00:00:01,309
Lâchez, c'est bon.
2
00:00:01,309 --> 00:00:01,891
Ça va?
3
00:00:01,911 --> 00:00:01,971
Oui.
4
00:00:01,971 --> 00:00:03,495
Merci beaucoup.
5
00:00:04,356 --> 00:00:05,037
Chèque ou espèce?
6
00:00:06,544 --> 00:00:08,393
J'ai laissé un chèque sur la commode, il est signé.
7
00:00:08,393 --> 00:00:09,255
Vous le remplirez.
8
00:00:09,275 --> 00:00:09,335
OK.
9
00:00:09,355 --> 00:00:09,395
Oh!
10
00:00:12,410 --> 00:00:12,590
Ouh là!
11
00:00:12,590 --> 00:00:12,731
Venez.
12
00:00:14,454 --> 00:00:14,635
Merci.
13
00:00:14,655 --> 00:00:14,755
Ah!
14
00:00:15,640 --> 00:00:16,626
C'est qui?
15
00:00:20,000 --> 00:00:24,234
C'est pas vrai, qu'est-ce qu'il fout ici, ce con?
16
00:00:24,515 --> 00:00:26,177
Excusez-moi, mais je crois que j'ai oublié mon sac chez vous.
17
00:00:26,177 --> 00:00:26,359
Ça va?
18
00:00:31,040 --> 00:00:31,241
Attendez.
19
00:00:36,813 --> 00:00:38,278
Tout à l'heure, là, c'était...
20
00:00:38,278 --> 00:00:38,359
Vous?
21
00:00:39,123 --> 00:00:39,425
Vous? Pas...
22
00:00:39,425 --> 00:00:39,706
Pas lui? Vous?
23
00:00:44,194 --> 00:00:45,980
Vous avez tout à fait raison, M. Xenakis.
24
00:00:46,745 --> 00:00:49,000
Malek est à l'interne brillant, qui apprend le métier avec moi.
25
00:00:49,020 --> 00:00:49,061
Ah!
26
00:00:49,061 --> 00:00:49,303
C'est vrai.
27
00:00:49,303 --> 00:00:49,443
Bien.
28
00:00:52,932 --> 00:00:55,338
Justement, y a la famille Boboun qui m'attend pour une consultation.
29
00:00:55,581 --> 00:00:55,642
Qui?
30
00:00:56,527 --> 00:00:57,753
Faisons pas attendre les bobounes, allez.

View File

@ -1,177 +0,0 @@
[Script Info]
ScriptType: v4.00+
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,24,&Hffffff,&Hffffff,&H0,&H0,0,0,0,0,100,100,0,0,1,1,0,2,10,10,10,0
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:1.20,0:00:1.62,Default,,0,0,0,,{\1c&HFF00&\u1}Signore,{\r} è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:1.62,0:00:1.64,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:1.64,0:00:1.66,Default,,0,0,0,,Signore, {\1c&HFF00&\u1}è{\r} un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:1.66,0:00:1.72,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:1.72,0:00:2.12,Default,,0,0,0,,Signore, è {\1c&HFF00&\u1}un{\r} piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:2.12,0:00:2.18,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:2.18,0:00:2.72,Default,,0,0,0,,Signore, è un {\1c&HFF00&\u1}piacere,{\r} gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:2.72,0:00:3.33,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:3.33,0:00:3.45,Default,,0,0,0,,Signore, è un piacere, {\1c&HFF00&\u1}gli{\r} amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:3.45,0:00:3.49,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:3.49,0:00:3.79,Default,,0,0,0,,Signore, è un piacere, gli {\1c&HFF00&\u1}amici{\r} della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:3.79,0:00:3.83,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:3.83,0:00:4.01,Default,,0,0,0,,Signore, è un piacere, gli amici {\1c&HFF00&\u1}della{\r} vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.01,0:00:4.05,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.05,0:00:4.35,Default,,0,0,0,,Signore, è un piacere, gli amici della {\1c&HFF00&\u1}vedetta{\r} ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.35,0:00:4.39,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.39,0:00:4.79,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta {\1c&HFF00&\u1}ammirata{\r} da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.79,0:00:4.85,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.85,0:00:4.95,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata {\1c&HFF00&\u1}da{\r} tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.95,0:00:4.97,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:4.97,0:00:5.15,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da {\1c&HFF00&\u1}tutti{\r} noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.15,0:00:5.21,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.21,0:00:5.33,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti {\1c&HFF00&\u1}noi,{\r} questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.33,0:00:5.41,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.41,0:00:5.61,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, {\1c&HFF00&\u1}questa{\r} gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.61,0:00:5.79,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:5.79,0:00:6.07,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa {\1c&HFF00&\u1}gemma{\r} propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.07,0:00:6.13,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.13,0:00:6.51,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma {\1c&HFF00&\u1}propria{\r} della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.51,0:00:6.57,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.57,0:00:6.77,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria {\1c&HFF00&\u1}della{\r} nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.77,0:00:6.81,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.81,0:00:6.99,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della {\1c&HFF00&\u1}nostra{\r} cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:6.99,0:00:7.07,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:7.07,0:00:7.35,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra {\1c&HFF00&\u1}cultura,{\r} saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:7.35,0:00:7.41,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:7.41,0:00:7.73,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, {\1c&HFF00&\u1}saranno{\r} naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:7.73,0:00:7.87,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:7.87,0:00:8.47,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno {\1c&HFF00&\u1}naturalmente{\r} accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:8.47,0:00:8.55,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:8.55,0:00:8.85,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente {\1c&HFF00&\u1}accolti{\r} sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:8.85,0:00:8.91,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:8.91,0:00:9.07,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti {\1c&HFF00&\u1}sotto{\r} la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.07,0:00:9.13,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.13,0:00:9.19,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto {\1c&HFF00&\u1}la{\r} mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.19,0:00:9.23,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.23,0:00:9.33,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la {\1c&HFF00&\u1}mia{\r} protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.33,0:00:9.37,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.37,0:00:9.82,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia {\1c&HFF00&\u1}protezione{\r} per la durata del loro soggiorno.
Dialogue: 0,0:00:9.82,0:00:9.88,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:9.88,0:00:9.96,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione {\1c&HFF00&\u1}per{\r} la durata del loro soggiorno.
Dialogue: 0,0:00:9.96,0:00:10.02,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:10.02,0:00:10.08,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per {\1c&HFF00&\u1}la{\r} durata del loro soggiorno.
Dialogue: 0,0:00:10.08,0:00:10.12,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:10.12,0:00:10.44,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la {\1c&HFF00&\u1}durata{\r} del loro soggiorno.
Dialogue: 0,0:00:10.44,0:00:10.50,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:10.50,0:00:10.60,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata {\1c&HFF00&\u1}del{\r} loro soggiorno.
Dialogue: 0,0:00:10.60,0:00:10.62,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:10.62,0:00:10.78,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del {\1c&HFF00&\u1}loro{\r} soggiorno.
Dialogue: 0,0:00:10.78,0:00:10.86,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
Dialogue: 0,0:00:10.86,0:00:11.30,Default,,0,0,0,,Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro {\1c&HFF00&\u1}soggiorno.{\r}
Dialogue: 0,0:00:13.00,0:00:13.12,Default,,0,0,0,,{\1c&HFF00&\u1}Grazie.{\r}
Dialogue: 0,0:00:15.60,0:00:17.89,Default,,0,0,0,,{\1c&HFF00&\u1}Gorlami?{\r} Lo pronuncio correttamente?
Dialogue: 0,0:00:17.89,0:00:18.75,Default,,0,0,0,,Gorlami? Lo pronuncio correttamente?
Dialogue: 0,0:00:18.75,0:00:18.83,Default,,0,0,0,,Gorlami? {\1c&HFF00&\u1}Lo{\r} pronuncio correttamente?
Dialogue: 0,0:00:18.83,0:00:18.87,Default,,0,0,0,,Gorlami? Lo pronuncio correttamente?
Dialogue: 0,0:00:18.87,0:00:19.31,Default,,0,0,0,,Gorlami? Lo {\1c&HFF00&\u1}pronuncio{\r} correttamente?
Dialogue: 0,0:00:19.31,0:00:19.37,Default,,0,0,0,,Gorlami? Lo pronuncio correttamente?
Dialogue: 0,0:00:19.37,0:00:19.87,Default,,0,0,0,,Gorlami? Lo pronuncio {\1c&HFF00&\u1}correttamente?{\r}
Dialogue: 0,0:00:21.58,0:00:21.74,Default,,0,0,0,,{\1c&HFF00&\u1}Sì,{\r} corretto.
Dialogue: 0,0:00:21.74,0:00:22.34,Default,,0,0,0,,Sì, corretto.
Dialogue: 0,0:00:22.34,0:00:22.72,Default,,0,0,0,,Sì, {\1c&HFF00&\u1}corretto.{\r}
Dialogue: 0,0:00:23.54,0:00:24.82,Default,,0,0,0,,{\1c&HFF00&\u1}Gorlami?{\r} Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:24.82,0:00:25.45,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:25.45,0:00:25.53,Default,,0,0,0,,Gorlami? {\1c&HFF00&\u1}Per{\r} cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:25.53,0:00:25.61,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:25.61,0:00:26.17,Default,,0,0,0,,Gorlami? Per {\1c&HFF00&\u1}cortesia,{\r} me lo ripeto ancora.
Dialogue: 0,0:00:26.17,0:00:26.39,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:26.39,0:00:26.45,Default,,0,0,0,,Gorlami? Per cortesia, {\1c&HFF00&\u1}me{\r} lo ripeto ancora.
Dialogue: 0,0:00:26.45,0:00:26.49,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:26.49,0:00:26.55,Default,,0,0,0,,Gorlami? Per cortesia, me {\1c&HFF00&\u1}lo{\r} ripeto ancora.
Dialogue: 0,0:00:26.55,0:00:26.61,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:26.61,0:00:27.05,Default,,0,0,0,,Gorlami? Per cortesia, me lo {\1c&HFF00&\u1}ripeto{\r} ancora.
Dialogue: 0,0:00:27.05,0:00:27.11,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto ancora.
Dialogue: 0,0:00:27.11,0:00:27.49,Default,,0,0,0,,Gorlami? Per cortesia, me lo ripeto {\1c&HFF00&\u1}ancora.{\r}
Dialogue: 0,0:00:27.55,0:00:28.79,Default,,0,0,0,,{\1c&HFF00&\u1}ancora{\r} gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:28.79,0:00:28.83,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:28.83,0:00:30.96,Default,,0,0,0,,ancora {\1c&HFF00&\u1}gourlami{\r} scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:30.96,0:00:31.02,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.02,0:00:31.36,Default,,0,0,0,,ancora gourlami {\1c&HFF00&\u1}scusi{\r} con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.36,0:00:31.46,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.46,0:00:31.66,Default,,0,0,0,,ancora gourlami scusi {\1c&HFF00&\u1}con{\r} me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.66,0:00:31.72,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.72,0:00:31.88,Default,,0,0,0,,ancora gourlami scusi con {\1c&HFF00&\u1}me{\r} gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:31.88,0:00:33.46,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:33.46,0:00:34.12,Default,,0,0,0,,ancora gourlami scusi con me {\1c&HFF00&\u1}gourlami{\r} ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:34.12,0:00:34.84,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:34.84,0:00:35.20,Default,,0,0,0,,ancora gourlami scusi con me gourlami {\1c&HFF00&\u1}ancora{\r} una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:35.20,0:00:35.32,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:35.32,0:00:35.44,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora {\1c&HFF00&\u1}una{\r} volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:35.44,0:00:35.48,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:35.48,0:00:35.82,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una {\1c&HFF00&\u1}volta{\r} gourlami e come si chiama lei antonio
Dialogue: 0,0:00:35.82,0:00:39.16,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:39.16,0:00:39.72,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta {\1c&HFF00&\u1}gourlami{\r} e come si chiama lei antonio
Dialogue: 0,0:00:39.72,0:00:40.96,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:40.96,0:00:41.14,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami {\1c&HFF00&\u1}e{\r} come si chiama lei antonio
Dialogue: 0,0:00:41.14,0:00:41.20,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:41.20,0:00:41.34,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e {\1c&HFF00&\u1}come{\r} si chiama lei antonio
Dialogue: 0,0:00:41.34,0:00:41.38,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:41.38,0:00:41.46,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come {\1c&HFF00&\u1}si{\r} chiama lei antonio
Dialogue: 0,0:00:41.46,0:00:41.50,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:41.50,0:00:41.70,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si {\1c&HFF00&\u1}chiama{\r} lei antonio
Dialogue: 0,0:00:41.70,0:00:41.80,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:41.80,0:00:42.06,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama {\1c&HFF00&\u1}lei{\r} antonio
Dialogue: 0,0:00:42.06,0:00:43.44,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
Dialogue: 0,0:00:43.44,0:00:43.98,Default,,0,0,0,,ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei {\1c&HFF00&\u1}antonio{\r}
Dialogue: 0,0:00:44.36,0:00:45.14,Default,,0,0,0,,{\1c&HFF00&\u1}margarete{\r} ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:45.14,0:00:46.56,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:46.56,0:00:46.85,Default,,0,0,0,,margarete {\1c&HFF00&\u1}ancora{\r} margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:46.85,0:00:47.85,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:47.85,0:00:49.15,Default,,0,0,0,,margarete ancora {\1c&HFF00&\u1}margarete{\r} un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:49.15,0:00:49.43,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:49.43,0:00:49.85,Default,,0,0,0,,margarete ancora margarete {\1c&HFF00&\u1}un'altra{\r} volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:49.85,0:00:49.91,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:49.91,0:00:50.13,Default,,0,0,0,,margarete ancora margarete un'altra {\1c&HFF00&\u1}volta{\r} ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:50.13,0:00:50.19,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:50.19,0:00:50.37,Default,,0,0,0,,margarete ancora margarete un'altra volta {\1c&HFF00&\u1}ma{\r} adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:50.37,0:00:50.43,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:50.43,0:00:50.73,Default,,0,0,0,,margarete ancora margarete un'altra volta ma {\1c&HFF00&\u1}adesso{\r} vorrei proprio sentire la musica
Dialogue: 0,0:00:50.73,0:00:50.77,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:50.77,0:00:51.01,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso {\1c&HFF00&\u1}vorrei{\r} proprio sentire la musica
Dialogue: 0,0:00:51.01,0:00:51.05,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:51.05,0:00:51.29,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei {\1c&HFF00&\u1}proprio{\r} sentire la musica
Dialogue: 0,0:00:51.29,0:00:51.43,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:51.43,0:00:51.71,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio {\1c&HFF00&\u1}sentire{\r} la musica
Dialogue: 0,0:00:51.71,0:00:51.79,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:51.79,0:00:51.88,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire {\1c&HFF00&\u1}la{\r} musica
Dialogue: 0,0:00:51.88,0:00:51.92,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
Dialogue: 0,0:00:51.92,0:00:52.36,Default,,0,0,0,,margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la {\1c&HFF00&\u1}musica{\r}
Dialogue: 0,0:00:52.36,0:00:52.40,Default,,0,0,0,,{\1c&HFF00&\u1}le{\r} parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:52.40,0:00:52.52,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:52.52,0:00:54.32,Default,,0,0,0,,le {\1c&HFF00&\u1}parole{\r} margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:54.32,0:00:54.40,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:54.40,0:00:56.40,Default,,0,0,0,,le parole {\1c&HFF00&\u1}margheriti{\r} margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:56.40,0:00:56.96,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:56.96,0:00:57.92,Default,,0,0,0,,le parole margheriti {\1c&HFF00&\u1}margheriti{\r} e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:57.92,0:00:58.53,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:58.53,0:00:58.59,Default,,0,0,0,,le parole margheriti margheriti {\1c&HFF00&\u1}e{\r} lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:58.59,0:00:58.65,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:58.65,0:00:58.97,Default,,0,0,0,,le parole margheriti margheriti e {\1c&HFF00&\u1}lei{\r} dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:58.97,0:00:59.59,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:59.59,0:00:59.91,Default,,0,0,0,,le parole margheriti margheriti e lei {\1c&HFF00&\u1}dominic{\r} decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:59.91,0:00:59.95,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:00:59.95,0:01:0.35,Default,,0,0,0,,le parole margheriti margheriti e lei dominic {\1c&HFF00&\u1}decoco{\r} come dominic decoco bravo bravo
Dialogue: 0,0:01:0.35,0:01:0.55,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:01:0.55,0:01:0.73,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco {\1c&HFF00&\u1}come{\r} dominic decoco bravo bravo
Dialogue: 0,0:01:0.73,0:01:1.25,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:01:1.25,0:01:1.57,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come {\1c&HFF00&\u1}dominic{\r} decoco bravo bravo
Dialogue: 0,0:01:1.57,0:01:1.61,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:01:1.61,0:01:2.01,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic {\1c&HFF00&\u1}decoco{\r} bravo bravo
Dialogue: 0,0:01:2.01,0:01:2.17,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:01:2.17,0:01:2.45,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco {\1c&HFF00&\u1}bravo{\r} bravo
Dialogue: 0,0:01:2.45,0:01:2.91,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo
Dialogue: 0,0:01:2.91,0:01:3.29,Default,,0,0,0,,le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo {\1c&HFF00&\u1}bravo{\r}

View File

@ -1,32 +0,0 @@
1
00:00:01,202 --> 00:00:11,297
Signore, è un piacere, gli amici della vedetta ammirata da tutti noi, questa gemma propria della nostra cultura, saranno naturalmente accolti sotto la mia protezione per la durata del loro soggiorno.
2
00:00:13,000 --> 00:00:13,120
Grazie.
3
00:00:15,602 --> 00:00:19,874
Gorlami? Lo pronuncio correttamente?
4
00:00:21,576 --> 00:00:22,717
Sì, corretto.
5
00:00:23,540 --> 00:00:27,495
Gorlami? Per cortesia, me lo ripeto ancora.
6
00:00:27,555 --> 00:00:43,979
ancora gourlami scusi con me gourlami ancora una volta gourlami e come si chiama lei antonio
7
00:00:44,360 --> 00:00:52,356
margarete ancora margarete un'altra volta ma adesso vorrei proprio sentire la musica
8
00:00:52,356 --> 00:01:03,292
le parole margheriti margheriti e lei dominic decoco come dominic decoco bravo bravo

BIN
figures/pipeline.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

36
pyproject.toml Normal file
View File

@ -0,0 +1,36 @@
[project]
urls = { repository = "https://github.com/m-bain/whisperx" }
authors = [{ name = "Max Bain" }]
name = "whisperx"
version = "3.3.4"
description = "Time-Accurate Automatic Speech Recognition using Whisper."
readme = "README.md"
requires-python = ">=3.9, <3.13"
license = { text = "BSD-2-Clause" }
dependencies = [
"ctranslate2<4.5.0",
"faster-whisper>=1.1.1",
"nltk>=3.9.1",
"numpy>=2.0.2",
"onnxruntime>=1.19",
"pandas>=2.2.3",
"pyannote-audio>=3.3.2",
"torch>=2.5.1",
"torchaudio>=2.5.1",
"transformers>=4.48.0",
]
[project.scripts]
whisperx = "whisperx.__main__:cli"
[build-system]
requires = ["setuptools"]
[tool.setuptools]
include-package-data = true
[tool.setuptools.packages.find]
where = ["."]
include = ["whisperx*"]

View File

@ -1,9 +0,0 @@
numpy
torch
torchaudio
tqdm
soundfile
more-itertools
transformers>=4.19.0
ffmpeg-python==0.2.0
pyannote.audio

View File

@ -1,28 +0,0 @@
import os
import pkg_resources
from setuptools import setup, find_packages
setup(
name="whisperx",
py_modules=["whisperx"],
version="1.0",
description="Time-Accurate Automatic Speech Recognition using Whisper.",
readme="README.md",
python_requires=">=3.7",
author="Max Bain",
url="https://github.com/m-bain/whisperx",
license="MIT",
packages=find_packages(exclude=["tests*"]),
install_requires=[
str(r)
for r in pkg_resources.parse_requirements(
open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
)
],
entry_points = {
'console_scripts': ['whisperx=whisperx.transcribe:cli'],
},
include_package_data=True,
extras_require={'dev': ['pytest']},
)

Binary file not shown.

View File

@ -1,19 +0,0 @@
import os.path
import numpy as np
from whisper.audio import load_audio, log_mel_spectrogram, SAMPLE_RATE
def test_audio():
audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
audio = load_audio(audio_path)
assert audio.ndim == 1
assert SAMPLE_RATE * 10 < audio.shape[0] < SAMPLE_RATE * 12
assert 0 < audio.std() < 1
mel_from_audio = log_mel_spectrogram(audio)
mel_from_file = log_mel_spectrogram(audio_path)
assert np.allclose(mel_from_audio, mel_from_file)
assert mel_from_audio.max() - mel_from_audio.min() <= 2.0

View File

@ -1,92 +0,0 @@
import pytest
from whisper.normalizers import EnglishTextNormalizer
from whisper.normalizers.english import EnglishNumberNormalizer, EnglishSpellingNormalizer
@pytest.mark.parametrize("std", [EnglishNumberNormalizer(), EnglishTextNormalizer()])
def test_number_normalizer(std):
assert std("two") == "2"
assert std("thirty one") == "31"
assert std("five twenty four") == "524"
assert std("nineteen ninety nine") == "1999"
assert std("twenty nineteen") == "2019"
assert std("two point five million") == "2500000"
assert std("four point two billions") == "4200000000s"
assert std("200 thousand") == "200000"
assert std("200 thousand dollars") == "$200000"
assert std("$20 million") == "$20000000"
assert std("€52.4 million") == "€52400000"
assert std("£77 thousands") == "£77000s"
assert std("two double o eight") == "2008"
assert std("three thousand twenty nine") == "3029"
assert std("forty three thousand two hundred sixty") == "43260"
assert std("forty three thousand two hundred and sixty") == "43260"
assert std("nineteen fifties") == "1950s"
assert std("thirty first") == "31st"
assert std("thirty three thousand and three hundred and thirty third") == "33333rd"
assert std("three billion") == "3000000000"
assert std("millions") == "1000000s"
assert std("july third twenty twenty") == "july 3rd 2020"
assert std("august twenty sixth twenty twenty one") == "august 26th 2021"
assert std("3 14") == "3 14"
assert std("3.14") == "3.14"
assert std("3 point 2") == "3.2"
assert std("3 point 14") == "3.14"
assert std("fourteen point 4") == "14.4"
assert std("two point two five dollars") == "$2.25"
assert std("two hundred million dollars") == "$200000000"
assert std("$20.1 million") == "$20100000"
assert std("ninety percent") == "90%"
assert std("seventy six per cent") == "76%"
assert std("double oh seven") == "007"
assert std("double zero seven") == "007"
assert std("nine one one") == "911"
assert std("nine double one") == "911"
assert std("one triple oh one") == "10001"
assert std("two thousandth") == "2000th"
assert std("thirty two thousandth") == "32000th"
assert std("minus 500") == "-500"
assert std("positive twenty thousand") == "+20000"
assert std("two dollars and seventy cents") == "$2.70"
assert std("3 cents") == "¢3"
assert std("$0.36") == "¢36"
assert std("three euros and sixty five cents") == "€3.65"
assert std("three and a half million") == "3500000"
assert std("forty eight and a half dollars") == "$48.5"
assert std("b747") == "b 747"
assert std("10 th") == "10th"
assert std("10th") == "10th"
def test_spelling_normalizer():
std = EnglishSpellingNormalizer()
assert std("mobilisation") == "mobilization"
assert std("cancelation") == "cancellation"
def test_text_normalizer():
std = EnglishTextNormalizer()
assert std("Let's") == "let us"
assert std("he's like") == "he is like"
assert std("she's been like") == "she has been like"
assert std("10km") == "10 km"
assert std("RC232") == "rc 232"
assert (
std("Mr. Park visited Assoc. Prof. Kim Jr.")
== "mister park visited associate professor kim junior"
)

View File

@ -1,14 +0,0 @@
from whisper.tokenizer import get_tokenizer
def test_tokenizer():
gpt2_tokenizer = get_tokenizer(multilingual=False)
multilingual_tokenizer = get_tokenizer(multilingual=True)
text = "다람쥐 헌 쳇바퀴에 타고파"
gpt2_tokens = gpt2_tokenizer.encode(text)
multilingual_tokens = multilingual_tokenizer.encode(text)
assert gpt2_tokenizer.decode(gpt2_tokens) == text
assert multilingual_tokenizer.decode(multilingual_tokens) == text
assert len(gpt2_tokens) > len(multilingual_tokens)

View File

@ -1,20 +0,0 @@
import os
import pytest
import whisper
@pytest.mark.parametrize('model_name', whisper.available_models())
def test_transcribe(model_name: str):
model = whisper.load_model(model_name).cuda()
audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
language = "en" if model_name.endswith(".en") else None
result = model.transcribe(audio_path, language=language, temperature=0.0)
assert result["language"] == "en"
transcription = result["text"].lower()
assert "my fellow americans" in transcription
assert "your country" in transcription
assert "do for you" in transcription

2905
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,226 @@
import math
from whisperx.conjunctions import get_conjunctions, get_comma
def normal_round(n):
    """Round *n* to the nearest integer, with exact halves always rounding up.

    Unlike builtin round(), which applies banker's rounding, 2.5 -> 3 here.
    """
    floor_n = math.floor(n)
    return floor_n if n - floor_n < 0.5 else math.ceil(n)
def format_timestamp(seconds: float, is_vtt: bool = False) -> str:
    """Render a non-negative time offset as a subtitle timestamp.

    Produces ``HH:MM:SS,mmm`` for SRT, or ``HH:MM:SS.mmm`` when *is_vtt* is True.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)
    # Peel off hours, minutes, seconds, leaving the millisecond remainder.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)
    separator = '.' if is_vtt else ','
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{separator}{millis:03d}"
class SubtitlesProcessor:
    """Splits aligned transcription segments into readable subtitle cues and writes SRT/VTT.

    Each segment is expected to carry 'start', 'end', 'text' and optionally a
    'words' list of {'word', 'start', 'end'} dicts — assumes whisperx alignment
    output; TODO confirm against caller.
    """

    def __init__(self, segments, lang, max_line_length = 45, min_char_length_splitter = 30, is_vtt = False):
        # Language-specific comma character and conjunction set drive the split heuristics.
        self.comma = get_comma(lang)
        self.conjunctions = set(get_conjunctions(lang))
        self.segments = segments
        self.lang = lang
        self.max_line_length = max_line_length
        self.min_char_length_splitter = min_char_length_splitter
        self.is_vtt = is_vtt
        # Languages listed here get tighter line limits, overriding the caller's values.
        complex_script_languages = ['th', 'lo', 'my', 'km', 'am', 'ko', 'ja', 'zh', 'ti', 'ta', 'te', 'kn', 'ml', 'hi', 'ne', 'mr', 'ar', 'fa', 'ur', 'ka']
        if self.lang in complex_script_languages:
            self.max_line_length = 30
            self.min_char_length_splitter = 20

    def estimate_timestamp_for_word(self, words, i, next_segment_start_time=None):
        """Fill in a missing 'start'/'end' for words[i] in place.

        Borrows timestamps from neighbouring words when available; otherwise
        estimates duration at ~0.25 s per character, or anchors on the next
        segment's start time as a last resort.
        """
        k = 0.25  # estimated seconds per character when no neighbour timestamp exists
        has_prev_end = i > 0 and 'end' in words[i - 1]
        has_next_start = i < len(words) - 1 and 'start' in words[i + 1]
        if has_prev_end:
            words[i]['start'] = words[i - 1]['end']
            if has_next_start:
                words[i]['end'] = words[i + 1]['start']
            else:
                if next_segment_start_time:
                    # Snap to the next segment unless that would stretch the word by more than 1 s.
                    words[i]['end'] = next_segment_start_time if next_segment_start_time - words[i - 1]['end'] <= 1 else next_segment_start_time - 0.5
                else:
                    words[i]['end'] = words[i]['start'] + len(words[i]['word']) * k
        elif has_next_start:
            words[i]['start'] = words[i + 1]['start'] - len(words[i]['word']) * k
            words[i]['end'] = words[i + 1]['start']
        else:
            if next_segment_start_time:
                words[i]['start'] = next_segment_start_time - 1
                words[i]['end'] = next_segment_start_time - 0.5
            else:
                # No timing information at all: degenerate zero-length timestamp.
                words[i]['start'] = 0
                words[i]['end'] = 0

    def process_segments(self, advanced_splitting=True):
        """Return a list of {'start', 'end', 'text'} cues for all segments.

        With *advanced_splitting*, segments are broken at length/comma/conjunction
        split points; otherwise each segment becomes exactly one cue.
        """
        subtitles = []
        for i, segment in enumerate(self.segments):
            next_segment_start_time = self.segments[i + 1]['start'] if i + 1 < len(self.segments) else None
            if advanced_splitting:
                split_points = self.determine_advanced_split_points(segment, next_segment_start_time)
                subtitles.extend(self.generate_subtitles_from_split_points(segment, split_points, next_segment_start_time))
            else:
                words = segment['words']
                # NOTE(review): this inner loop reuses the name `i`, shadowing the
                # outer segment index — harmless here only because the outer `i` is
                # not read again after this point.
                for i, word in enumerate(words):
                    if 'start' not in word or 'end' not in word:
                        self.estimate_timestamp_for_word(words, i, next_segment_start_time)
                subtitles.append({
                    'start': segment['start'],
                    'end': segment['end'],
                    'text': segment['text']
                })
        return subtitles

    def determine_advanced_split_points(self, segment, next_segment_start_time=None):
        """Return word indices at which the segment text should be broken into cues.

        A split is emitted when the running line exceeds max_line_length, or at a
        comma / before a conjunction once both sides are at least
        min_char_length_splitter characters long.
        """
        split_points = []
        last_split_point = 0
        char_count = 0
        # Words may be aligned dicts or, as a fallback, plain whitespace-split tokens.
        words = segment.get('words', segment['text'].split())
        add_space = 0 if self.lang in ['zh', 'ja'] else 1
        # NOTE(review): due to precedence, `+ add_space` applies only to the plain-string
        # branch here (and in the recount below), not to dict words — possibly intended
        # for both; confirm before changing, as it shifts split positions.
        total_char_count = sum(len(word['word']) if isinstance(word, dict) else len(word) + add_space for word in words)
        char_count_after = total_char_count
        for i, word in enumerate(words):
            word_text = word['word'] if isinstance(word, dict) else word
            word_length = len(word_text) + add_space
            char_count += word_length
            char_count_after -= word_length
            char_count_before = char_count - word_length
            # Repair missing timestamps lazily, only for words we actually visit.
            if isinstance(word, dict) and ('start' not in word or 'end' not in word):
                self.estimate_timestamp_for_word(words, i, next_segment_start_time)
            if char_count >= self.max_line_length:
                # Over length: split near the midpoint of the current run.
                midpoint = normal_round((last_split_point + i) / 2)
                if char_count_before >= self.min_char_length_splitter:
                    split_points.append(midpoint)
                    last_split_point = midpoint + 1
                    char_count = sum(len(words[j]['word']) if isinstance(words[j], dict) else len(words[j]) + add_space for j in range(last_split_point, i + 1))
            elif word_text.endswith(self.comma) and char_count_before >= self.min_char_length_splitter and char_count_after >= self.min_char_length_splitter:
                # Natural pause: split right after the comma-bearing word.
                split_points.append(i)
                last_split_point = i + 1
                char_count = 0
            elif word_text.lower() in self.conjunctions and char_count_before >= self.min_char_length_splitter and char_count_after >= self.min_char_length_splitter:
                # Split just before a conjunction so it starts the next line.
                split_points.append(i - 1)
                last_split_point = i
                char_count = word_length
        return split_points

    def generate_subtitles_from_split_points(self, segment, split_points, next_start_time=None):
        """Materialize cue dicts from a segment and its precomputed split indices.

        Aligned (dict) words use their own timestamps; plain tokens get times
        interpolated proportionally to word count across the segment duration.
        """
        subtitles = []
        words = segment.get('words', segment['text'].split())
        total_word_count = len(words)
        total_time = segment['end'] - segment['start']
        elapsed_time = segment['start']
        # Joiner between words: none for space-less scripts.
        prefix = ' ' if self.lang not in ['zh', 'ja'] else ''
        start_idx = 0
        for split_point in split_points:
            fragment_words = words[start_idx:split_point + 1]
            current_word_count = len(fragment_words)
            if isinstance(fragment_words[0], dict):
                start_time = fragment_words[0]['start']
                end_time = fragment_words[-1]['end']
                # Extend the cue to meet the next word if the gap is small (<= 0.8 s).
                next_start_time_for_word = words[split_point + 1]['start'] if split_point + 1 < len(words) else None
                if next_start_time_for_word and (next_start_time_for_word - end_time) <= 0.8:
                    end_time = next_start_time_for_word
            else:
                # Plain tokens: apportion the segment duration by word count.
                fragment = prefix.join(fragment_words).strip()
                current_duration = (current_word_count / total_word_count) * total_time
                start_time = elapsed_time
                end_time = elapsed_time + current_duration
                elapsed_time += current_duration
            subtitles.append({
                'start': start_time,
                'end': end_time,
                'text': fragment if not isinstance(fragment_words[0], dict) else prefix.join(word['word'] for word in fragment_words)
            })
            start_idx = split_point + 1
        # Handle the last fragment
        if start_idx < len(words):
            fragment_words = words[start_idx:]
            current_word_count = len(fragment_words)
            if isinstance(fragment_words[0], dict):
                start_time = fragment_words[0]['start']
                end_time = fragment_words[-1]['end']
            else:
                fragment = prefix.join(fragment_words).strip()
                current_duration = (current_word_count / total_word_count) * total_time
                start_time = elapsed_time
                end_time = elapsed_time + current_duration
            # Close the gap to the following segment when it is small (<= 0.8 s).
            if next_start_time and (next_start_time - end_time) <= 0.8:
                end_time = next_start_time
            subtitles.append({
                'start': start_time,
                'end': end_time if end_time is not None else segment['end'],
                'text': fragment if not isinstance(fragment_words[0], dict) else prefix.join(word['word'] for word in fragment_words)
            })
        return subtitles

    def save(self, filename="subtitles.srt", advanced_splitting=True):
        """Write the processed cues to *filename* (SRT, or VTT when is_vtt) and return the cue count.

        NOTE(review): when advanced_splitting is False, cues are computed but
        nothing is written beyond the optional WEBVTT header — confirm intended.
        """
        subtitles = self.process_segments(advanced_splitting)

        def write_subtitle(file, idx, start_time, end_time, text):
            # One numbered cue block: index, time range, text, blank separator line.
            file.write(f"{idx}\n")
            file.write(f"{start_time} --> {end_time}\n")
            file.write(text + "\n\n")

        with open(filename, 'w', encoding='utf-8') as file:
            if self.is_vtt:
                file.write("WEBVTT\n\n")
            if advanced_splitting:
                for idx, subtitle in enumerate(subtitles, 1):
                    start_time = format_timestamp(subtitle['start'], self.is_vtt)
                    end_time = format_timestamp(subtitle['end'], self.is_vtt)
                    text = subtitle['text'].strip()
                    write_subtitle(file, idx, start_time, end_time, text)
        return len(subtitles)

View File

@ -1,115 +1,31 @@
import hashlib
import io
import os
import urllib
import warnings
from typing import List, Optional, Union
import torch
from tqdm import tqdm
from .audio import load_audio, log_mel_spectrogram, pad_or_trim
from .decoding import DecodingOptions, DecodingResult, decode, detect_language
from .model import Whisper, ModelDimensions
from .transcribe import transcribe, load_align_model, align, transcribe_with_vad
_MODELS = {
"tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
"tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
"base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
"base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
"small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
"small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
"medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
"medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
"large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
"large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
"large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
}
import importlib
def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
os.makedirs(root, exist_ok=True)
expected_sha256 = url.split("/")[-2]
download_target = os.path.join(root, os.path.basename(url))
if os.path.exists(download_target) and not os.path.isfile(download_target):
raise RuntimeError(f"{download_target} exists and is not a regular file")
if os.path.isfile(download_target):
with open(download_target, "rb") as f:
model_bytes = f.read()
if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
return model_bytes if in_memory else download_target
else:
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
while True:
buffer = source.read(8192)
if not buffer:
break
output.write(buffer)
loop.update(len(buffer))
model_bytes = open(download_target, "rb").read()
if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model.")
return model_bytes if in_memory else download_target
def _lazy_import(name):
module = importlib.import_module(f"whisperx.{name}")
return module
def available_models() -> List[str]:
"""Returns the names of available models"""
return list(_MODELS.keys())
def load_align_model(*args, **kwargs):
    """Lazily delegate to :func:`whisperx.alignment.load_align_model`."""
    return _lazy_import("alignment").load_align_model(*args, **kwargs)
def load_model(name: str, device: Optional[Union[str, torch.device]] = None, download_root: str = None, in_memory: bool = False) -> Whisper:
"""
Load a Whisper ASR model
def align(*args, **kwargs):
    """Lazily delegate to :func:`whisperx.alignment.align`."""
    return _lazy_import("alignment").align(*args, **kwargs)
Parameters
----------
name : str
one of the official model names listed by `whisper.available_models()`, or
path to a model checkpoint containing the model dimensions and the model state_dict.
device : Union[str, torch.device]
the PyTorch device to put the model into
download_root: str
path to download the model files; by default, it uses "~/.cache/whisper"
in_memory: bool
whether to preload the model weights into host memory
Returns
-------
model : Whisper
The Whisper ASR model instance
"""
def load_model(*args, **kwargs):
    """Lazily delegate to :func:`whisperx.asr.load_model`."""
    return _lazy_import("asr").load_model(*args, **kwargs)
if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
if download_root is None:
download_root = os.getenv(
"XDG_CACHE_HOME",
os.path.join(os.path.expanduser("~"), ".cache", "whisper")
)
if name in _MODELS:
checkpoint_file = _download(_MODELS[name], download_root, in_memory)
elif os.path.isfile(name):
checkpoint_file = open(name, "rb").read() if in_memory else name
else:
raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
def load_audio(*args, **kwargs):
    """Lazily delegate to :func:`whisperx.audio.load_audio`."""
    return _lazy_import("audio").load_audio(*args, **kwargs)
with (io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")) as fp:
checkpoint = torch.load(fp, map_location=device)
del checkpoint_file
dims = ModelDimensions(**checkpoint["dims"])
model = Whisper(dims)
model.load_state_dict(checkpoint["model_state_dict"])
return model.to(device)
def assign_word_speakers(*args, **kwargs):
    """Lazily delegate to :func:`whisperx.diarize.assign_word_speakers`."""
    return _lazy_import("diarize").assign_word_speakers(*args, **kwargs)

View File

@ -1,4 +1,88 @@
from .transcribe import cli
import argparse
import importlib.metadata
import platform
import torch
from whisperx.utils import (LANGUAGES, TO_LANGUAGE_CODE, optional_float,
optional_int, str2bool)
cli()
def cli():
    """Parse whisperx command-line arguments and dispatch to transcribe_task.

    Argument groups: model/device selection, output control, alignment, VAD,
    diarization, decoding/sampling, subtitle formatting, and version info.
    """
    # fmt: off
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", help="name of the Whisper model to use")
    parser.add_argument("--model_cache_only", type=str2bool, default=False, help="If True, will not attempt to download models, instead using cached models from --model_dir")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--device_index", default=0, type=int, help="device index to use for FasterWhisper inference")
    parser.add_argument("--batch_size", default=8, type=int, help="the preferred batch size for inference")
    parser.add_argument("--compute_type", default="float16", type=str, choices=["float16", "float32", "int8"], help="compute type for computation")

    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json", "aud"], help="format of the output file; if not specified, all available formats will be produced")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

    # alignment params
    parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
    parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
    parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")
    parser.add_argument("--return_char_alignments", action='store_true', help="Return character-level alignments in the output json file")

    # vad params
    parser.add_argument("--vad_method", type=str, default="pyannote", choices=["pyannote", "silero"], help="VAD method to be used")
    parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
    parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
    parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.")

    # diarization params
    parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
    parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
    parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
    parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")

    # decoding / sampling params
    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=1.0, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=1.0, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--suppress_numerals", action="store_true", help="whether to suppress numeric symbols and currency symbols during sampling, since wav2vec2 cannot align them correctly")

    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    # fallback thresholds
    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

    # subtitle formatting params
    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of lines in a segment")
    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(not possible with --no_align) underline each word as it is spoken in srt and vtt")
    parser.add_argument("--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")

    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")

    parser.add_argument("--print_progress", type=str2bool, default = False, help = "if True, progress will be printed in transcribe() and align() methods.")
    parser.add_argument("--version", "-V", action="version", version=f"%(prog)s {importlib.metadata.version('whisperx')}",help="Show whisperx version information and exit")
    parser.add_argument("--python-version", "-P", action="version", version=f"Python {platform.python_version()} ({platform.python_implementation()})",help="Show python version information and exit")
    # fmt: on

    args = parser.parse_args().__dict__

    # NOTE(review): local import presumably defers heavy model dependencies until
    # after argument parsing (keeps --version/--help fast) — confirm.
    from whisperx.transcribe import transcribe_task

    transcribe_task(args, parser)
if __name__ == "__main__":
cli()

View File

@ -1,73 +1,584 @@
"""
Forced Alignment with Whisper
C. Max Bain
"""
import math
from dataclasses import dataclass
from typing import Iterable, Optional, Union, List
import numpy as np
import pandas as pd
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from whisperx.audio import SAMPLE_RATE, load_audio
from whisperx.utils import interpolate_nans
from whisperx.types import (
AlignedTranscriptionResult,
SingleSegment,
SingleAlignedSegment,
SingleWordSegment,
SegmentData,
)
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof']
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
DEFAULT_ALIGN_MODELS_TORCH = {
"en": "WAV2VEC2_ASR_BASE_960H",
"fr": "VOXPOPULI_ASR_BASE_10K_FR",
"de": "VOXPOPULI_ASR_BASE_10K_DE",
"es": "VOXPOPULI_ASR_BASE_10K_ES",
"it": "VOXPOPULI_ASR_BASE_10K_IT",
}
DEFAULT_ALIGN_MODELS_HF = {
"ja": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
"zh": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
"nl": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch",
"uk": "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm",
"pt": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
"ar": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
"cs": "comodoro/wav2vec2-xls-r-300m-cs-250",
"ru": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
"pl": "jonatasgrosman/wav2vec2-large-xlsr-53-polish",
"hu": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
"fi": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish",
"fa": "jonatasgrosman/wav2vec2-large-xlsr-53-persian",
"el": "jonatasgrosman/wav2vec2-large-xlsr-53-greek",
"tr": "mpoyraz/wav2vec2-xls-r-300m-cv7-turkish",
"da": "saattrupdan/wav2vec2-xls-r-300m-ftspeech",
"he": "imvladikon/wav2vec2-xls-r-300m-hebrew",
"vi": 'nguyenvulebinh/wav2vec2-base-vi',
"ko": "kresnik/wav2vec2-large-xlsr-korean",
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
"te": "anuragshas/wav2vec2-large-xlsr-53-telugu",
"hi": "theainerd/Wav2Vec2-large-xlsr-hindi",
"ca": "softcatala/wav2vec2-large-xlsr-catala",
"ml": "gvs/wav2vec2-large-xlsr-malayalam",
"no": "NbAiLab/nb-wav2vec2-1b-bokmaal-v2",
"nn": "NbAiLab/nb-wav2vec2-1b-nynorsk",
"sk": "comodoro/wav2vec2-xls-r-300m-sk-cv8",
"sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
"hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
"ro": "gigant/romanian-wav2vec2",
"eu": "stefan-it/wav2vec2-large-xlsr-53-basque",
"gl": "ifrz/wav2vec2-large-xlsr-galician",
"ka": "xsway/wav2vec2-large-xlsr-georgian",
"lv": "jimregan/wav2vec2-large-xlsr-latvian-cv",
"tl": "Khalsuu/filipino-wav2vec2-l-xls-r-300m-official",
}
def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None):
    """Load a wav2vec2 phoneme-alignment model for *language_code* onto *device*.

    Resolves a default model from DEFAULT_ALIGN_MODELS_TORCH (torchaudio
    pipelines) or DEFAULT_ALIGN_MODELS_HF (Hugging Face hub) when *model_name*
    is None. Returns ``(align_model, align_metadata)`` where metadata carries
    the language, the char -> token-id dictionary, and the pipeline type.
    Raises ValueError when no default exists or the model cannot be loaded.
    """
    if model_name is None:
        # use default model
        if language_code in DEFAULT_ALIGN_MODELS_TORCH:
            model_name = DEFAULT_ALIGN_MODELS_TORCH[language_code]
        elif language_code in DEFAULT_ALIGN_MODELS_HF:
            model_name = DEFAULT_ALIGN_MODELS_HF[language_code]
        else:
            print(f"There is no default alignment model set for this language ({language_code}).\
                Please find a wav2vec2.0 model finetuned on this language in https://huggingface.co/models, then pass the model name in --align_model [MODEL_NAME]")
            raise ValueError(f"No default align-model for language: {language_code}")

    if model_name in torchaudio.pipelines.__all__:
        # torchaudio pipeline bundle: labels are an ordered sequence of characters.
        pipeline_type = "torchaudio"
        bundle = torchaudio.pipelines.__dict__[model_name]
        align_model = bundle.get_model(dl_kwargs={"model_dir": model_dir}).to(device)
        labels = bundle.get_labels()
        align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
    else:
        try:
            processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir)
            align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir)
        except Exception as e:
            print(e)
            print(f"Error loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 models")
            raise ValueError(f'The chosen align_model "{model_name}" could not be found in huggingface (https://huggingface.co/models) or torchaudio (https://pytorch.org/audio/stable/pipelines.html#id14)')
        pipeline_type = "huggingface"
        align_model = align_model.to(device)
        # NOTE(review): `labels` is unused in this branch (the dictionary is built
        # directly from the tokenizer vocab below) — candidate for removal.
        labels = processor.tokenizer.get_vocab()
        align_dictionary = {char.lower(): code for char,code in processor.tokenizer.get_vocab().items()}

    align_metadata = {"language": language_code, "dictionary": align_dictionary, "type": pipeline_type}

    return align_model, align_metadata
def align(
    transcript: Iterable[SingleSegment],
    model: torch.nn.Module,
    align_model_metadata: dict,
    audio: Union[str, np.ndarray, torch.Tensor],
    device: str,
    interpolate_method: str = "nearest",
    return_char_alignments: bool = False,
    print_progress: bool = False,
    combined_progress: bool = False,
) -> AlignedTranscriptionResult:
    """
    Align phoneme recognition predictions to known transcription.

    For each transcript segment, runs the wav2vec2 alignment model over that
    segment's audio window, aligns the CTC emission matrix against the segment
    text via a trellis + beam-search backtrack, and emits sentence-level
    segments with per-word (and optionally per-character) timestamps.

    NOTE(review): despite the Iterable annotation, `transcript` is passed to
    len() and iterated twice below, so it must be a sequence — confirm callers.
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)
    if len(audio.shape) == 1:
        # ensure a channel dimension: (num_samples,) -> (1, num_samples)
        audio = audio.unsqueeze(0)

    MAX_DURATION = audio.shape[1] / SAMPLE_RATE

    model_dictionary = align_model_metadata["dictionary"]
    model_lang = align_model_metadata["language"]
    model_type = align_model_metadata["type"]

    # 1. Preprocess to keep only characters in dictionary
    total_segments = len(transcript)
    # Store temporary processing values
    segment_data: dict[int, SegmentData] = {}
    for sdx, segment in enumerate(transcript):
        # strip spaces at beginning / end, but keep track of the amount.
        if print_progress:
            base_progress = ((sdx + 1) / total_segments) * 100
            # combined_progress maps alignment onto the 50-100% range of a
            # larger transcribe+align pipeline.
            percent_complete = (50 + base_progress / 2) if combined_progress else base_progress
            print(f"Progress: {percent_complete:.2f}%...")

        num_leading = len(segment["text"]) - len(segment["text"].lstrip())
        num_trailing = len(segment["text"]) - len(segment["text"].rstrip())
        text = segment["text"]

        # split into words
        if model_lang not in LANGUAGES_WITHOUT_SPACES:
            per_word = text.split(" ")
        else:
            # no-space languages: iterate per character instead of per word
            per_word = text

        clean_char, clean_cdx = [], []
        for cdx, char in enumerate(text):
            char_ = char.lower()
            # wav2vec2 models use "|" character to represent spaces
            if model_lang not in LANGUAGES_WITHOUT_SPACES:
                char_ = char_.replace(" ", "|")

            # ignore whitespace at beginning and end of transcript
            if cdx < num_leading:
                pass
            elif cdx > len(text) - num_trailing - 1:
                pass
            elif char_ in model_dictionary.keys():
                clean_char.append(char_)
                clean_cdx.append(cdx)
            else:
                # add placeholder
                clean_char.append('*')
                clean_cdx.append(cdx)

        clean_wdx = []
        for wdx, wrd in enumerate(per_word):
            if any([c in model_dictionary.keys() for c in wrd.lower()]):
                clean_wdx.append(wdx)
            else:
                # index for placeholder
                clean_wdx.append(wdx)

        # sentence boundaries via NLTK Punkt, with project-specific
        # abbreviations so e.g. titles don't split sentences
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        sentence_spans = list(sentence_splitter.span_tokenize(text))

        segment_data[sdx] = {
            "clean_char": clean_char,
            "clean_cdx": clean_cdx,
            "clean_wdx": clean_wdx,
            "sentence_spans": sentence_spans
        }

    aligned_segments: List[SingleAlignedSegment] = []

    # 2. Get prediction matrix from alignment model & align
    for sdx, segment in enumerate(transcript):
        t1 = segment["start"]
        t2 = segment["end"]
        text = segment["text"]

        # fallback result if alignment fails: original times, no word timings
        aligned_seg: SingleAlignedSegment = {
            "start": t1,
            "end": t2,
            "text": text,
            "words": [],
            "chars": None,
        }

        if return_char_alignments:
            aligned_seg["chars"] = []

        # check we can align
        if len(segment_data[sdx]["clean_char"]) == 0:
            print(f'Failed to align segment ("{segment["text"]}"): no characters in this segment found in model dictionary, resorting to original...')
            aligned_segments.append(aligned_seg)
            continue

        if t1 >= MAX_DURATION:
            print(f'Failed to align segment ("{segment["text"]}"): original start time longer than audio duration, skipping...')
            aligned_segments.append(aligned_seg)
            continue

        text_clean = "".join(segment_data[sdx]["clean_char"])
        # -1 marks the '*' placeholder (wildcard) tokens
        tokens = [model_dictionary.get(c, -1) for c in text_clean]

        f1 = int(t1 * SAMPLE_RATE)
        f2 = int(t2 * SAMPLE_RATE)

        # TODO: Probably can get some speedup gain with batched inference here
        waveform_segment = audio[:, f1:f2]
        # Handle the minimum input length for wav2vec2 models
        if waveform_segment.shape[-1] < 400:
            lengths = torch.as_tensor([waveform_segment.shape[-1]]).to(device)
            waveform_segment = torch.nn.functional.pad(
                waveform_segment, (0, 400 - waveform_segment.shape[-1])
            )
        else:
            lengths = None

        with torch.inference_mode():
            if model_type == "torchaudio":
                emissions, _ = model(waveform_segment.to(device), lengths=lengths)
            elif model_type == "huggingface":
                emissions = model(waveform_segment.to(device)).logits
            else:
                raise NotImplementedError(f"Align model of type {model_type} not supported.")
            emissions = torch.log_softmax(emissions, dim=-1)

        emission = emissions[0].cpu().detach()

        # locate the CTC blank token id (pad token); default to 0
        blank_id = 0
        for char, code in model_dictionary.items():
            if char == '[pad]' or char == '<pad>':
                blank_id = code

        trellis = get_trellis(emission, tokens, blank_id)
        # path = backtrack(trellis, emission, tokens, blank_id)
        path = backtrack_beam(trellis, emission, tokens, blank_id, beam_width=2)

        if path is None:
            print(f'Failed to align segment ("{segment["text"]}"): backtrack failed, resorting to original...')
            aligned_segments.append(aligned_seg)
            continue

        char_segments = merge_repeats(path, text_clean)

        duration = t2 - t1
        # frames -> seconds scale factor for this segment
        ratio = duration * waveform_segment.size(0) / (trellis.size(0) - 1)

        # assign timestamps to aligned characters
        char_segments_arr = []
        word_idx = 0
        for cdx, char in enumerate(text):
            start, end, score = None, None, None
            if cdx in segment_data[sdx]["clean_cdx"]:
                char_seg = char_segments[segment_data[sdx]["clean_cdx"].index(cdx)]
                start = round(char_seg.start * ratio + t1, 3)
                end = round(char_seg.end * ratio + t1, 3)
                score = round(char_seg.score, 3)

            char_segments_arr.append(
                {
                    "char": char,
                    "start": start,
                    "end": end,
                    "score": score,
                    "word-idx": word_idx,
                }
            )

            # increment word_idx, nltk word tokenization would probably be more robust here, but us space for now...
            if model_lang in LANGUAGES_WITHOUT_SPACES:
                word_idx += 1
            elif cdx == len(text) - 1 or text[cdx+1] == " ":
                word_idx += 1

        char_segments_arr = pd.DataFrame(char_segments_arr)

        aligned_subsegments = []
        # assign sentence_idx to each character index
        char_segments_arr["sentence-idx"] = None
        for sdx2, (sstart, send) in enumerate(segment_data[sdx]["sentence_spans"]):
            curr_chars = char_segments_arr.loc[(char_segments_arr.index >= sstart) & (char_segments_arr.index <= send)]
            char_segments_arr.loc[(char_segments_arr.index >= sstart) & (char_segments_arr.index <= send), "sentence-idx"] = sdx2

            sentence_text = text[sstart:send]
            sentence_start = curr_chars["start"].min()
            end_chars = curr_chars[curr_chars["char"] != ' ']
            sentence_end = end_chars["end"].max()
            sentence_words = []

            for word_idx in curr_chars["word-idx"].unique():
                word_chars = curr_chars.loc[curr_chars["word-idx"] == word_idx]
                word_text = "".join(word_chars["char"].tolist()).strip()
                if len(word_text) == 0:
                    continue

                # dont use space character for alignment
                word_chars = word_chars[word_chars["char"] != " "]

                word_start = word_chars["start"].min()
                word_end = word_chars["end"].max()
                word_score = round(word_chars["score"].mean(), 3)

                # -1 indicates unalignable
                word_segment = {"word": word_text}

                if not np.isnan(word_start):
                    word_segment["start"] = word_start
                if not np.isnan(word_end):
                    word_segment["end"] = word_end
                if not np.isnan(word_score):
                    word_segment["score"] = word_score

                sentence_words.append(word_segment)

            aligned_subsegments.append({
                "text": sentence_text,
                "start": sentence_start,
                "end": sentence_end,
                "words": sentence_words,
            })

            if return_char_alignments:
                curr_chars = curr_chars[["char", "start", "end", "score"]]
                curr_chars.fillna(-1, inplace=True)
                curr_chars = curr_chars.to_dict("records")
                curr_chars = [{key: val for key, val in char.items() if val != -1} for char in curr_chars]
                aligned_subsegments[-1]["chars"] = curr_chars

        aligned_subsegments = pd.DataFrame(aligned_subsegments)
        # fill missing sentence timestamps from neighbours
        aligned_subsegments["start"] = interpolate_nans(aligned_subsegments["start"], method=interpolate_method)
        aligned_subsegments["end"] = interpolate_nans(aligned_subsegments["end"], method=interpolate_method)
        # concatenate sentences with same timestamps
        agg_dict = {"text": " ".join, "words": "sum"}
        if model_lang in LANGUAGES_WITHOUT_SPACES:
            agg_dict["text"] = "".join
        if return_char_alignments:
            agg_dict["chars"] = "sum"
        aligned_subsegments = aligned_subsegments.groupby(["start", "end"], as_index=False).agg(agg_dict)
        aligned_subsegments = aligned_subsegments.to_dict('records')
        aligned_segments += aligned_subsegments

    # create word_segments list
    word_segments: List[SingleWordSegment] = []
    for segment in aligned_segments:
        word_segments += segment["words"]

    return {"segments": aligned_segments, "word_segments": word_segments}
"""
source: https://pytorch.org/tutorials/intermediate/forced_alignment_with_torchaudio_tutorial.html
"""
import torch
from dataclasses import dataclass
def get_trellis(emission, tokens, blank_id=0):
    """Build the CTC alignment trellis.

    trellis[t, j] is the best cumulative log-probability of having emitted the
    first j tokens after t frames. Wildcard tokens (id -1) score as the best
    non-blank emission of the frame via get_wildcard_emission.

    NOTE: the source contained two diff-merged versions of this function
    (a dead (T+1, N+1) initialization immediately overwritten, a duplicated
    `for t` loop nested over the same variable, and a torch.maximum call with
    three tensor arguments, which raises TypeError since `out` is
    keyword-only). This is the reconstructed coherent version.

    Args:
        emission: (num_frame, num_labels) log-probability matrix.
        tokens: token ids of the cleaned transcript; -1 marks wildcards.
        blank_id: id of the CTC blank token.

    Returns:
        (num_frame, num_tokens) trellis tensor.
    """
    num_frame = emission.size(0)
    num_tokens = len(tokens)

    trellis = torch.zeros((num_frame, num_tokens))
    # Staying on blank from the start: cumulative blank score down column 0.
    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
    # Cannot have emitted tokens at frame 0.
    trellis[0, 1:] = -float("inf")
    # Too few frames remain to finish all tokens from these cells.
    trellis[-num_tokens + 1:, 0] = float("inf")

    for t in range(num_frame - 1):
        trellis[t + 1, 1:] = torch.maximum(
            # Score for staying at the same token
            trellis[t, 1:] + emission[t, blank_id],
            # Score for changing to the next token (wildcard-aware)
            trellis[t, :-1] + get_wildcard_emission(emission[t], tokens[1:], blank_id),
        )
    return trellis
def get_wildcard_emission(frame_emission, tokens, blank_id):
    """Score a set of tokens against one emission frame, wildcard-aware.

    A token id of -1 is a wildcard: it receives the best score of any label in
    the frame except the blank token. All other token ids receive their own
    emission score.

    Args:
        frame_emission: emission probability vector for the current frame.
        tokens: list or tensor of token indices (-1 == wildcard).
        blank_id: id of the CTC blank token (excluded from wildcard max).

    Returns:
        Tensor of per-token scores, same length as `tokens`.
    """
    assert 0 <= blank_id < len(frame_emission)

    if not isinstance(tokens, torch.Tensor):
        tokens = torch.tensor(tokens)

    is_wildcard = tokens == -1

    # Per-token scores; clamp keeps the -1 wildcard ids from indexing the
    # last element (their value is replaced below anyway).
    per_token_scores = frame_emission[tokens.clamp(min=0)]

    # Best non-blank score of the frame, computed on a copy so the caller's
    # emission vector is untouched.
    masked = frame_emission.clone()
    masked[blank_id] = float('-inf')
    best_non_blank = masked.max()

    return torch.where(is_wildcard, best_non_blank, per_token_scores)
@dataclass
class Point:
    """One frame/token decision on the CTC alignment path."""
    token_index: int  # index into the cleaned token sequence
    time_index: int   # emission frame index
    score: float      # frame-wise posterior probability of this point
def backtrack(trellis, emission, tokens, blank_id=0):
    """Greedy backtracking over the trellis to recover the alignment path.

    NOTE: the source contained two diff-merged versions of this function
    (the old torchaudio-tutorial argmax/for-loop variant interleaved with the
    newer while-loop wildcard variant, including unreachable statements after
    a `return None`). This is the reconstructed coherent while-loop version,
    matching the (num_frame, num_tokens) trellis built by get_trellis.

    Args:
        trellis: (num_frame, num_tokens) trellis from get_trellis.
        emission: (num_frame, num_labels) log-probability matrix.
        tokens: token ids of the cleaned transcript; -1 marks wildcards.
        blank_id: id of the CTC blank token.

    Returns:
        List[Point] in forward time order.
    """
    t, j = trellis.size(0) - 1, trellis.size(1) - 1

    path = [Point(j, t, emission[t, blank_id].exp().item())]
    while j > 0:
        # Should not happen but just in case
        assert t > 0

        # 1. Figure out if the current position was stay or change
        # Frame-wise score of stay vs change
        p_stay = emission[t - 1, blank_id]
        # p_change = emission[t - 1, tokens[j]]
        p_change = get_wildcard_emission(emission[t - 1], [tokens[j]], blank_id)[0]

        # Context-aware score for stay vs change
        stayed = trellis[t - 1, j] + p_stay
        changed = trellis[t - 1, j - 1] + p_change

        # Update position
        t -= 1
        if changed > stayed:
            j -= 1

        # Store the path with frame-wise probability.
        prob = (p_change if changed > stayed else p_stay).exp().item()
        path.append(Point(j, t, prob))

    # Now j == 0, which means, it reached the SoS.
    # Fill up the rest for the sake of visualization
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        path.append(Point(j, t - 1, prob))
        t -= 1

    return path[::-1]
@dataclass
class Path:
    """A sequence of alignment points with an overall score."""
    # NOTE(review): not referenced by any code visible in this file chunk.
    points: List[Point]
    score: float
@dataclass
class BeamState:
    """State in beam search."""
    token_index: int   # Current token position
    time_index: int    # Current time step
    score: float       # Cumulative score
    path: List[Point]  # Path history
def backtrack_beam(trellis, emission, tokens, blank_id=0, beam_width=5):
    """Standard CTC beam search backtracking implementation.

    Args:
        trellis (torch.Tensor): The trellis (or lattice) of shape (T, N), where T is the number of time steps
                                and N is the number of tokens (including the blank token).
        emission (torch.Tensor): The emission probabilities of shape (T, N).
        tokens (List[int]): List of token indices (excluding the blank token).
        blank_id (int, optional): The ID of the blank token. Defaults to 0.
        beam_width (int, optional): The number of top paths to keep during beam search. Defaults to 5.

    Returns:
        List[Point]: the best path (forward time order), or None if no beam
        reaches token index 0.
    """
    T, J = trellis.size(0) - 1, trellis.size(1) - 1

    # Start from the bottom-right corner of the trellis (all tokens emitted,
    # last frame) and walk backwards.
    init_state = BeamState(
        token_index=J,
        time_index=T,
        score=trellis[T, J],
        path=[Point(J, T, emission[T, blank_id].exp().item())]
    )

    beams = [init_state]

    # Expand beams backwards in time until the best beam reaches token 0.
    while beams and beams[0].token_index > 0:
        next_beams = []

        for beam in beams:
            t, j = beam.time_index, beam.token_index

            if t <= 0:
                continue

            p_stay = emission[t - 1, blank_id]
            p_change = get_wildcard_emission(emission[t - 1], [tokens[j]], blank_id)[0]

            stay_score = trellis[t - 1, j]
            change_score = trellis[t - 1, j - 1] if j > 0 else float('-inf')

            # Stay: remain on the same token (blank emission)
            if not math.isinf(stay_score):
                new_path = beam.path.copy()
                new_path.append(Point(j, t - 1, p_stay.exp().item()))
                next_beams.append(BeamState(
                    token_index=j,
                    time_index=t - 1,
                    score=stay_score,
                    path=new_path
                ))

            # Change: move to the previous token
            if j > 0 and not math.isinf(change_score):
                new_path = beam.path.copy()
                new_path.append(Point(j - 1, t - 1, p_change.exp().item()))
                next_beams.append(BeamState(
                    token_index=j - 1,
                    time_index=t - 1,
                    score=change_score,
                    path=new_path
                ))

        # sort by score and keep only the top beam_width candidates
        beams = sorted(next_beams, key=lambda x: x.score, reverse=True)[:beam_width]

        if not beams:
            break

    if not beams:
        return None

    best_beam = beams[0]
    t = best_beam.time_index
    j = best_beam.token_index
    # Pad the remaining frames back to t == 0 with blank emissions
    # (mirrors the fill-up step in backtrack()).
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        best_beam.path.append(Point(j, t - 1, prob))
        t -= 1

    # Path was built backwards; reverse into forward time order.
    return best_beam.path[::-1]
# Merge the labels
@dataclass
class Segment:

416
whisperx/asr.py Normal file
View File

@ -0,0 +1,416 @@
import os
from typing import List, Optional, Union
from dataclasses import replace
import ctranslate2
import faster_whisper
import numpy as np
import torch
from faster_whisper.tokenizer import Tokenizer
from faster_whisper.transcribe import TranscriptionOptions, get_ctranslate2_storage
from transformers import Pipeline
from transformers.pipelines.pt_utils import PipelineIterator
from whisperx.audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram
from whisperx.types import SingleSegment, TranscriptionResult
from whisperx.vads import Vad, Silero, Pyannote
def find_numeral_symbol_tokens(tokenizer):
    """Return ids of vocabulary tokens containing a digit or '%'.

    Scans every non-special token id (below tokenizer.eot) and keeps those
    whose decoded text, with a single leading space removed, contains any of
    the characters 0-9 or '%'. Used to build a suppression list when numeral
    output is disabled.
    """
    flagged = set("0123456789%")
    return [
        token_id
        for token_id in range(tokenizer.eot)
        if flagged.intersection(tokenizer.decode([token_id]).removeprefix(" "))
    ]
class WhisperModel(faster_whisper.WhisperModel):
    '''
    FasterWhisperModel provides batched inference for faster-whisper.
    Currently only works in non-timestamp mode and fixed prompt for all samples in batch.
    '''

    def generate_segment_batched(
        self,
        features: np.ndarray,
        tokenizer: Tokenizer,
        options: TranscriptionOptions,
        encoder_output=None,
    ):
        # Decode a batch of mel-spectrogram features into text. One prompt is
        # shared by every sample in the batch.
        # NOTE(review): the encoder_output parameter is ignored — features are
        # always re-encoded below.
        batch_size = features.shape[0]
        all_tokens = []
        prompt_reset_since = 0
        if options.initial_prompt is not None:
            initial_prompt = " " + options.initial_prompt.strip()
            initial_prompt_tokens = tokenizer.encode(initial_prompt)
            all_tokens.extend(initial_prompt_tokens)
        previous_tokens = all_tokens[prompt_reset_since:]
        prompt = self.get_prompt(
            tokenizer,
            previous_tokens,
            without_timestamps=options.without_timestamps,
            prefix=options.prefix,
            hotwords=options.hotwords
        )

        encoder_output = self.encode(features)

        # NOTE(review): computed but never used below — presumably a leftover
        # from timestamp mode; confirm before removing.
        max_initial_timestamp_index = int(
            round(options.max_initial_timestamp / self.time_precision)
        )

        result = self.model.generate(
            encoder_output,
            [prompt] * batch_size,
            beam_size=options.beam_size,
            patience=options.patience,
            length_penalty=options.length_penalty,
            max_length=self.max_length,
            suppress_blank=options.suppress_blank,
            suppress_tokens=options.suppress_tokens,
        )

        tokens_batch = [x.sequences_ids[0] for x in result]

        def decode_batch(tokens: List[List[int]]) -> str:
            # drop ids >= eot (special tokens) before decoding
            res = []
            for tk in tokens:
                res.append([token for token in tk if token < tokenizer.eot])
            # text_tokens = [token for token in tokens if token < self.eot]
            return tokenizer.tokenizer.decode_batch(res)

        text = decode_batch(tokens_batch)

        return text

    def encode(self, features: np.ndarray) -> ctranslate2.StorageView:
        # When the model is running on multiple GPUs, the encoder output should be moved
        # to the CPU since we don't know which GPU will handle the next job.
        to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
        # unsqueeze if batch size = 1
        if len(features.shape) == 2:
            features = np.expand_dims(features, 0)
        features = get_ctranslate2_storage(features)

        return self.model.encode(features, to_cpu=to_cpu)
class FasterWhisperPipeline(Pipeline):
    """
    Huggingface Pipeline wrapper for FasterWhisperModel.
    """
    # TODO:
    # - add support for timestamp mode
    # - add support for custom inference kwargs

    def __init__(
        self,
        model: WhisperModel,
        vad,
        vad_params: dict,
        options: TranscriptionOptions,
        tokenizer: Optional[Tokenizer] = None,
        device: Union[int, str, "torch.device"] = -1,
        framework="pt",
        language: Optional[str] = None,
        suppress_numerals: bool = False,
        **kwargs,
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.options = options
        # remember the language passed at construction so transcribe() can
        # tell a preset language apart from a per-file detected one
        self.preset_language = language
        self.suppress_numerals = suppress_numerals
        self._batch_size = kwargs.pop("batch_size", None)
        self._num_workers = 1
        self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
        self.call_count = 0
        self.framework = framework
        if self.framework == "pt":
            if isinstance(device, torch.device):
                self.device = device
            elif isinstance(device, str):
                self.device = torch.device(device)
            elif device < 0:
                self.device = torch.device("cpu")
            else:
                self.device = torch.device(f"cuda:{device}")
        else:
            self.device = device

        # NOTE(review): calls Pipeline's *parent* initializer, deliberately
        # skipping Pipeline.__init__ (which expects an HF model object).
        super(Pipeline, self).__init__()
        self.vad_model = vad
        self._vad_params = vad_params

    def _sanitize_parameters(self, **kwargs):
        # HF Pipeline hook splitting kwargs into (preprocess, forward,
        # postprocess) parameter dicts.
        # NOTE(review): checks for "tokenizer" but reads kwargs["maybe_arg"] —
        # would raise KeyError if "tokenizer" is passed without "maybe_arg";
        # looks like leftover HF template code, confirm before relying on it.
        preprocess_kwargs = {}
        if "tokenizer" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, audio):
        # Convert one VAD chunk of raw audio into padded log-mel features.
        audio = audio['inputs']
        model_n_mels = self.model.feat_kwargs.get("feature_size")
        features = log_mel_spectrogram(
            audio,
            n_mels=model_n_mels if model_n_mels is not None else 80,
            padding=N_SAMPLES - audio.shape[0],
        )
        return {'inputs': features}

    def _forward(self, model_inputs):
        # Run batched generation on the feature batch.
        outputs = self.model.generate_segment_batched(model_inputs['inputs'], self.tokenizer, self.options)
        return {'text': outputs}

    def postprocess(self, model_outputs):
        # No post-processing: results are shaped in transcribe().
        return model_outputs

    def get_iterator(
        self,
        inputs,
        num_workers: int,
        batch_size: int,
        preprocess_params: dict,
        forward_params: dict,
        postprocess_params: dict,
    ):
        dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
        if "TOKENIZERS_PARALLELISM" not in os.environ:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
        # TODO hack by collating feature_extractor and image_processor

        def stack(items):
            # collate: stack per-chunk feature tensors into one batch tensor
            return {'inputs': torch.stack([x['inputs'] for x in items])}
        dataloader = torch.utils.data.DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=stack)
        model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
        final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
        return final_iterator

    def transcribe(
        self,
        audio: Union[str, np.ndarray],
        batch_size: Optional[int] = None,
        num_workers=0,
        language: Optional[str] = None,
        task: Optional[str] = None,
        chunk_size=30,
        print_progress=False,
        combined_progress=False,
        verbose=False,
    ) -> TranscriptionResult:
        """Transcribe audio: VAD-chunk it, then batch-decode the chunks.

        Returns a dict with "segments" (text + start/end per chunk) and the
        "language" used for decoding.
        """
        if isinstance(audio, str):
            audio = load_audio(audio)

        def data(audio, segments):
            # yield raw audio slices for each merged VAD chunk
            for seg in segments:
                f1 = int(seg['start'] * SAMPLE_RATE)
                f2 = int(seg['end'] * SAMPLE_RATE)
                # print(f2-f1)
                yield {'inputs': audio[f1:f2]}

        # Pre-process audio and merge chunks as defined by the respective VAD child class
        # In case vad_model is manually assigned (see 'load_model') follow the functionality of pyannote toolkit
        if issubclass(type(self.vad_model), Vad):
            waveform = self.vad_model.preprocess_audio(audio)
            merge_chunks = self.vad_model.merge_chunks
        else:
            waveform = Pyannote.preprocess_audio(audio)
            merge_chunks = Pyannote.merge_chunks

        vad_segments = self.vad_model({"waveform": waveform, "sample_rate": SAMPLE_RATE})
        vad_segments = merge_chunks(
            vad_segments,
            chunk_size,
            onset=self._vad_params["vad_onset"],
            offset=self._vad_params["vad_offset"],
        )
        # build (or rebuild) the tokenizer for the requested language/task
        if self.tokenizer is None:
            language = language or self.detect_language(audio)
            task = task or "transcribe"
            self.tokenizer = Tokenizer(
                self.model.hf_tokenizer,
                self.model.model.is_multilingual,
                task=task,
                language=language,
            )
        else:
            language = language or self.tokenizer.language_code
            task = task or self.tokenizer.task
            if task != self.tokenizer.task or language != self.tokenizer.language_code:
                self.tokenizer = Tokenizer(
                    self.model.hf_tokenizer,
                    self.model.model.is_multilingual,
                    task=task,
                    language=language,
                )

        if self.suppress_numerals:
            # temporarily extend the suppression list with digit/percent tokens
            previous_suppress_tokens = self.options.suppress_tokens
            numeral_symbol_tokens = find_numeral_symbol_tokens(self.tokenizer)
            print(f"Suppressing numeral and symbol tokens")
            new_suppressed_tokens = numeral_symbol_tokens + self.options.suppress_tokens
            new_suppressed_tokens = list(set(new_suppressed_tokens))
            self.options = replace(self.options, suppress_tokens=new_suppressed_tokens)

        segments: List[SingleSegment] = []
        batch_size = batch_size or self._batch_size
        total_segments = len(vad_segments)
        for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
            if print_progress:
                base_progress = ((idx + 1) / total_segments) * 100
                percent_complete = base_progress / 2 if combined_progress else base_progress
                print(f"Progress: {percent_complete:.2f}%...")
            text = out['text']
            if batch_size in [0, 1, None]:
                text = text[0]
            if verbose:
                print(f"Transcript: [{round(vad_segments[idx]['start'], 3)} --> {round(vad_segments[idx]['end'], 3)}] {text}")
            segments.append(
                {
                    "text": text,
                    "start": round(vad_segments[idx]['start'], 3),
                    "end": round(vad_segments[idx]['end'], 3)
                }
            )

        # revert the tokenizer if multilingual inference is enabled
        if self.preset_language is None:
            self.tokenizer = None

        # revert suppressed tokens if suppress_numerals is enabled
        if self.suppress_numerals:
            self.options = replace(self.options, suppress_tokens=previous_suppress_tokens)

        return {"segments": segments, "language": language}

    def detect_language(self, audio: np.ndarray) -> str:
        # Detect language from the first 30 seconds of audio.
        if audio.shape[0] < N_SAMPLES:
            print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
        model_n_mels = self.model.feat_kwargs.get("feature_size")
        segment = log_mel_spectrogram(audio[: N_SAMPLES],
                                      n_mels=model_n_mels if model_n_mels is not None else 80,
                                      padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
        encoder_output = self.model.encode(segment)
        results = self.model.model.detect_language(encoder_output)
        language_token, language_probability = results[0][0]
        # language token looks like "<|en|>"; strip the delimiters
        language = language_token[2:-2]
        print(f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio...")
        return language
def load_model(
    whisper_arch: str,
    device: str,
    device_index=0,
    compute_type="float16",
    asr_options: Optional[dict] = None,
    language: Optional[str] = None,
    vad_model: Optional[Vad] = None,
    vad_method: Optional[str] = "pyannote",
    vad_options: Optional[dict] = None,
    model: Optional[WhisperModel] = None,
    task="transcribe",
    download_root: Optional[str] = None,
    local_files_only=False,
    threads=4,
) -> FasterWhisperPipeline:
    """Load a Whisper model for inference.

    Args:
        whisper_arch - The name of the Whisper model to load.
        device - The device to load the model on.
        compute_type - The compute type to use for the model.
        vad_method - The vad method to use. vad_model has higher priority if is not None.
        options - A dictionary of options to use for the model.
        language - The language of the model. (use English for now)
        model - The WhisperModel instance to use.
        download_root - The root directory to download the model to.
        local_files_only - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
        threads - The number of cpu threads to use per worker, e.g. will be multiplied by num workers.
    Returns:
        A Whisper pipeline.
    Raises:
        ValueError: if `vad_method` is neither "silero" nor "pyannote".
    """
    # ".en" checkpoints are English-only; pin the language accordingly.
    if whisper_arch.endswith(".en"):
        language = "en"

    model = model or WhisperModel(whisper_arch,
                                  device=device,
                                  device_index=device_index,
                                  compute_type=compute_type,
                                  download_root=download_root,
                                  local_files_only=local_files_only,
                                  cpu_threads=threads)
    if language is not None:
        tokenizer = Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
    else:
        print("No language specified, language will first be detected for each audio file (increases inference time).")
        tokenizer = None

    default_asr_options = {
        "beam_size": 5,
        "best_of": 5,
        "patience": 1,
        "length_penalty": 1,
        "repetition_penalty": 1,
        "no_repeat_ngram_size": 0,
        "temperatures": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        "compression_ratio_threshold": 2.4,
        "log_prob_threshold": -1.0,
        "no_speech_threshold": 0.6,
        "condition_on_previous_text": False,
        "prompt_reset_on_temperature": 0.5,
        "initial_prompt": None,
        "prefix": None,
        "suppress_blank": True,
        "suppress_tokens": [-1],
        "without_timestamps": True,
        "max_initial_timestamp": 0.0,
        "word_timestamps": False,
        "prepend_punctuations": "\"'“¿([{-",
        "append_punctuations": "\"'.。,!?::”)]}、",
        "multilingual": model.model.is_multilingual,
        "suppress_numerals": False,
        "max_new_tokens": None,
        "clip_timestamps": None,
        "hallucination_silence_threshold": None,
        "hotwords": None,
    }

    if asr_options is not None:
        default_asr_options.update(asr_options)

    # "suppress_numerals" is a whisperx-only flag; strip it before building
    # faster-whisper's TranscriptionOptions, which doesn't accept it.
    suppress_numerals = default_asr_options["suppress_numerals"]
    del default_asr_options["suppress_numerals"]

    default_asr_options = TranscriptionOptions(**default_asr_options)

    default_vad_options = {
        "chunk_size": 30,  # needed by silero since binarization happens before merge_chunks
        "vad_onset": 0.500,
        "vad_offset": 0.363
    }

    if vad_options is not None:
        default_vad_options.update(vad_options)

    # Note: manually assigned vad_model has higher priority than vad_method!
    # (removed a redundant `vad_model = vad_model` self-assignment here)
    if vad_model is not None:
        print("Use manually assigned vad_model. vad_method is ignored.")
    elif vad_method == "silero":
        vad_model = Silero(**default_vad_options)
    elif vad_method == "pyannote":
        vad_model = Pyannote(torch.device(device), use_auth_token=None, **default_vad_options)
    else:
        raise ValueError(f"Invalid vad_method: {vad_method}")

    return FasterWhisperPipeline(
        model=model,
        vad=vad_model,
        options=default_asr_options,
        tokenizer=tokenizer,
        language=language,
        suppress_numerals=suppress_numerals,
        vad_params=default_vad_options,
    )

File diff suppressed because it is too large Load Diff

View File

@ -1 +0,0 @@
{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}

View File

@ -1 +0,0 @@
{"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}

File diff suppressed because one or more lines are too long

BIN
whisperx/assets/mel_filters.npz Executable file → Normal file

Binary file not shown.

View File

@ -1 +0,0 @@
{"<|endoftext|>": 50257}

File diff suppressed because it is too large Load Diff

View File

@ -1 +0,0 @@
{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}

View File

@ -1 +0,0 @@
{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "multilingual", "errors": "replace", "tokenizer_class": "GPT2Tokenizer"}

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1,25 +1,28 @@
import os
import subprocess
from functools import lru_cache
from typing import Union
from typing import Optional, Union
import ffmpeg
import numpy as np
import torch
import torch.nn.functional as F
from .utils import exact_div
from whisperx.utils import exact_div
# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
def load_audio(file: str, sr: int = SAMPLE_RATE):
def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
"""
Open an audio file and read as mono waveform, resampling as necessary
@ -36,14 +39,27 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
A NumPy array containing the audio waveform, in float32 dtype.
"""
try:
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
# Launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI to be installed.
cmd = [
"ffmpeg",
"-nostdin",
"-threads",
"0",
"-i",
file,
"-f",
"s16le",
"-ac",
"1",
"-acodec",
"pcm_s16le",
"-ar",
str(sr),
"-",
]
out = subprocess.run(cmd, capture_output=True, check=True).stdout
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
@ -55,7 +71,9 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
"""
if torch.is_tensor(array):
if array.shape[axis] > length:
array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))
array = array.index_select(
dim=axis, index=torch.arange(length, device=array.device)
)
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
@ -74,7 +92,7 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    # only the 80- and 128-bin filterbanks are stored in the bundled npz
    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of an audio waveform.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor]
        The path to an audio file, or the waveform itself (assumed 16 kHz mono)

    n_mels: int
        The number of Mel-frequency filters; only 80 and 128 are supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the log-Mel spectrogram, scaled to roughly [-1, 1]
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    # drop the last frame and take squared magnitudes
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    # clamp-before-log avoids -inf; the max-8 floor and affine rescale match
    # the normalization the Whisper encoder was trained with
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec

47
whisperx/conjunctions.py Normal file
View File

@ -0,0 +1,47 @@
# conjunctions.py
from typing import Set
# Per-language sets of conjunctions / linking words, used to pick natural
# sentence-split points. Keys are ISO 639-1 language codes.
# NOTE(review): several sets contain empty strings '' (e.g. 'fr', 'it', 'zh',
# 'vi', 'ko', 'hi') — these look like characters lost in an encoding/extraction
# step rather than intentional entries; verify against the original data.
conjunctions_by_language = {
    'en': {'and', 'whether', 'or', 'as', 'but', 'so', 'for', 'nor', 'which', 'yet', 'although', 'since', 'unless', 'when', 'while', 'because', 'if', 'how', 'that', 'than', 'who', 'where', 'what', 'near', 'before', 'after', 'across', 'through', 'until', 'once', 'whereas', 'even', 'both', 'either', 'neither', 'though'},
    'fr': {'et', 'ou', 'mais', 'parce', 'bien', 'pendant', 'quand', '', 'comme', 'si', 'que', 'avant', 'après', 'aussitôt', 'jusquà', 'à', 'malgré', 'donc', 'tant', 'puisque', 'ni', 'soit', 'bien', 'encore', 'dès', 'lorsque'},
    'de': {'und', 'oder', 'aber', 'weil', 'obwohl', 'während', 'wenn', 'wo', 'wie', 'dass', 'bevor', 'nachdem', 'sobald', 'bis', 'außer', 'trotzdem', 'also', 'sowie', 'indem', 'weder', 'sowohl', 'zwar', 'jedoch'},
    'es': {'y', 'o', 'pero', 'porque', 'aunque', 'sin', 'mientras', 'cuando', 'donde', 'como', 'si', 'que', 'antes', 'después', 'tan', 'hasta', 'a', 'a', 'por', 'ya', 'ni', 'sino'},
    'it': {'e', 'o', 'ma', 'perché', 'anche', 'mentre', 'quando', 'dove', 'come', 'se', 'che', 'prima', 'dopo', 'appena', 'fino', 'a', 'nonostante', 'quindi', 'poiché', '', 'ossia', 'cioè'},
    'ja': {'そして', 'または', 'しかし', 'なぜなら', 'もし', 'それとも', 'だから', 'それに', 'なのに', 'そのため', 'かつ', 'それゆえに', 'ならば', 'もしくは', 'ため'},
    'zh': {'', '', '但是', '因为', '任何', '', '虽然', '而且', '所以', '如果', '除非', '尽管', '既然', '即使', '只要', '直到', '然后', '因此', '不但', '而是', '不过'},
    'nl': {'en', 'of', 'maar', 'omdat', 'hoewel', 'terwijl', 'wanneer', 'waar', 'zoals', 'als', 'dat', 'voordat', 'nadat', 'zodra', 'totdat', 'tenzij', 'ondanks', 'dus', 'zowel', 'noch', 'echter', 'toch'},
    'uk': {'та', 'або', 'але', 'тому', 'хоча', 'поки', 'бо', 'коли', 'де', 'як', 'якщо', 'що', 'перш', 'після', 'доки', 'незважаючи', 'тому', 'ані'},
    'pt': {'e', 'ou', 'mas', 'porque', 'embora', 'enquanto', 'quando', 'onde', 'como', 'se', 'que', 'antes', 'depois', 'assim', 'até', 'a', 'apesar', 'portanto', '', 'pois', 'nem', 'senão'},
    'ar': {'و', 'أو', 'لكن', 'لأن', 'مع', 'بينما', 'عندما', 'حيث', 'كما', 'إذا', 'الذي', 'قبل', 'بعد', 'فور', 'حتى', 'إلا', 'رغم', 'لذلك', 'بما'},
    'cs': {'a', 'nebo', 'ale', 'protože', 'ačkoli', 'zatímco', 'když', 'kde', 'jako', 'pokud', 'že', 'než', 'poté', 'jakmile', 'dokud', 'pokud ne', 'navzdory', 'tak', 'stejně', 'ani', 'tudíž'},
    'ru': {'и', 'или', 'но', 'потому', 'хотя', 'пока', 'когда', 'где', 'как', 'если', 'что', 'перед', 'после', 'несмотря', 'таким', 'также', 'ни', 'зато'},
    'pl': {'i', 'lub', 'ale', 'ponieważ', 'chociaż', 'podczas', 'kiedy', 'gdzie', 'jak', 'jeśli', 'że', 'zanim', 'po', 'jak tylko', 'dopóki', 'chyba', 'pomimo', 'więc', 'tak', 'ani', 'czyli'},
    'hu': {'és', 'vagy', 'de', 'mert', 'habár', 'míg', 'amikor', 'ahol', 'ahogy', 'ha', 'hogy', 'mielőtt', 'miután', 'amint', 'amíg', 'hacsak', 'ellenére', 'tehát', 'úgy', 'sem', 'vagyis'},
    'fi': {'ja', 'tai', 'mutta', 'koska', 'vaikka', 'kun', 'missä', 'kuten', 'jos', 'että', 'ennen', 'sen jälkeen', 'heti', 'kunnes', 'ellei', 'huolimatta', 'siis', 'sekä', 'eikä', 'vaan'},
    'fa': {'و', 'یا', 'اما', 'چون', 'اگرچه', 'در حالی', 'وقتی', 'کجا', 'چگونه', 'اگر', 'که', 'قبل', 'پس', 'به محض', 'تا زمانی', 'مگر', 'با وجود', 'پس', 'همچنین', 'نه'},
    'el': {'και', 'ή', 'αλλά', 'επειδή', 'αν', 'ενώ', 'όταν', 'όπου', 'όπως', 'αν', 'που', 'προτού', 'αφού', 'μόλις', 'μέχρι', 'εκτός', 'παρά', 'έτσι', 'όπως', 'ούτε', 'δηλαδή'},
    'tr': {'ve', 'veya', 'ama', 'çünkü', 'her ne', 'iken', 'nerede', 'nasıl', 'eğer', 'ki', 'önce', 'sonra', 'hemen', 'kadar', 'rağmen', 'hem', 'ne', 'yani'},
    'da': {'og', 'eller', 'men', 'fordi', 'selvom', 'mens', 'når', 'hvor', 'som', 'hvis', 'at', 'før', 'efter', 'indtil', 'medmindre', 'således', 'ligesom', 'hverken', 'altså'},
    'he': {'ו', 'או', 'אבל', 'כי', 'אף', 'בזמן', 'כאשר', 'היכן', 'כיצד', 'אם', 'ש', 'לפני', 'אחרי', 'ברגע', 'עד', 'אלא', 'למרות', 'לכן', 'כמו', 'לא', 'אז'},
    'vi': {'', 'hoặc', 'nhưng', 'bởi', 'mặc', 'trong', 'khi', '', 'như', 'nếu', 'rằng', 'trước', 'sau', 'ngay', 'cho', 'trừ', 'mặc', '', 'giống', 'cũng', 'tức'},
    'ko': {'그리고', '또는','그런데','그래도', '이나', '결국', '마지막으로', '마찬가지로', '반면에', '아니면', '거나', '또는', '그럼에도', '그렇기', '때문에', '덧붙이자면', '게다가', '그러나', '', '그래서', '', '한다면', '하지만', '무엇', '왜냐하면', '비록', '동안', '언제', '어디서', '어떻게', '만약', '', '전에', '후에', '즉시', '까지', '아니라면', '불구하고', '따라서', '같은', ''},
    'ur': {'اور', 'یا', 'مگر', 'کیونکہ', 'اگرچہ', 'جبکہ', 'جب', 'کہاں', 'کس طرح', 'اگر', 'کہ', 'سے پہلے', 'کے بعد', 'جیسے ہی', 'تک', 'اگر نہیں تو', 'کے باوجود', 'اس لئے', 'جیسے', 'نہ'},
    'hi': {'और', 'या', 'पर', 'तो', '', 'फिर', 'हालांकि', 'चूंकि', 'अगर', 'कैसे', 'वह', 'से', 'जो', 'जहां', 'क्या', 'नजदीक', 'पहले', 'बाद', 'के', 'पार', 'माध्यम', 'तक', 'एक', 'जबकि', 'यहां', 'तक', 'दोनों', 'या', '', 'हालांकि'}
}
# Comma character to use per language when re-joining split text.
# Languages not listed here fall back to the ASCII comma (see get_comma).
# NOTE(review): the 'ja' and 'zh' values are empty strings — the fullwidth
# ideographic commas ('、' / '，') were likely lost to an encoding issue;
# confirm against the original file before relying on them.
commas_by_language = {
    'ja': '',
    'zh': '',
    'fa': '،',
    'ur': '،'
}
def get_conjunctions(lang_code: str) -> Set[str]:
    """Return the conjunction set for *lang_code*; empty set if unknown."""
    try:
        return conjunctions_by_language[lang_code]
    except KeyError:
        return set()
def get_comma(lang_code: str) -> str:
    """Return the comma character for *lang_code* (ASCII ',' by default)."""
    if lang_code in commas_by_language:
        return commas_by_language[lang_code]
    return ","

View File

@ -1,710 +0,0 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical
from .audio import CHUNK_LENGTH
from .tokenizer import Tokenizer, get_tokenizer
from .utils import compression_ratio
if TYPE_CHECKING:
from .model import Whisper
@torch.no_grad()
def detect_language(model: "Whisper", mel: Tensor, tokenizer: Optional[Tokenizer] = None) -> Tuple[Tensor, List[dict]]:
    """
    Detect the spoken language in the audio, and return them as list of strings, along with the ids
    of the most probable language tokens and the probability distribution over all language tokens.
    This is performed outside the main decode loop in order to not interfere with kv-caching.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appears after the startoftranscript token.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all languages.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer(model.is_multilingual)
    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
        raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")

    # a single (n_mels, n_frames) input is treated as a batch of one
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # skip encoder forward pass if already-encoded audio features were given
    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
        mel = model.encoder(mel)

    # forward pass using a single token, startoftranscript
    n_audio = mel.shape[0]
    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
    logits = model.logits(x, mel)[:, 0]

    # collect detected languages; suppress all non-language tokens
    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
    mask[list(tokenizer.all_language_tokens)] = False
    logits[:, mask] = -np.inf
    language_tokens = logits.argmax(dim=-1)
    language_token_probs = logits.softmax(dim=-1).cpu()
    language_probs = [
        {
            c: language_token_probs[i, j].item()
            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs
@dataclass(frozen=True)
class DecodingOptions:
    """Immutable options controlling how one 30-second segment is decoded."""

    task: str = "transcribe"  # whether to perform X->X "transcribe" or X->English "translate"
    language: Optional[str] = None  # language that the audio is in; uses detected language if None

    # sampling-related options
    temperature: float = 0.0
    sample_len: Optional[int] = None  # maximum number of tokens to sample
    best_of: Optional[int] = None  # number of independent samples to collect, when t > 0
    beam_size: Optional[int] = None  # number of beams in beam search, when t == 0
    patience: Optional[float] = None  # patience in beam search (https://arxiv.org/abs/2204.05424)

    # options for ranking generations (either beams or best-of-N samples)
    length_penalty: Optional[float] = None  # "alpha" in Google NMT, None defaults to length norm

    # prompt, prefix, and token suppression
    prompt: Optional[Union[str, List[int]]] = None  # text or tokens for the previous context
    prefix: Optional[Union[str, List[int]]] = None  # text or tokens to prefix the current context
    suppress_blank: bool = True  # this will suppress blank outputs

    # list of tokens ids (or comma-separated token ids) to suppress
    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"

    # timestamp sampling options
    without_timestamps: bool = False  # use <|notimestamps|> to sample text tokens only
    max_initial_timestamp: Optional[float] = 1.0  # the initial timestamp cannot be later than this

    # implementation details
    fp16: bool = True  # use fp16 for most of the calculation
@dataclass(frozen=True)
class DecodingResult:
    """Result of decoding one audio segment: text, tokens and quality metrics."""

    audio_features: Tensor  # encoder output for this segment
    language: str  # detected or requested language code
    language_probs: Optional[Dict[str, float]] = None  # only populated for the lang_id task
    tokens: List[int] = field(default_factory=list)  # sampled token ids (prompt/prefix stripped)
    text: str = ""  # decoded text, whitespace-stripped
    avg_logprob: float = np.nan  # cumulative logprob divided by (token count + 1)
    no_speech_prob: float = np.nan  # probability of the no-speech token at the SOT position
    temperature: float = np.nan  # sampling temperature that produced this result
    compression_ratio: float = np.nan  # gzip compression ratio of the text (repetition signal)
class Inference:
    """Abstract interface the decoding loop uses to run the decoder forward pass."""

    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Perform a forward pass on the decoder and return per-token logits"""
        raise NotImplementedError

    def rearrange_kv_cache(self, source_indices) -> None:
        """Update the key-value cache according to the updated beams"""
        raise NotImplementedError

    def cleanup_caching(self) -> None:
        """Clean up any resources or hooks after decoding is finished"""
        pass
class PyTorchInference(Inference):
    """Inference backed by a PyTorch Whisper model, with key/value caching."""

    def __init__(self, model: "Whisper", initial_token_length: int):
        self.model: "Whisper" = model
        self.initial_token_length = initial_token_length
        self.kv_cache = {}
        self.hooks = []

    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Run the decoder and return per-token logits, reusing the kv cache."""
        if not self.kv_cache:
            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()

        # After the first forward pass the cache already covers earlier
        # positions, so only the newest token needs to go through the decoder.
        if tokens.shape[-1] > self.initial_token_length:
            tokens = tokens[:, -1:]

        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)

    def cleanup_caching(self):
        """Remove the installed forward hooks and drop all cached tensors."""
        while self.hooks:
            self.hooks.pop().remove()

        self.kv_cache = {}
        self.hooks = []

    def rearrange_kv_cache(self, source_indices):
        """Reorder every cached tensor to match the surviving beam indices."""
        for module, cached in self.kv_cache.items():
            self.kv_cache[module] = cached[source_indices].detach()
class SequenceRanker:
    """Abstract strategy for picking the best candidate sequence per audio input."""

    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]:
        """
        Given a list of groups of samples and their cumulative log probabilities,
        return the indices of the samples in each group to select as the final result
        """
        raise NotImplementedError
class MaximumLikelihoodRanker(SequenceRanker):
    """
    Select the sample with the highest log probabilities, penalized using either
    a simple length normalization or Google NMT paper's length penalty
    """

    def __init__(self, length_penalty: Optional[float]):
        self.length_penalty = length_penalty

    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
        """Return, for each audio input, the index of its best-scoring candidate."""

        def _score(logprob: float, length: int) -> float:
            if self.length_penalty is None:
                penalty = length  # plain length normalization
            else:
                # length penalty from the Google NMT paper
                penalty = ((5 + length) / 6) ** self.length_penalty
            return logprob / penalty

        best_indices = []
        for group_logprobs, group_tokens in zip(sum_logprobs, tokens):
            lengths = [len(t) for t in group_tokens]
            scored = [_score(lp, n) for lp, n in zip(group_logprobs, lengths)]
            best_indices.append(np.argmax(scored))
        return best_indices
class TokenDecoder:
    """Abstract strategy for choosing the next token at each decoding step."""

    def reset(self):
        """Initialize any stateful variables for decoding a new sequence"""

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        """Specify how to select the next token, based on the current trace and logits

        Parameters
        ----------
        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        sum_logprobs : Tensor, shape = (n_batch)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
            the tokens, appended with the selected next token

        completed : bool
            True if all sequences has reached the end of text
        """
        raise NotImplementedError

    def finalize(
        self, tokens: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
        """Finalize search and return the final candidate sequences

        Parameters
        ----------
        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence

        sum_logprobs : Tensor, shape = (n_audio, n_group)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Sequence[Sequence[Tensor]], length = n_audio
            sequence of Tensors containing candidate token sequences, for each audio input

        sum_logprobs : List[List[float]], length = n_audio
            sequence of cumulative log probabilities corresponding to the above
        """
        raise NotImplementedError
class GreedyDecoder(TokenDecoder):
    """Greedy / temperature sampling: argmax at T == 0, otherwise a draw from
    the temperature-scaled categorical distribution."""

    def __init__(self, temperature: float, eot: int):
        self.temperature = temperature
        self.eot = eot

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        """Append one sampled token per sequence; report whether all hit EOT."""
        if self.temperature == 0:
            next_tokens = logits.argmax(dim=-1)
        else:
            next_tokens = Categorical(logits=logits / self.temperature).sample()

        logprobs = F.log_softmax(logits.float(), dim=-1)
        batch_index = torch.arange(logprobs.shape[0])
        current_logprobs = logprobs[batch_index, next_tokens]

        # sequences that have already emitted EOT stop accumulating log
        # probability and are forced to keep emitting EOT
        still_running = tokens[:, -1] != self.eot
        sum_logprobs += current_logprobs * still_running
        next_tokens[~still_running] = self.eot

        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
        completed = (tokens[:, -1] == self.eot).all()
        return tokens, completed

    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
        """Pad every sequence with a trailing EOT and return plain-float scores."""
        tokens = F.pad(tokens, (0, 1), value=self.eot)
        return tokens, sum_logprobs.tolist()
class BeamSearchDecoder(TokenDecoder):
    """Beam search over `beam_size` candidates per audio input; `patience`
    (https://arxiv.org/abs/2204.05424) scales how many finished candidates are
    collected before the search is considered complete."""

    def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None):
        self.beam_size = beam_size
        self.eot = eot
        self.inference = inference  # needed to reorder the kv cache when beams are reshuffled
        self.patience = patience or 1.0
        self.max_candidates: int = round(beam_size * self.patience)
        # one dict per audio input, mapping finished token tuple -> cumulative logprob
        self.finished_sequences = None

        assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})"

    def reset(self):
        # clear per-batch state before decoding a new batch
        self.finished_sequences = None

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        if tokens.shape[0] % self.beam_size != 0:
            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")

        n_audio = tokens.shape[0] // self.beam_size
        if self.finished_sequences is None:  # for the first update
            self.finished_sequences = [{} for _ in range(n_audio)]

        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []
        for i in range(n_audio):
            scores, sources, finished = {}, {}, {}

            # STEP 1: calculate the cumulative log probabilities for possible candidates
            for j in range(self.beam_size):
                idx = i * self.beam_size + j
                prefix = tokens[idx].tolist()
                # topk(beam_size + 1) so that an EOT hit still leaves beam_size
                # continuations to choose from
                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
                    new_logprob = (sum_logprobs[idx] + logprob).item()
                    sequence = tuple(prefix + [token.item()])
                    scores[sequence] = new_logprob
                    sources[sequence] = idx

            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
            saved = 0
            for sequence in sorted(scores, key=scores.get, reverse=True):
                if sequence[-1] == self.eot:
                    finished[sequence] = scores[sequence]
                else:
                    sum_logprobs[len(next_tokens)] = scores[sequence]
                    next_tokens.append(sequence)
                    source_indices.append(sources[sequence])

                    saved += 1
                    if saved == self.beam_size:
                        break

            finished_sequences.append(finished)

        tokens = torch.tensor(next_tokens, device=tokens.device)
        self.inference.rearrange_kv_cache(source_indices)

        # add newly finished sequences to self.finished_sequences
        assert len(self.finished_sequences) == len(finished_sequences)
        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates:
                    break  # the candidate list is full
                previously_finished[seq] = newly_finished[seq]

        # mark as completed if all audio has enough number of samples
        completed = all(
            len(sequences) >= self.max_candidates for sequences in self.finished_sequences
        )
        return tokens, completed

    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
        # collect all finished sequences, including patience, and add unfinished ones if not enough
        sum_logprobs = sum_logprobs.cpu()
        for i, sequences in enumerate(self.finished_sequences):
            if len(sequences) < self.beam_size:  # when not enough sequences are finished
                # fall back to the best unfinished beams, forced to end with EOT
                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
                    if len(sequences) >= self.beam_size:
                        break

        tokens: List[List[Tensor]] = [
            [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences
        ]
        sum_logprobs: List[List[float]] = [
            list(sequences.values()) for sequences in self.finished_sequences
        ]
        return tokens, sum_logprobs
class LogitFilter:
    """Abstract in-place transform applied to the logits before each sampling step."""

    def apply(self, logits: Tensor, tokens: Tensor) -> None:
        """Apply any filtering or masking to logits in-place

        Parameters
        ----------
        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens
        """
        raise NotImplementedError
class SuppressBlank(LogitFilter):
    """Forbid a blank (space or EOT) from being the very first sampled token."""

    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin

    def apply(self, logits: Tensor, tokens: Tensor):
        """Mask the space and EOT tokens, but only at the first sampling step."""
        if tokens.shape[1] != self.sample_begin:
            return
        blank_ids = self.tokenizer.encode(" ") + [self.tokenizer.eot]
        logits[:, blank_ids] = -np.inf
class SuppressTokens(LogitFilter):
    """Mask out a fixed collection of token ids at every decoding step."""

    def __init__(self, suppress_tokens: Sequence[int]):
        self.suppress_tokens = list(suppress_tokens)

    def apply(self, logits: Tensor, tokens: Tensor):
        logits[:, self.suppress_tokens] = -np.inf
class ApplyTimestampRules(LogitFilter):
    """Enforce Whisper's timestamp grammar: timestamps come in pairs (except
    right before EOT), the first sampled token must be a timestamp, and the
    initial timestamp may be capped at `max_initial_timestamp_index`."""

    def __init__(
        self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int]
    ):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin
        self.max_initial_timestamp_index = max_initial_timestamp_index

    def apply(self, logits: Tensor, tokens: Tensor):
        # suppress <|notimestamps|> which is handled by without_timestamps
        if self.tokenizer.no_timestamps is not None:
            logits[:, self.tokenizer.no_timestamps] = -np.inf

        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
        for k in range(tokens.shape[0]):
            seq = [t for t in tokens[k, self.sample_begin :].tolist()]
            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin

            if last_was_timestamp:
                if penultimate_was_timestamp:  # has to be non-timestamp
                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
                else:  # cannot be normal text tokens
                    logits[k, : self.tokenizer.eot] = -np.inf

        if tokens.shape[1] == self.sample_begin:
            # suppress generating non-timestamp tokens at the beginning
            logits[:, : self.tokenizer.timestamp_begin] = -np.inf

            # apply the `max_initial_timestamp` option
            if self.max_initial_timestamp_index is not None:
                last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
                logits[:, last_allowed + 1 :] = -np.inf

        # if sum of probability over timestamps is above any other token, sample timestamp
        logprobs = F.log_softmax(logits.float(), dim=-1)
        for k in range(tokens.shape[0]):
            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1)
            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
            if timestamp_logprob > max_text_token_logprob:
                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
class DecodingTask:
    """Orchestrates one full decoding pass over a batch of mel segments.

    Builds the tokenizer, inference wrapper, sequence ranker, token decoder and
    logit filters from a `DecodingOptions` instance, then `run()` performs the
    sampling loop and assembles one `DecodingResult` per segment.
    """

    inference: Inference
    sequence_ranker: SequenceRanker
    decoder: TokenDecoder
    logit_filters: List[LogitFilter]

    def __init__(self, model: "Whisper", options: DecodingOptions):
        self.model = model

        language = options.language or "en"
        tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task)
        self.tokenizer: Tokenizer = tokenizer
        self.options: DecodingOptions = self._verify_options(options)

        self.n_group: int = options.beam_size or options.best_of or 1
        self.n_ctx: int = model.dims.n_text_ctx
        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2

        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
        if self.options.without_timestamps:
            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps

        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
        self.sample_begin: int = len(self.initial_tokens)
        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)

        # inference: implements the forward pass through the decoder, including kv caching
        self.inference = PyTorchInference(model, len(self.initial_tokens))

        # sequence ranker: implements how to rank a group of sampled sequences
        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)

        # decoder: implements how to select the next tokens, given the autoregressive distribution
        if options.beam_size is not None:
            self.decoder = BeamSearchDecoder(
                options.beam_size, tokenizer.eot, self.inference, options.patience
            )
        else:
            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)

        # logit filters: applies various rules to suppress or penalize certain tokens
        self.logit_filters = []
        if self.options.suppress_blank:
            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
        if self.options.suppress_tokens:
            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
        if not options.without_timestamps:
            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
            max_initial_timestamp_index = None
            if options.max_initial_timestamp:
                max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision)
            self.logit_filters.append(
                ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index)
            )

    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
        """Reject mutually-incompatible option combinations early."""
        if options.beam_size is not None and options.best_of is not None:
            raise ValueError("beam_size and best_of can't be given together")
        if options.temperature == 0:
            if options.best_of is not None:
                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
        if options.patience is not None and options.beam_size is None:
            raise ValueError("patience requires beam_size to be given")
        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")

        return options

    def _get_initial_tokens(self) -> Tuple[int]:
        """Build the initial token sequence: [sot_prev + prompt] + sot_sequence + prefix."""
        tokens = list(self.sot_sequence)
        prefix = self.options.prefix
        prompt = self.options.prompt

        if prefix:
            prefix_tokens = (
                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
            )
            if self.sample_len is not None:
                # leave room in the context window for sampled tokens
                max_prefix_len = self.n_ctx // 2 - self.sample_len
                prefix_tokens = prefix_tokens[-max_prefix_len:]
            tokens = tokens + prefix_tokens

        if prompt:
            prompt_tokens = (
                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
            )
            # the prompt occupies at most the first half of the context
            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens

        return tuple(tokens)

    def _get_suppress_tokens(self) -> Tuple[int]:
        """Resolve the `suppress_tokens` option into a sorted tuple of token ids."""
        suppress_tokens = self.options.suppress_tokens

        if isinstance(suppress_tokens, str):
            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]

        if -1 in suppress_tokens:
            # "-1" is a marker for the tokenizer's default non-speech symbol set
            suppress_tokens = [t for t in suppress_tokens if t >= 0]
            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
        elif suppress_tokens is None or len(suppress_tokens) == 0:
            suppress_tokens = []  # interpret empty string as an empty list
        else:
            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"

        # the special tokens are always suppressed
        suppress_tokens.extend(
            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
        )
        if self.tokenizer.no_speech is not None:
            # no-speech probability is collected separately
            suppress_tokens.append(self.tokenizer.no_speech)

        return tuple(sorted(set(suppress_tokens)))

    def _get_audio_features(self, mel: Tensor):
        """Run the encoder (unless `mel` is already encoded) and validate the dtype."""
        if self.options.fp16:
            mel = mel.half()

        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
            # encoded audio features are given; skip audio encoding
            audio_features = mel
        else:
            audio_features = self.model.encoder(mel)

        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
            # BUG FIX: this previously *returned* the TypeError instance instead
            # of raising it, silently handing the exception object to callers
            # as if it were the audio features
            raise TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")

        return audio_features

    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
        """Detect per-segment languages when needed; may overwrite language tokens in-place."""
        languages = [self.options.language] * audio_features.shape[0]
        lang_probs = None

        if self.options.language is None or self.options.task == "lang_id":
            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
            languages = [max(probs, key=probs.get) for probs in lang_probs]
            if self.options.language is None:
                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens

        return languages, lang_probs

    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
        """Autoregressive sampling loop; returns tokens, scores and no-speech probs."""
        assert audio_features.shape[0] == tokens.shape[0]
        n_batch = tokens.shape[0]
        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
        no_speech_probs = [np.nan] * n_batch

        try:
            for i in range(self.sample_len):
                logits = self.inference.logits(tokens, audio_features)

                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()

                # now we need to consider the logits at the last token only
                logits = logits[:, -1]

                # apply the logit filters, e.g. for suppressing or applying penalty to
                for logit_filter in self.logit_filters:
                    logit_filter.apply(logits, tokens)

                # expand the tokens tensor with the selected next tokens
                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)

                if completed or tokens.shape[-1] > self.n_ctx:
                    break
        finally:
            # always release the kv-cache hooks, even if decoding fails midway
            self.inference.cleanup_caching()

        return tokens, sum_logprobs, no_speech_probs

    @torch.no_grad()
    def run(self, mel: Tensor) -> List[DecodingResult]:
        """Decode a batch of mel segments and return one `DecodingResult` each."""
        self.decoder.reset()
        tokenizer: Tokenizer = self.tokenizer
        n_audio: int = mel.shape[0]

        audio_features: Tensor = self._get_audio_features(mel)  # encoder forward pass
        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)

        # detect language if requested, overwriting the language token
        languages, language_probs = self._detect_language(audio_features, tokens)
        if self.options.task == "lang_id":
            return [
                DecodingResult(audio_features=features, language=language, language_probs=probs)
                for features, language, probs in zip(audio_features, languages, language_probs)
            ]

        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
        audio_features = audio_features.repeat_interleave(self.n_group, dim=0)
        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)

        # call the main sampling loop
        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)

        # reshape the tensors to have (n_audio, n_group) as the first two dimensions
        audio_features = audio_features[:: self.n_group]
        no_speech_probs = no_speech_probs[:: self.n_group]
        assert audio_features.shape[0] == len(no_speech_probs) == n_audio

        tokens = tokens.reshape(n_audio, self.n_group, -1)
        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)

        # get the final candidates for each group, and slice between the first sampled token and EOT
        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
        tokens: List[List[Tensor]] = [
            [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens
        ]

        # select the top-ranked sample in each group
        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
        tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]

        sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
        avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)]

        fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs)
        if len(set(map(len, fields))) != 1:
            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")

        return [
            DecodingResult(
                audio_features=features,
                language=language,
                tokens=tokens,
                text=text,
                avg_logprob=avg_logprob,
                no_speech_prob=no_speech_prob,
                temperature=self.options.temperature,
                compression_ratio=compression_ratio(text),
            )
            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields)
        ]
@torch.no_grad()
def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]:
    """
    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).

    Parameters
    ----------
    model: Whisper
        the Whisper model instance

    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
        A tensor containing the Mel spectrogram(s)

    options: DecodingOptions
        A dataclass that contains all necessary options for decoding 30-second segments

    Returns
    -------
    result: Union[DecodingResult, List[DecodingResult]]
        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
    """
    # a single spectrogram is decoded as a batch of one, then unwrapped
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    result = DecodingTask(model, options).run(mel)

    if single:
        result = result[0]

    return result

86
whisperx/diarize.py Normal file
View File

@ -0,0 +1,86 @@
import numpy as np
import pandas as pd
from pyannote.audio import Pipeline
from typing import Optional, Union
import torch
from whisperx.audio import load_audio, SAMPLE_RATE
from whisperx.types import TranscriptionResult, AlignedTranscriptionResult
class DiarizationPipeline:
    """Thin wrapper around a pyannote speaker-diarization pipeline that returns a DataFrame."""

    def __init__(
        self,
        model_name=None,
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        # Accept either a device string (e.g. "cuda") or a torch.device instance.
        if isinstance(device, str):
            device = torch.device(device)
        # Fall back to the default pyannote diarization checkpoint when none is given.
        model_config = model_name or "pyannote/speaker-diarization-3.1"
        self.model = Pipeline.from_pretrained(model_config, use_auth_token=use_auth_token).to(device)

    def __call__(
        self,
        audio: Union[str, np.ndarray],
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ):
        # A path is loaded from disk; an ndarray is assumed to already be mono audio.
        if isinstance(audio, str):
            audio = load_audio(audio)
        waveform = torch.from_numpy(audio[None, :])
        annotation = self.model(
            {"waveform": waveform, "sample_rate": SAMPLE_RATE},
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers,
        )
        # One row per diarization turn, with scalar start/end columns for easy overlap math.
        frame = pd.DataFrame(annotation.itertracks(yield_label=True), columns=["segment", "label", "speaker"])
        frame["start"] = frame["segment"].apply(lambda turn: turn.start)
        frame["end"] = frame["segment"].apply(lambda turn: turn.end)
        return frame
def assign_word_speakers(
    diarize_df: pd.DataFrame,
    transcript_result: "Union[AlignedTranscriptionResult, TranscriptionResult]",
    fill_nearest=False,
) -> dict:
    """
    Assign a speaker to each transcript segment (and to each word that has
    timings) by maximum total temporal overlap with the diarization turns.

    Parameters
    ----------
    diarize_df: pd.DataFrame
        Diarization output with 'speaker', 'start' and 'end' columns.
    transcript_result:
        Transcription result holding a "segments" list; mutated in place.
    fill_nearest: bool
        When True, spans with no positive overlap are still labeled (the
        least-negative overlap, i.e. the nearest turn, wins).

    Returns
    -------
    dict
        The same `transcript_result` object, with "speaker" keys added.

    Notes
    -----
    Unlike the previous implementation, `diarize_df` is NOT mutated (no
    scratch 'intersection'/'union' columns are added), and the unused
    'union' computation was dropped.
    """
    turn_starts = diarize_df["start"].to_numpy()
    turn_ends = diarize_df["end"].to_numpy()
    turn_speakers = diarize_df["speaker"].to_numpy()

    def _best_speaker(span_start, span_end):
        # Overlap of each diarization turn with [span_start, span_end];
        # negative values mean the turn does not intersect the span.
        overlap = np.minimum(turn_ends, span_end) - np.maximum(turn_starts, span_start)
        candidates = pd.Series(overlap, index=turn_speakers)
        if not fill_nearest:
            # Keep only turns that actually intersect the span.
            candidates = candidates[candidates > 0]
        if candidates.empty:
            return None
        # Total overlap per speaker; idxmax breaks ties on the first label.
        return candidates.groupby(level=0).sum().idxmax()

    for seg in transcript_result["segments"]:
        speaker = _best_speaker(seg["start"], seg["end"])
        if speaker is not None:
            seg["speaker"] = speaker
        # Word-level assignment only when alignment produced timings.
        for word in seg.get("words", []):
            if "start" not in word:
                continue
            speaker = _best_speaker(word["start"], word["end"])
            if speaker is not None:
                word["speaker"] = speaker

    return transcript_result
class Segment:
    """A simple time span with an optional speaker label."""

    def __init__(self, start: int, end: int, speaker: Optional[str] = None):
        # Values are stored exactly as given; units are defined by the caller.
        self.start, self.end, self.speaker = start, end, speaker

View File

@ -1,268 +0,0 @@
from dataclasses import dataclass
from typing import Dict
from typing import Iterable, Optional
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
from .transcribe import transcribe as transcribe_function
from .decoding import detect_language as detect_language_function, decode as decode_function
@dataclass
class ModelDimensions:
    """Size hyperparameters of a Whisper checkpoint.

    The audio fields are forwarded to AudioEncoder and the text fields to
    TextDecoder by Whisper.__init__ below.
    """

    n_mels: int  # mel bins of the input spectrogram (encoder conv1 in_channels)
    n_audio_ctx: int  # audio positions (encoder positional-embedding length)
    n_audio_state: int  # encoder hidden width
    n_audio_head: int  # encoder attention heads per block
    n_audio_layer: int  # number of encoder blocks
    n_vocab: int  # token vocabulary size (also used by is_multilingual)
    n_text_ctx: int  # max text positions (decoder positional-embedding length)
    n_text_state: int  # decoder hidden width
    n_text_head: int  # decoder attention heads per block
    n_text_layer: int  # number of decoder blocks
class LayerNorm(nn.LayerNorm):
    """LayerNorm that normalizes in float32 and casts the result back to the input dtype."""

    def forward(self, x: Tensor) -> Tensor:
        original_dtype = x.dtype
        normalized = super().forward(x.float())
        return normalized.type(original_dtype)
class Linear(nn.Linear):
    """Linear layer that casts its parameters to the input dtype at call time."""

    def forward(self, x: Tensor) -> Tensor:
        weight = self.weight.to(x.dtype)
        bias = None if self.bias is None else self.bias.to(x.dtype)
        return F.linear(x, weight, bias)
class Conv1d(nn.Conv1d):
    """Conv1d that casts weight and bias to the input dtype before convolving."""

    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
        cast_bias = bias.to(x.dtype) if bias is not None else None
        return super()._conv_forward(x, weight.to(x.dtype), cast_bias)
def sinusoids(length, channels, max_timescale=10000):
    """Returns sinusoids for positional embedding"""
    assert channels % 2 == 0
    half = channels // 2
    # Geometrically spaced inverse timescales from 1 down to 1/max_timescale.
    log_increment = np.log(max_timescale) / (half - 1)
    frequencies = torch.exp(-log_increment * torch.arange(half))
    angles = torch.arange(length)[:, np.newaxis] * frequencies[np.newaxis, :]
    # First `half` channels are sines, the remaining `half` are cosines.
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
class MultiHeadAttention(nn.Module):
    """Multi-head attention supporting self- and cross-attention with an optional kv cache."""

    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        q = self.query(x)

        use_cached = kv_cache is not None and xa is not None and self.key in kv_cache
        if use_cached:
            # Cross-attention after the first call: reuse the cached projections.
            k = kv_cache[self.key]
            v = kv_cache[self.value]
        else:
            # Self-attention, or the first cross-attention call. When hooks are
            # installed (kv_cache is not None), they intercept these projections.
            source = x if xa is None else xa
            k = self.key(source)
            v = self.value(source)

        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out(wv), qk

    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
        n_batch, n_ctx, n_state = q.shape
        # Scale q and k symmetrically by head_dim**-0.25 (their product gives 1/sqrt(head_dim)).
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            qk = qk + mask[:n_ctx, :n_ctx]
        # Softmax is computed in float32; the float qk is also what gets returned.
        qk = qk.float()

        weights = F.softmax(qk, dim=-1).to(q.dtype)
        attended = (weights @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
        return attended, qk.detach()
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention, optional cross-attention, then an MLP."""

    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()
        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        # Cross-attention modules exist only when requested (decoder blocks).
        self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None
        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None

        hidden = n_state * 4
        self.mlp = nn.Sequential(Linear(n_state, hidden), nn.GELU(), Linear(hidden, n_state))
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        attn_out, _ = self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
        x = x + attn_out
        if self.cross_attn:
            cross_out, _ = self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
            x = x + cross_out
        return x + self.mlp(self.mlp_ln(x))
class AudioEncoder(nn.Module):
    """Audio encoder: two strided convolutions, fixed sinusoidal positions, transformer blocks."""

    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        # conv2 halves the time resolution (stride=2).
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        # Non-learned positional embedding, registered as a buffer.
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        features = F.gelu(self.conv1(x))
        features = F.gelu(self.conv2(features))
        # (batch, state, ctx) -> (batch, ctx, state) for the attention blocks.
        features = features.permute(0, 2, 1)

        assert features.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        features = (features + self.positional_embedding).to(features.dtype)

        for block in self.blocks:
            features = block(features)

        return self.ln_post(features)
class TextDecoder(nn.Module):
    """Text decoder: learned token/position embeddings over causal cross-attending transformer blocks."""

    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()
        self.token_embedding = nn.Embedding(n_vocab, n_state)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        )
        self.ln = LayerNorm(n_state)

        # Causal mask: -inf above the diagonal blocks attention to future positions.
        causal_mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", causal_mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        """
        # With an active kv cache only the new token suffix is passed in, so the
        # positional embedding is offset by the cached sequence length.
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        hidden = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        hidden = hidden.to(xa.dtype)

        for block in self.blocks:
            hidden = block(hidden, xa, mask=self.mask, kv_cache=kv_cache)

        hidden = self.ln(hidden)
        # Tie output projection to the token embedding (weight transpose).
        return (hidden @ torch.transpose(self.token_embedding.weight.to(hidden.dtype), 0, 1)).float()
class Whisper(nn.Module):
    """Full Whisper model: an AudioEncoder over mel spectrograms and a TextDecoder over tokens."""

    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        self.decoder = TextDecoder(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        )

    def embed_audio(self, mel: torch.Tensor):
        """Encode a mel spectrogram into audio features."""
        return self.encoder(mel)

    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
        """Compute decoder logits for `tokens` over precomputed audio features."""
        return self.decoder(tokens, audio_features)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Encode then decode in a single pass.
        return self.decoder(tokens, self.encoder(mel))

    @property
    def device(self):
        # Device of the first parameter; assumes all parameters share one device.
        return next(self.parameters()).device

    @property
    def is_multilingual(self):
        # NOTE(review): multilinguality is inferred purely from the vocabulary
        # size (51865); confirm this matches the checkpoints in use.
        return self.dims.n_vocab == 51865

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        """
        # Copy so the caller's dict is never mutated.
        cache = {**cache} if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            # Store as-is on the first call, or when the output is longer than the
            # text context (cross-attention); otherwise append along the time axis.
            if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]:
                cache[module] = output  # save as-is, for the first token or cross attention
            else:
                cache[module] = torch.cat([cache[module], output], dim=1).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            # Only the key/value projections of attention modules are hooked.
            if isinstance(layer, MultiHeadAttention):
                hooks.append(layer.key.register_forward_hook(save_to_cache))
                hooks.append(layer.value.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks

    # Module-level functions bound as methods (defined in sibling modules).
    detect_language = detect_language_function
    transcribe = transcribe_function
    decode = decode_function

View File

@ -1,2 +0,0 @@
from .basic import BasicTextNormalizer
from .english import EnglishTextNormalizer

View File

@ -1,71 +0,0 @@
import re
import unicodedata
import regex
# non-ASCII letters that are not separated by "NFKD" normalization
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",  # capital sharp s (U+1E9E); the key had been lost to an encoding error
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    """

    def _clean_char(c: str) -> str:
        if c in keep:
            return c
        if c in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[c]
        category = unicodedata.category(c)
        if category == "Mn":
            return ""  # combining mark: drop the diacritic entirely
        if category[0] in "MSP":
            return " "  # other marks, symbols, punctuation become spaces
        return c

    return "".join(_clean_char(c) for c in unicodedata.normalize("NFKD", s))
def remove_symbols(s: str):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    normalized = unicodedata.normalize("NFKC", s)
    # Marks, symbols and punctuation (major categories M, S, P) become spaces.
    cleaned = [" " if unicodedata.category(c)[0] in "MSP" else c for c in normalized]
    return "".join(cleaned)
class BasicTextNormalizer:
    """Lowercase, strip bracketed spans, remove symbols (optionally diacritics), squeeze whitespace."""

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Pick the cleaning function once, up front.
        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        lowered = s.lower()
        without_brackets = re.sub(r"[<\[][^>\]]*[>\]]", "", lowered)  # remove words between brackets
        without_parens = re.sub(r"\(([^)]+?)\)", "", without_brackets)  # remove words between parenthesis
        cleaned = self.clean(without_parens).lower()

        if self.split_letters:
            # Split into extended grapheme clusters separated by spaces.
            cleaned = " ".join(regex.findall(r"\X", cleaned, regex.U))

        return re.sub(r"\s+", " ", cleaned)  # replace any successive whitespace characters with a space

File diff suppressed because it is too large Load Diff

View File

@ -1,543 +0,0 @@
import json
import os
import re
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union
from more_itertools import windowed
from .basic import remove_symbols_and_diacritics
class EnglishNumberNormalizer:
    """
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    """

    def __init__(self):
        super().__init__()

        self.zeros = {"o", "oh", "zero"}
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one",
                    "two",
                    "three",
                    "four",
                    "five",
                    "six",
                    "seven",
                    "eight",
                    "nine",
                    "ten",
                    "eleven",
                    "twelve",
                    "thirteen",
                    "fourteen",
                    "fifteen",
                    "sixteen",
                    "seventeen",
                    "eighteen",
                    "nineteen",
                ],
                start=1,
            )
        }
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            "twelfth": (12, "th"),
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",  # the "€" symbols had been lost to an encoding error
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        self.prefixes = set(
            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
        )
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        self.specials = {"and", "double", "triple", "point"}

        self.words = set(
            [
                key
                for mapping in [
                    self.zeros,
                    self.ones,
                    self.ones_suffixed,
                    self.tens,
                    self.tens_suffixed,
                    self.multipliers,
                    self.multipliers_suffixed,
                    self.preceding_prefixers,
                    self.following_prefixers,
                    self.suffixers,
                    self.specials,
                ]
                for key in mapping
            ]
        )
        self.literal_words = {"one", "ones"}

    def process_words(self, words: List[str]) -> Iterator[str]:
        """Token-by-token state machine over `words`, yielding normalized tokens."""
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False

        def to_fraction(s: str):
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # Flush the accumulated value, applying any pending sign/currency prefix.
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        if len(words) == 0:
            return

        for prev, current, next in windowed([None] + words + [None], 3):
            if skip:
                skip = False
                continue

            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if next in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        if next in suffix:
                            yield output(str(value) + suffix[next])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    if next in self.ones or next in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    if next in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        if value is not None:
            yield output(value)

    def preprocess(self, s: str):
        """Rewrite "and a half" phrases and fix number/letter boundaries before tokenizing."""
        # replace "<number> and a half" with "<number> point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        for i, segment in enumerate(segments):
            if len(segment.strip()) == 0:
                continue
            if i == len(segments) - 1:
                results.append(segment)
            else:
                results.append(segment)
                last_word = segment.rsplit(maxsplit=2)[-1]
                if last_word in self.decimals or last_word in self.multipliers:
                    results.append("point five")
                else:
                    results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str):
        """Merge currency amounts with cents and restore literal "one(s)"."""

        def combine_cents(m: Match):
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                return m.string

        def extract_cents(m: Match):
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.string

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str):
        """Run the full preprocess -> process_words -> postprocess pipeline."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)
        return s
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self):
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # Use a context manager so the file handle is closed deterministically
        # (the previous json.load(open(...)) leaked it), and pin the encoding.
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)

    def __call__(self, s: str):
        # Word-by-word replacement; unknown words pass through unchanged.
        return " ".join(self.mapping.get(word, word) for word in s.split())
class EnglishTextNormalizer:
    """English normalization pipeline: drops filler words, expands contractions
    and abbreviated titles, normalizes spelled-out numbers and British
    spellings, and collapses whitespace."""

    def __init__(self):
        # filler words removed entirely from the text
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # regex pattern -> replacement; applied in insertion order in __call__
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # contractions in titles/prefixes
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # prefect tenses, ideally it should be any past participles, but it's harder..
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()

    def __call__(self, s: str):
        """Normalize `s`; the order of the substitutions below is significant."""
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
        s = re.sub(r"\s+'", "'", s)  # standardize when there's a space before an apostrophe

        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep some symbols for numerics

        s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        return s

View File

@ -1,331 +0,0 @@
import os
from dataclasses import dataclass
from functools import lru_cache
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from transformers import GPT2TokenizerFast
# Mapping of language code to its lowercase English name.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
# language code lookup by name, with a few language aliases
# (inverts LANGUAGES, then adds alternate names that map to the same codes)
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
@dataclass(frozen=True)
class Tokenizer:
"""A thin wrapper around `GPT2TokenizerFast` providing quick access to special tokens"""
tokenizer: "GPT2TokenizerFast"
language: Optional[str]
sot_sequence: Tuple[int]
def encode(self, text, **kwargs):
return self.tokenizer.encode(text, **kwargs)
def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs):
return self.tokenizer.decode(token_ids, **kwargs)
def decode_with_timestamps(self, tokens) -> str:
"""
Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
"""
outputs = [[]]
for token in tokens:
if token >= self.timestamp_begin:
timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
outputs.append(timestamp)
outputs.append([])
else:
outputs[-1].append(token)
outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
return "".join(outputs)
@property
@lru_cache()
def eot(self) -> int:
return self.tokenizer.eos_token_id
@property
@lru_cache()
def sot(self) -> int:
return self._get_single_token_id("<|startoftranscript|>")
@property
@lru_cache()
def sot_lm(self) -> int:
return self._get_single_token_id("<|startoflm|>")
@property
@lru_cache()
def sot_prev(self) -> int:
return self._get_single_token_id("<|startofprev|>")
@property
@lru_cache()
def no_speech(self) -> int:
return self._get_single_token_id("<|nospeech|>")
@property
@lru_cache()
def no_timestamps(self) -> int:
return self._get_single_token_id("<|notimestamps|>")
@property
@lru_cache()
def timestamp_begin(self) -> int:
return self.tokenizer.all_special_ids[-1] + 1
@property
@lru_cache()
def language_token(self) -> int:
"""Returns the token id corresponding to the value of the `language` field"""
if self.language is None:
raise ValueError(f"This tokenizer does not have language token configured")
additional_tokens = dict(
zip(
self.tokenizer.additional_special_tokens,
self.tokenizer.additional_special_tokens_ids,
)
)
candidate = f"<|{self.language}|>"
if candidate in additional_tokens:
return additional_tokens[candidate]
raise KeyError(f"Language {self.language} not found in tokenizer.")
@property
@lru_cache()
def all_language_tokens(self) -> Tuple[int]:
    # Ids of every additional special token whose "<|xx|>" code is a known
    # language in the module-level LANGUAGES table.
    result = []
    for token, token_id in zip(
        self.tokenizer.additional_special_tokens,
        self.tokenizer.additional_special_tokens_ids,
    ):
        if token.strip("<|>") in LANGUAGES:
            result.append(token_id)
    return tuple(result)
@property
@lru_cache()
def all_language_codes(self) -> Tuple[str]:
    # Decode each language token back to its bare code, e.g. "<|en|>" -> "en".
    return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)

@property
@lru_cache()
def sot_sequence_including_notimestamps(self) -> Tuple[int]:
    # The configured SOT prompt with <|notimestamps|> appended
    # (self.sot_sequence is set at construction — defined outside this view).
    return tuple(list(self.sot_sequence) + [self.no_timestamps])
@property
@lru_cache()
def non_speech_tokens(self) -> Tuple[int]:
    """
    Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
    annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
    - ♪♪♪
    - ( SPEAKING FOREIGN LANGUAGE )
    - [DAVID] Hey there,
    keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
    """
    symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
    symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()

    # symbols that may be a single token or multiple tokens depending on the tokenizer.
    # In case they're multiple tokens, suppress the first token, which is safe because:
    # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
    # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
    miscellaneous = set("♩♪♫♬♭♮♯")
    assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

    # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
    result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
    for symbol in symbols + list(miscellaneous):
        # Try the symbol both with and without a leading space: BPE assigns
        # different ids to word-initial vs. mid-word occurrences.
        for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
            if len(tokens) == 1 or symbol in miscellaneous:
                result.add(tokens[0])

    return tuple(sorted(result))
def _get_single_token_id(self, text) -> int:
    """Encode *text* and return its id, asserting it maps to exactly one token."""
    ids = self.tokenizer.encode(text)
    assert len(ids) == 1, f"{text} is not encoded as a single token"
    return ids[0]
@lru_cache(maxsize=None)
def build_tokenizer(name: str = "gpt2"):
    """Load the GPT-2 fast tokenizer bundled under assets/<name> and register
    Whisper's special tokens (SOT, one "<|xx|>" per language, task and
    timestamp-control tokens). Cached per *name* for the process lifetime."""
    # Side effect: disables HF tokenizers parallelism process-wide (avoids
    # fork-related warnings/deadlocks in the tokenizers library).
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    path = os.path.join(os.path.dirname(__file__), "assets", name)
    tokenizer = GPT2TokenizerFast.from_pretrained(path)

    # Order matters: downstream code indexes language tokens as sot + 1 + i.
    specials = [
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
    ]

    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
    return tokenizer
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
    language: Optional[str] = None,
) -> Tokenizer:
    """Build a cached Tokenizer for the given configuration.

    For the monolingual (gpt2) tokenizer, task and language are forced to None;
    for the multilingual one they default to "transcribe"/"en".

    Raises:
        ValueError: if *language* is neither a known code nor a known name.
    """
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            # Accept full language names ("french") as well as codes ("fr").
            if language in TO_LANGUAGE_CODE:
                language = TO_LANGUAGE_CODE[language]
            else:
                raise ValueError(f"Unsupported language: {language}")

    if multilingual:
        tokenizer_name = "multilingual"
        task = task or "transcribe"
        language = language or "en"
    else:
        tokenizer_name = "gpt2"
        task = None
        language = None

    tokenizer = build_tokenizer(name=tokenizer_name)
    all_special_ids: List[int] = tokenizer.all_special_ids
    # Index arithmetic assumes HF appends the additional special tokens in the
    # order given in build_tokenizer, after the base EOS — TODO confirm:
    # [1] = <|startoftranscript|>, [-6] = <|translate|>, [-5] = <|transcribe|>.
    sot: int = all_special_ids[1]
    translate: int = all_special_ids[-6]
    transcribe: int = all_special_ids[-5]

    langs = tuple(LANGUAGES.keys())
    sot_sequence = [sot]
    if language is not None:
        # Language tokens are contiguous right after SOT (see specials order).
        sot_sequence.append(sot + 1 + langs.index(language))
    if task is not None:
        sot_sequence.append(transcribe if task == "transcribe" else translate)

    return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(sot_sequence))

File diff suppressed because it is too large Load Diff

69
whisperx/types.py Normal file
View File

@ -0,0 +1,69 @@
from typing import TypedDict, Optional, List, Tuple
class SingleWordSegment(TypedDict):
    """
    A single word of a speech.
    """
    word: str    # the word text
    start: float  # start time in seconds
    end: float    # end time in seconds
    score: float  # alignment confidence score — presumably in [0, 1]; confirm with aligner
class SingleCharSegment(TypedDict):
    """
    A single char of a speech.
    """
    char: str     # the character
    start: float  # start time in seconds
    end: float    # end time in seconds
    score: float  # alignment confidence score — presumably in [0, 1]; confirm with aligner
class SingleSegment(TypedDict):
    """
    A single segment (up to multiple sentences) of a speech.
    """
    start: float  # segment start time in seconds
    end: float    # segment end time in seconds
    text: str     # transcribed text of the segment
class SegmentData(TypedDict):
    """
    Temporary processing data used during alignment.
    Contains cleaned and preprocessed data for each segment.
    """
    clean_char: List[str]  # Cleaned characters that exist in model dictionary
    clean_cdx: List[int]   # Original indices of cleaned characters
    clean_wdx: List[int]   # Indices of words containing valid characters
    sentence_spans: List[Tuple[int, int]]  # Start and end indices of sentences
class SingleAlignedSegment(TypedDict):
    """
    A single segment (up to multiple sentences) of a speech with word alignment.
    """
    start: float  # segment start time in seconds
    end: float    # segment end time in seconds
    text: str     # transcribed text of the segment
    words: List[SingleWordSegment]           # per-word alignment results
    chars: Optional[List[SingleCharSegment]]  # per-char results, when char alignment was requested
class TranscriptionResult(TypedDict):
    """
    Result of transcription: the segments plus the detected/requested language.
    """
    segments: List[SingleSegment]  # unaligned transcription segments
    language: str                  # language code, e.g. "en"
class AlignedTranscriptionResult(TypedDict):
    """
    Result of alignment: aligned segments plus a flat list of all word segments.
    """
    segments: List[SingleAlignedSegment]   # segments with word (and optional char) timings
    word_segments: List[SingleWordSegment]  # all words across segments, flattened

View File

@ -1,7 +1,146 @@
import json
import os
import re
import sys
import zlib
from typing import Callable, TextIO, Iterator, Tuple
import pandas as pd
from typing import Callable, Optional, TextIO
# Whisper's supported languages: ISO 639-1 (or 639-3 for e.g. "haw", "yue")
# code -> lowercase English name. Order matters: token ids for languages are
# assigned in this iteration order (see build_tokenizer / get_tokenizer).
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

# Languages written without spaces between words; subtitle writers join their
# words with "" instead of " ".
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
system_encoding = sys.getdefaultencoding()

# Pick a make_safe implementation once, at import time, based on the system
# default encoding (e.g. cp1252 consoles on Windows).
if system_encoding != "utf-8":

    def make_safe(string):
        # replaces any character not representable using the system default encoding with an '?',
        # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
        return string.encode(system_encoding, errors="replace").decode(system_encoding)

else:

    def make_safe(string):
        # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
        return string
def exact_div(x, y):
assert x % y == 0
@ -29,7 +168,9 @@ def compression_ratio(text) -> float:
return len(text_bytes) / len(zlib.compress(text_bytes))
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
def format_timestamp(
seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
assert seconds >= 0, "non-negative timestamp expected"
milliseconds = round(seconds * 1000.0)
@ -43,212 +184,259 @@ def format_timestamp(seconds: float, always_include_hours: bool = False, decimal
milliseconds -= seconds * 1_000
hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
return (
f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
)
def write_txt(transcript: Iterator[dict], file: TextIO):
for segment in transcript:
print(segment['text'].strip(), file=file, flush=True)
class ResultWriter:
extension: str
def __init__(self, output_dir: str):
self.output_dir = output_dir
def write_vtt(transcript: Iterator[dict], file: TextIO):
print("WEBVTT\n", file=file)
for segment in transcript:
print(
f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
f"{segment['text'].strip().replace('-->', '->')}\n",
file=file,
flush=True,
def __call__(self, result: dict, audio_path: str, options: dict):
audio_basename = os.path.basename(audio_path)
audio_basename = os.path.splitext(audio_basename)[0]
output_path = os.path.join(
self.output_dir, audio_basename + "." + self.extension
)
def write_tsv(transcript: Iterator[dict], file: TextIO):
print("start", "end", "text", sep="\t", file=file)
for segment in transcript:
print(round(1000 * segment['start']), file=file, end="\t")
print(round(1000 * segment['end']), file=file, end="\t")
print(segment['text'].strip().replace("\t", " "), file=file, flush=True)
with open(output_path, "w", encoding="utf-8") as f:
self.write_result(result, file=f, options=options)
def write_result(self, result: dict, file: TextIO, options: dict):
raise NotImplementedError
def write_srt(transcript: Iterator[dict], file: TextIO):
"""
Write a transcript to a file in SRT format.
class WriteTXT(ResultWriter):
extension: str = "txt"
Example usage:
from pathlib import Path
from whisper.utils import write_srt
result = transcribe(model, audio_path, temperature=temperature, **args)
# save SRT
audio_basename = Path(audio_path).stem
with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
write_srt(result["segments"], file=srt)
"""
for i, segment in enumerate(transcript, start=1):
# write srt lines
print(
f"{i}\n"
f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
f"{segment['text'].strip().replace('-->', '->')}\n",
file=file,
flush=True,
)
def write_ass(transcript: Iterator[dict],
file: TextIO,
resolution: str = "word",
color: str = None, underline=True,
prefmt: str = None, suffmt: str = None,
font: str = None, font_size: int = 24,
strip=True, **kwargs):
"""
Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
Generate Advanced SubStation Alpha (ass) file from results to
display both phrase-level & word-level timestamp simultaneously by:
-using segment-level timestamps display phrases as usual
-using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
Note: ass file is used in the same way as srt, vtt, etc.
Parameters
----------
transcript: dict
results from modified model
file: TextIO
file object to write to
resolution: str
"word" or "char", timestamp resolution to highlight.
color: str
color code for a word at its corresponding timestamp
<bbggrr> reverse order hexadecimal RGB value (e.g. FF0000 is full intensity blue. Default: 00FF00)
underline: bool
whether to underline a word at its corresponding timestamp
prefmt: str
used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline')
appears as such in the .ass file:
Hi, {<prefmt>}how{<suffmt>} are you?
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
suffmt: str
used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline')
appears as such in the .ass file:
Hi, {<prefmt>}how{<suffmt>} are you?
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
font: str
word font (default: Arial)
font_size: int
word font size (default: 48)
kwargs:
used for format styles:
'Name', 'Fontname', 'Fontsize', 'PrimaryColour', 'SecondaryColour', 'OutlineColour', 'BackColour', 'Bold',
'Italic', 'Underline', 'StrikeOut', 'ScaleX', 'ScaleY', 'Spacing', 'Angle', 'BorderStyle', 'Outline',
'Shadow', 'Alignment', 'MarginL', 'MarginR', 'MarginV', 'Encoding'
"""
fmt_style_dict = {'Name': 'Default', 'Fontname': 'Arial', 'Fontsize': '48', 'PrimaryColour': '&Hffffff',
'SecondaryColour': '&Hffffff', 'OutlineColour': '&H0', 'BackColour': '&H0', 'Bold': '0',
'Italic': '0', 'Underline': '0', 'StrikeOut': '0', 'ScaleX': '100', 'ScaleY': '100',
'Spacing': '0', 'Angle': '0', 'BorderStyle': '1', 'Outline': '1', 'Shadow': '0',
'Alignment': '2', 'MarginL': '10', 'MarginR': '10', 'MarginV': '10', 'Encoding': '0'}
for k, v in filter(lambda x: 'colour' in x[0].lower() and not str(x[1]).startswith('&H'), kwargs.items()):
kwargs[k] = f'&H{kwargs[k]}'
fmt_style_dict.update((k, v) for k, v in kwargs.items() if k in fmt_style_dict)
if font:
fmt_style_dict.update(Fontname=font)
if font_size:
fmt_style_dict.update(Fontsize=font_size)
fmts = f'Format: {", ".join(map(str, fmt_style_dict.keys()))}'
styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
if prefmt or suffmt:
if suffmt:
assert prefmt, 'prefmt must be used along with suffmt'
else:
suffmt = r'\r'
else:
if not color:
color = 'HFF00'
underline_code = r'\u1' if underline else ''
prefmt = r'{\1c&' + f'{color.upper()}&{underline_code}' + '}'
suffmt = r'{\r}'
def secs_to_hhmmss(secs: Tuple[float, int]):
mm, ss = divmod(secs, 60)
hh, mm = divmod(mm, 60)
return f'{hh:0>1.0f}:{mm:0>2.0f}:{ss:0>2.2f}'
def dialogue(chars: str, start: float, end: float, idx_0: int, idx_1: int) -> str:
if idx_0 == -1:
text = chars
else:
text = f'{chars[:idx_0]}{prefmt}{chars[idx_0:idx_1]}{suffmt}{chars[idx_1:]}'
return f"Dialogue: 0,{secs_to_hhmmss(start)},{secs_to_hhmmss(end)}," \
f"Default,,0,0,0,,{text.strip() if strip else text}"
if resolution == "word":
resolution_key = "word-segments"
elif resolution == "char":
resolution_key = "char-segments"
else:
raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution)
ass_arr = []
for segment in transcript:
if resolution_key in segment:
res_segs = pd.DataFrame(segment[resolution_key])
prev = segment['start']
if "speaker" in segment:
speaker_str = f"[{segment['speaker']}]: "
def write_result(self, result: dict, file: TextIO, options: dict):
for segment in result["segments"]:
speaker = segment.get("speaker")
text = segment["text"].strip()
if speaker is not None:
print(f"[{speaker}]: {text}", file=file, flush=True)
else:
speaker_str = ""
for cdx, crow in res_segs.iterrows():
if crow['start'] is not None:
if resolution == "char":
idx_0 = cdx
idx_1 = cdx + 1
elif resolution == "word":
idx_0 = int(crow["segment-text-start"])
idx_1 = int(crow["segment-text-end"])
# fill gap
if crow['start'] > prev:
filler_ts = {
"chars": speaker_str + segment['text'],
"start": prev,
"end": crow['start'],
"idx_0": -1,
"idx_1": -1
}
print(text, file=file, flush=True)
ass_arr.append(filler_ts)
# highlight current word
f_word_ts = {
"chars": speaker_str + segment['text'],
"start": crow['start'],
"end": crow['end'],
"idx_0": idx_0 + len(speaker_str),
"idx_1": idx_1 + len(speaker_str)
}
ass_arr.append(f_word_ts)
prev = crow['end']
ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr))
class SubtitlesWriter(ResultWriter):
always_include_hours: bool
decimal_marker: str
file.write(ass_str)
def iterate_result(self, result: dict, options: dict):
    """Yield (start, end, text) subtitle cues for *result*.

    Honors options["max_line_width"], ["max_line_count"] and
    ["highlight_words"]. When word timings are present, cues are re-flowed
    from words; otherwise one cue per segment is emitted.
    """
    raw_max_line_width: Optional[int] = options["max_line_width"]
    max_line_count: Optional[int] = options["max_line_count"]
    highlight_words: bool = options["highlight_words"]
    # 1000 acts as "effectively unlimited" line width.
    max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
    # Original segment boundaries are kept unless BOTH limits are given.
    preserve_segments = max_line_count is None or raw_max_line_width is None

    if len(result["segments"]) == 0:
        return

    def iterate_subtitles():
        # Re-flow words into subtitles, breaking on width, line count,
        # long pauses (> 3 s) and (optionally) original segment boundaries.
        line_len = 0
        line_count = 1
        # the next subtitle to yield (a list of word timings with whitespace)
        subtitle: list[dict] = []
        times: list[tuple] = []
        last = result["segments"][0]["start"]
        for segment in result["segments"]:
            for i, original_timing in enumerate(segment["words"]):
                timing = original_timing.copy()
                long_pause = not preserve_segments
                if "start" in timing:
                    long_pause = long_pause and timing["start"] - last > 3.0
                else:
                    long_pause = False
                has_room = line_len + len(timing["word"]) <= max_line_width
                seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
                if line_len > 0 and has_room and not long_pause and not seg_break:
                    # line continuation
                    line_len += len(timing["word"])
                else:
                    # new line
                    timing["word"] = timing["word"].strip()
                    if (
                        len(subtitle) > 0
                        and max_line_count is not None
                        and (long_pause or line_count >= max_line_count)
                        or seg_break
                    ):
                        # subtitle break
                        yield subtitle, times
                        subtitle = []
                        times = []
                        line_count = 1
                    elif line_len > 0:
                        # line break
                        line_count += 1
                        timing["word"] = "\n" + timing["word"]
                    line_len = len(timing["word"].strip())
                subtitle.append(timing)
                times.append((segment["start"], segment["end"], segment.get("speaker")))
                if "start" in timing:
                    last = timing["start"]
        if len(subtitle) > 0:
            yield subtitle, times

    if "words" in result["segments"][0]:
        for subtitle, _ in iterate_subtitles():
            # Cue timing comes from the first word's parent segment.
            sstart, ssend, speaker = _[0]
            subtitle_start = self.format_timestamp(sstart)
            subtitle_end = self.format_timestamp(ssend)
            if result["language"] in LANGUAGES_WITHOUT_SPACES:
                subtitle_text = "".join([word["word"] for word in subtitle])
            else:
                subtitle_text = " ".join([word["word"] for word in subtitle])
            has_timing = any(["start" in word for word in subtitle])

            # add [$SPEAKER_ID]: to each subtitle if speaker is available
            prefix = ""
            if speaker is not None:
                prefix = f"[{speaker}]: "

            if highlight_words and has_timing:
                # Emit one cue per word, underlining the current word; also
                # emit a filler cue when there is a gap before the word.
                last = subtitle_start
                all_words = [timing["word"] for timing in subtitle]
                for i, this_word in enumerate(subtitle):
                    if "start" in this_word:
                        start = self.format_timestamp(this_word["start"])
                        end = self.format_timestamp(this_word["end"])
                        if last != start:
                            yield last, start, prefix + subtitle_text

                        yield start, end, prefix + " ".join(
                            [
                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
                                if j == i
                                else word
                                for j, word in enumerate(all_words)
                            ]
                        )
                        last = end
            else:
                yield subtitle_start, subtitle_end, prefix + subtitle_text
    else:
        # No word timings: one cue per segment.
        for segment in result["segments"]:
            segment_start = self.format_timestamp(segment["start"])
            segment_end = self.format_timestamp(segment["end"])
            segment_text = segment["text"].strip().replace("-->", "->")
            if "speaker" in segment:
                segment_text = f"[{segment['speaker']}]: {segment_text}"
            yield segment_start, segment_end, segment_text
def format_timestamp(self, seconds: float):
    # Delegate to the module-level format_timestamp using this writer's
    # per-format settings (hours handling and decimal marker).
    return format_timestamp(
        seconds=seconds,
        always_include_hours=self.always_include_hours,
        decimal_marker=self.decimal_marker,
    )
class WriteVTT(SubtitlesWriter):
    """Write cues as a WebVTT subtitle file."""

    extension: str = "vtt"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(self, result: dict, file: TextIO, options: dict):
        # WebVTT files start with a mandatory "WEBVTT" header line.
        print("WEBVTT\n", file=file)
        for begin, finish, caption in self.iterate_result(result, options):
            print(f"{begin} --> {finish}\n{caption}\n", file=file, flush=True)
class WriteSRT(SubtitlesWriter):
    """Write cues as a SubRip (.srt) subtitle file."""

    extension: str = "srt"
    always_include_hours: bool = True
    decimal_marker: str = ","

    def write_result(self, result: dict, file: TextIO, options: dict):
        # SRT cues are numbered starting at 1.
        cues = enumerate(self.iterate_result(result, options), start=1)
        for index, (begin, finish, caption) in cues:
            print(f"{index}\n{begin} --> {finish}\n{caption}\n", file=file, flush=True)
class WriteTSV(ResultWriter):
    """
    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>

    Using integer milliseconds as start and end times means there's no chance of interference from
    an environment setting a language encoding that causes the decimal in a floating point number
    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(self, result: dict, file: TextIO, options: dict):
        print("start", "end", "text", sep="\t", file=file)
        for seg in result["segments"]:
            begin_ms = round(1000 * seg["start"])
            end_ms = round(1000 * seg["end"])
            # Tabs inside the text would corrupt the column layout.
            caption = seg["text"].strip().replace("\t", " ")
            print(f"{begin_ms}\t{end_ms}\t{caption}", file=file, flush=True)
class WriteAudacity(ResultWriter):
    """
    Write a transcript to a text file that audacity can import as labels.
    The extension used is "aud" to distinguish it from the txt file produced by WriteTXT.
    Yet this is not an audacity project but only a label file!

    Please note : Audacity uses seconds in timestamps not ms!
    Also there is no header expected.

    If speaker is provided it is prepended to the text between double square brackets [[]].
    """

    extension: str = "aud"

    def write_result(self, result: dict, file: TextIO, options: dict):
        # Audacity label files are TAB-separated: <start>\t<end>\t<text>.
        # A plain space separator breaks Audacity's label import.
        ARROW = "\t"
        for segment in result["segments"]:
            print(segment["start"], file=file, end=ARROW)
            print(segment["end"], file=file, end=ARROW)
            label = segment["text"].strip().replace("\t", " ")
            if "speaker" in segment:
                label = "[[" + segment["speaker"] + "]]" + label
            print(label, file=file, flush=True)
class WriteJSON(ResultWriter):
    """Serialize the complete result dict as a single JSON document."""

    extension: str = "json"

    def write_result(self, result: dict, file: TextIO, options: dict):
        # ensure_ascii=False keeps non-Latin transcripts human-readable.
        file.write(json.dumps(result, ensure_ascii=False))
def get_writer(
    output_format: str, output_dir: str
) -> Callable[[dict, str, dict], None]:
    """Return a writer callable for *output_format* that writes into *output_dir*.

    "all" returns an aggregate that runs every standard writer (the optional
    "aud" format is deliberately excluded from the aggregate).
    """
    writers = {
        "txt": WriteTXT,
        "vtt": WriteVTT,
        "srt": WriteSRT,
        "tsv": WriteTSV,
        "json": WriteJSON,
    }
    optional_writers = {
        "aud": WriteAudacity,
    }

    if output_format == "all":
        instances = [writer_cls(output_dir) for writer_cls in writers.values()]

        def write_all(result: dict, file: str, options: dict):
            # Fan the same result out to every standard writer.
            for writer in instances:
                writer(result, file, options)

        return write_all

    return {**writers, **optional_writers}[output_format](output_dir)
def interpolate_nans(x, method='nearest'):
    """Fill NaNs in a pandas Series.

    When at least two non-null values exist, interpolate with *method* and
    forward/back-fill any remaining edge NaNs; otherwise (0 or 1 known value)
    just forward/back-fill.
    """
    if x.notnull().sum() > 1:
        return x.interpolate(method=method).ffill().bfill()
    # Removed a duplicated, unreachable `return` left over from a bad merge.
    return x.ffill().bfill()

View File

@ -0,0 +1,3 @@
from whisperx.vads.pyannote import Pyannote as Pyannote
from whisperx.vads.silero import Silero as Silero
from whisperx.vads.vad import Vad as Vad

263
whisperx/vads/pyannote.py Normal file
View File

@ -0,0 +1,263 @@
import os
from typing import Callable, Text, Union
from typing import Optional
import numpy as np
import torch
from pyannote.audio import Model
from pyannote.audio.core.io import AudioFile
from pyannote.audio.pipelines import VoiceActivityDetection
from pyannote.audio.pipelines.utils import PipelineModel
from pyannote.core import Annotation, SlidingWindowFeature
from pyannote.core import Segment
from whisperx.diarize import Segment as SegmentX
from whisperx.vads.vad import Vad
def load_vad_model(device, vad_onset=0.500, vad_offset=0.363, use_auth_token=None, model_fp=None):
    """Load the pyannote segmentation checkpoint and return an instantiated
    VoiceActivitySegmentation pipeline on *device*.

    Args:
        device: torch device spec for the pipeline.
        vad_onset / vad_offset: hysteresis thresholds for speech on/offset.
        use_auth_token: HF auth token forwarded to Model.from_pretrained.
        model_fp: optional checkpoint path; defaults to assets/pytorch_model.bin.

    Raises:
        FileNotFoundError: if the checkpoint does not exist.
        RuntimeError: if the path exists but is not a regular file.
    """
    model_dir = torch.hub._get_torch_home()
    main_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.makedirs(model_dir, exist_ok=True)
    if model_fp is None:
        # Dynamically resolve the path to the bundled model file.
        model_fp = os.path.join(main_dir, "assets", "pytorch_model.bin")
    model_fp = os.path.abspath(model_fp)  # normalize user-provided paths too

    if not os.path.exists(model_fp):
        raise FileNotFoundError(f"Model file not found at {model_fp}")
    if not os.path.isfile(model_fp):
        raise RuntimeError(f"{model_fp} exists and is not a regular file")

    # NOTE: a previous version read the whole file into an unused variable,
    # leaking the file handle; Model.from_pretrained reads the file itself.
    vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token)
    hyperparameters = {
        "onset": vad_onset,
        "offset": vad_offset,
        "min_duration_on": 0.1,
        "min_duration_off": 0.1,
    }
    vad_pipeline = VoiceActivitySegmentation(segmentation=vad_model, device=torch.device(device))
    vad_pipeline.instantiate(hyperparameters)

    return vad_pipeline
class Binarize:
    """Binarize detection scores using hysteresis thresholding, with min-cut operation
    to ensure no segment is longer than max_duration.

    Parameters
    ----------
    onset : float, optional
        Onset threshold. Defaults to 0.5.
    offset : float, optional
        Offset threshold. Defaults to `onset`.
    min_duration_on : float, optional
        Remove active regions shorter than that many seconds. Defaults to 0s.
    min_duration_off : float, optional
        Fill inactive regions shorter than that many seconds. Defaults to 0s.
    pad_onset : float, optional
        Extend active regions by moving their start time by that many seconds.
        Defaults to 0s.
    pad_offset : float, optional
        Extend active regions by moving their end time by that many seconds.
        Defaults to 0s.
    max_duration: float
        The maximum length of an active segment, divides segment at timestamp with lowest score.

    Reference
    ---------
    Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
    RNN-based Voice Activity Detection", InterSpeech 2015.

    Modified by Max Bain to include WhisperX's min-cut operation
    https://arxiv.org/abs/2303.00747

    Pyannote-audio
    """

    def __init__(
        self,
        onset: float = 0.5,
        offset: Optional[float] = None,
        min_duration_on: float = 0.0,
        min_duration_off: float = 0.0,
        pad_onset: float = 0.0,
        pad_offset: float = 0.0,
        max_duration: float = float('inf')
    ):
        super().__init__()
        self.onset = onset
        self.offset = offset or onset  # offset defaults to onset (no hysteresis)
        self.pad_onset = pad_onset
        self.pad_offset = pad_offset
        self.min_duration_on = min_duration_on
        self.min_duration_off = min_duration_off
        self.max_duration = max_duration

    def __call__(self, scores: SlidingWindowFeature) -> Annotation:
        """Binarize detection scores

        Parameters
        ----------
        scores : SlidingWindowFeature
            Detection scores.

        Returns
        -------
        active : Annotation
            Binarized scores.
        """
        num_frames, num_classes = scores.data.shape
        frames = scores.sliding_window
        # One timestamp per frame: the middle of each sliding window position.
        timestamps = [frames[i].middle for i in range(num_frames)]

        # annotation meant to store 'active' regions
        active = Annotation()
        # Binarize each class/track independently (columns of scores.data).
        for k, k_scores in enumerate(scores.data.T):
            label = k if scores.labels is None else scores.labels[k]

            # initial state
            start = timestamps[0]
            is_active = k_scores[0] > self.onset
            curr_scores = [k_scores[0]]
            curr_timestamps = [start]
            t = start
            for t, y in zip(timestamps[1:], k_scores[1:]):
                # currently active
                if is_active:
                    curr_duration = t - start
                    # min-cut: split an over-long region at its lowest score
                    # in the second half of the accumulated window
                    if curr_duration > self.max_duration:
                        search_after = len(curr_scores) // 2
                        # divide segment
                        min_score_div_idx = search_after + np.argmin(curr_scores[search_after:])
                        min_score_t = curr_timestamps[min_score_div_idx]
                        region = Segment(start - self.pad_onset, min_score_t + self.pad_offset)
                        active[region, k] = label
                        start = curr_timestamps[min_score_div_idx]
                        curr_scores = curr_scores[min_score_div_idx + 1:]
                        curr_timestamps = curr_timestamps[min_score_div_idx + 1:]
                    # switching from active to inactive
                    elif y < self.offset:
                        region = Segment(start - self.pad_onset, t + self.pad_offset)
                        active[region, k] = label
                        start = t
                        is_active = False
                        curr_scores = []
                        curr_timestamps = []
                    curr_scores.append(y)
                    curr_timestamps.append(t)
                # currently inactive
                else:
                    # switching from inactive to active
                    if y > self.onset:
                        start = t
                        is_active = True

            # if active at the end, add final region
            if is_active:
                region = Segment(start - self.pad_onset, t + self.pad_offset)
                active[region, k] = label

        # because of padding, some active regions might be overlapping: merge them.
        # also: fill same speaker gaps shorter than min_duration_off
        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
            if self.max_duration < float("inf"):
                raise NotImplementedError(f"This would break current max_duration param")
            active = active.support(collar=self.min_duration_off)

        # remove tracks shorter than min_duration_on
        if self.min_duration_on > 0:
            for segment, track in list(active.itertracks()):
                if segment.duration < self.min_duration_on:
                    del active[segment, track]

        return active
class VoiceActivitySegmentation(VoiceActivityDetection):
    """VAD pipeline that returns raw segmentation scores instead of a binarized
    annotation, so WhisperX can apply its own Binarize (with min-cut)."""

    def __init__(
        self,
        segmentation: PipelineModel = "pyannote/segmentation",
        fscore: bool = False,
        use_auth_token: Union[Text, None] = None,
        **inference_kwargs,
    ):
        super().__init__(segmentation=segmentation, fscore=fscore, use_auth_token=use_auth_token, **inference_kwargs)

    def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
        """Apply voice activity detection

        Parameters
        ----------
        file : AudioFile
            Processed file.
        hook : callable, optional
            Hook called after each major step of the pipeline with the following
            signature: hook("step_name", step_artefact, file=file)

        Returns
        -------
        speech : Annotation
            Speech regions.

        (Despite the annotation above, this returns the raw
        SlidingWindowFeature scores, not an Annotation — callers binarize.)
        """
        # setup hook (e.g. for debugging purposes)
        hook = self.setup_hook(file, hook=hook)

        # apply segmentation model (only if needed)
        # output shape is (num_chunks, num_frames, 1)
        # NOTE(review): caching only when self.training looks inverted for an
        # inference pipeline — confirm against pyannote's base implementation.
        if self.training:
            if self.CACHED_SEGMENTATION in file:
                segmentations = file[self.CACHED_SEGMENTATION]
            else:
                segmentations = self._segmentation(file)
                file[self.CACHED_SEGMENTATION] = segmentations
        else:
            segmentations: SlidingWindowFeature = self._segmentation(file)

        return segmentations
class Pyannote(Vad):
    """Pyannote-backed VAD: wraps load_vad_model's pipeline behind the Vad API."""

    def __init__(self, device, use_auth_token=None, model_fp=None, **kwargs):
        print(">>Performing voice activity detection using Pyannote...")
        # Base class validates vad_onset; other kwargs (e.g. vad_offset) are
        # NOTE(review): currently not forwarded to load_vad_model — confirm intended.
        super().__init__(kwargs['vad_onset'])
        self.vad_pipeline = load_vad_model(device, use_auth_token=use_auth_token, model_fp=model_fp)

    def __call__(self, audio: AudioFile, **kwargs):
        """Run the pipeline; returns raw segmentation scores."""
        return self.vad_pipeline(audio)

    @staticmethod
    def preprocess_audio(audio):
        # pyannote expects a (channel, time) float tensor.
        return torch.from_numpy(audio).unsqueeze(0)

    @staticmethod
    def merge_chunks(segments,
                     chunk_size,
                     onset: float = 0.5,
                     offset: Optional[float] = None,
                     ):
        """Binarize raw scores and merge speech turns into <= chunk_size windows."""
        assert chunk_size > 0
        binarize = Binarize(max_duration=chunk_size, onset=onset, offset=offset)
        segments = binarize(segments)
        segments_list = []
        for speech_turn in segments.get_timeline():
            segments_list.append(SegmentX(speech_turn.start, speech_turn.end, "UNKNOWN"))

        if len(segments_list) == 0:
            print("No active speech found in audio")
            return []
        # (removed an assert that was unreachable after the early return above)
        return Vad.merge_chunks(segments_list, chunk_size, onset, offset)

66
whisperx/vads/silero.py Normal file
View File

@ -0,0 +1,66 @@
from io import IOBase
from pathlib import Path
from typing import Mapping, Text
from typing import Optional
from typing import Union
import torch
from whisperx.diarize import Segment as SegmentX
from whisperx.vads.vad import Vad
AudioFile = Union[Text, Path, IOBase, Mapping]
class Silero(Vad):
    """Silero-backed VAD using the snakers4/silero-vad torch.hub model."""

    # check again default values
    def __init__(self, **kwargs):
        """Expects 'vad_onset' and 'chunk_size' in kwargs."""
        print(">>Performing voice activity detection using Silero...")
        super().__init__(kwargs['vad_onset'])
        self.vad_onset = kwargs['vad_onset']
        self.chunk_size = kwargs['chunk_size']
        # force_reload=False reuses the local hub cache; onnx=False keeps the JIT model.
        self.vad_pipeline, vad_utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                                      model='silero_vad',
                                                      force_reload=False,
                                                      onnx=False,
                                                      trust_repo=True)
        (self.get_speech_timestamps, _, self.read_audio, _, _) = vad_utils

    def __call__(self, audio: AudioFile, **kwargs):
        """use silero to get segments of speech"""
        # Only accept 16000 Hz for now.
        # Note: Silero models support both 8000 and 16000 Hz. Although other values are not directly supported,
        # multiples of 16000 (e.g. 32000 or 48000) are cast to 16000 inside of the JIT model!
        sample_rate = audio["sample_rate"]
        if sample_rate != 16000:
            raise ValueError("Only 16000Hz sample rate is allowed")

        timestamps = self.get_speech_timestamps(audio["waveform"],
                                                model=self.vad_pipeline,
                                                sampling_rate=sample_rate,
                                                max_speech_duration_s=self.chunk_size,
                                                threshold=self.vad_onset
                                                # min_silence_duration_ms = self.min_duration_off/1000
                                                # min_speech_duration_ms = self.min_duration_on/1000
                                                # ...
                                                # See silero documentation for full option list
                                                )
        # Convert sample offsets to seconds; speaker is unknown at VAD stage.
        return [SegmentX(i['start'] / sample_rate, i['end'] / sample_rate, "UNKNOWN") for i in timestamps]

    @staticmethod
    def preprocess_audio(audio):
        # Silero consumes the waveform as-is.
        return audio

    @staticmethod
    def merge_chunks(segments_list,
                     chunk_size,
                     onset: float = 0.5,
                     offset: Optional[float] = None,
                     ):
        """Merge speech segments into <= chunk_size windows (see Vad.merge_chunks)."""
        assert chunk_size > 0
        if len(segments_list) == 0:
            print("No active speech found in audio")
            return []
        # (removed an assert that was unreachable after the early return above)
        return Vad.merge_chunks(segments_list, chunk_size, onset, offset)

74
whisperx/vads/vad.py Normal file
View File

@ -0,0 +1,74 @@
from typing import Optional
import pandas as pd
from pyannote.core import Annotation, Segment
class Vad:
    """Common base class for voice-activity-detection backends.

    Subclasses implement __call__ / preprocess_audio; the chunk-merging
    helpers here are static so they can also be used on a manually
    assigned vad_model (see 'load_model').
    """

    def __init__(self, vad_onset):
        """Validate the speech-detection threshold shared by all backends.

        Raises:
            ValueError: if vad_onset is not strictly between 0 and 1.
        """
        if not (0 < vad_onset < 1):
            raise ValueError(
                "vad_onset must be a decimal value between 0 and 1."
            )

    @staticmethod
    def preprocess_audio(audio):
        # Base class performs no preprocessing; backends override as needed.
        pass

    # keep merge_chunks as static so it can be also used by manually assigned vad_model (see 'load_model')
    @staticmethod
    def merge_chunks(segments,
                     chunk_size,
                     onset: float,
                     offset: Optional[float]):
        """
        Merge operation described in paper

        Greedily packs consecutive speech segments into chunks no longer
        than chunk_size seconds. Each segment must expose .start, .end and
        .speaker attributes.

        Returns:
            List of {"start", "end", "segments"} dicts, where "segments" is
            the list of (start, end) pairs merged into that chunk. Empty
            input yields an empty list.
        """
        # Fixed: guard against empty input — the original indexed
        # segments[0] unconditionally and raised IndexError.
        if not segments:
            return []
        curr_end = 0
        merged_segments = []
        seg_idxs: list[tuple] = []
        # Collected alongside seg_idxs but not included in the output;
        # kept for parity with the original behavior (requires .speaker).
        speaker_idxs: list[Optional[str]] = []

        curr_start = segments[0].start
        for seg in segments:
            # Flush the current chunk when adding this segment would exceed
            # chunk_size and the chunk already contains some speech.
            if seg.end - curr_start > chunk_size and curr_end - curr_start > 0:
                merged_segments.append({
                    "start": curr_start,
                    "end": curr_end,
                    "segments": seg_idxs,
                })
                curr_start = seg.start
                seg_idxs = []
                speaker_idxs = []
            curr_end = seg.end
            seg_idxs.append((seg.start, seg.end))
            speaker_idxs.append(seg.speaker)
        # add final
        merged_segments.append({
            "start": curr_start,
            "end": curr_end,
            "segments": seg_idxs,
        })
        return merged_segments

    # Unused function
    @staticmethod
    def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0):
        """Pad, merge and filter raw (start, end) VAD intervals.

        Returns a pandas DataFrame of the resulting segments.
        """
        active = Annotation()
        for k, vad_t in enumerate(vad_arr):
            region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset)
            active[region, k] = 1

        # Merge padded/overlapping regions, closing gaps up to min_duration_off.
        if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0:
            active = active.support(collar=min_duration_off)

        # remove tracks shorter than min_duration_on
        if min_duration_on > 0:
            for segment, track in list(active.itertracks()):
                if segment.duration < min_duration_on:
                    del active[segment, track]

        active = active.for_json()
        active_segs = pd.DataFrame([x['segment'] for x in active['content']])
        return active_segs