279 Commits

Author SHA1 Message Date
734084cdf6 bump: update version to 3.3.1 2025-01-08 18:00:34 +01:00
9395b0de18 Update tmp.yml 2025-01-08 17:59:28 +01:00
d57f9dc54c Create tmp.yml 2025-01-08 17:59:28 +01:00
a90bd1ce3f dataclasses replace method 2025-01-08 17:59:13 +01:00
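The "dataclasses replace method" commit above refers to Python's stdlib `dataclasses.replace` helper. As a hedged illustration only (the `TranscriptionOptions` fields below are hypothetical stand-ins, not whisperX's actual schema), this is the pattern for deriving a modified copy of a frozen options object:

```python
from dataclasses import dataclass, replace

# Hypothetical stand-in for the TranscriptionOptions dataclass mentioned in
# nearby commits; the real field set in whisperX differs.
@dataclass(frozen=True)
class TranscriptionOptions:
    beam_size: int = 5
    patience: float = 1.0

base = TranscriptionOptions()
# replace() builds a new instance with selected fields overridden,
# leaving the (immutable) original untouched.
tuned = replace(base, beam_size=10)
print(base.beam_size, tuned.beam_size)  # 5 10
```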
10b05fc43f refactor: replace NamedTuple with TranscriptionOptions in FasterWhisperPipeline 2025-01-05 18:56:19 +01:00
26d9b46888 feat: include speaker information in WriteTXT when diarizing 2025-01-05 18:21:34 +01:00
9a8967f27e refactor: add type hints 2025-01-05 11:48:24 +01:00
0f7f9f9f83 refactor: simplify imports for better type inference 2025-01-05 11:48:24 +01:00
c60594fa3b fix: update import statement for conjunctions module 2025-01-05 11:48:24 +01:00
4916192246 chore: bump whisperX to 3.3.0 2025-01-02 14:09:10 +01:00
cbdac53e87 chore: update ctranslate2 version to restrict <4.5.0 2025-01-02 14:09:10 +01:00
940a223219 fix: add UTF-8 encoding when reading README.md 2025-01-02 12:43:59 +01:00
a0eb31019b chore: update license in setup.py 2025-01-02 08:41:04 +01:00
b08ad67a72 docs: update installation instructions in README 2025-01-02 08:35:45 +01:00
c18f9f979b fix: update README image source and enhance setup.py for long description 2025-01-02 08:30:04 +01:00
948b3e368b chore: update gitignore 2025-01-01 18:47:40 +01:00
e9ac5b63bc chore: clean up MANIFEST.in by removing unnecessary asset inclusions 2025-01-01 18:47:40 +01:00
90b45459d9 feat: add build and release workflow 2025-01-01 18:47:40 +01:00
81c4af96a6 feat: add Python compatibility testing workflow
feat: restrict Python versions to 3.9 - 3.12
2025-01-01 15:29:03 +01:00
1c6d9327bc feat: use model_dir as cache_dir for wav2vec2 (#681) 2025-01-01 13:22:27 +01:00
0fdb55d317 feat: add local_files_only option on whisperx.load_model for offline mode (#867)
Adds the parameter local_files_only (default False for consistency) to whisperx.load_model so that the user can avoid downloading the model and instead use the path to the locally cached file if it exists.

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2025-01-01 13:16:45 +01:00
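The behaviour described above, skip the download and fall back to a cached copy, can be sketched in isolation. This is only an illustration of the `local_files_only` contract under assumed names (`resolve_model`, `cache_dir`); it is not whisperX's implementation:

```python
from pathlib import Path

def resolve_model(name: str, cache_dir: str, local_files_only: bool = False) -> Path:
    """Sketch of the local_files_only contract: with the flag set, only a
    locally cached copy may be used and nothing is ever downloaded."""
    cached = Path(cache_dir) / name
    if cached.exists():
        return cached
    if local_files_only:
        raise FileNotFoundError(f"{name} not cached in {cache_dir} and downloads are disabled")
    # Placeholder for the download branch taken when downloads are allowed.
    cached.parent.mkdir(parents=True, exist_ok=True)
    cached.touch()
    return cached
```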
51da22771f feat: add verbose output (#759)
---------

Co-authored-by: Abhishek Sharma <abhishek@zipteams.com>
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2025-01-01 13:07:52 +01:00
15ad5bf7df feat: update versions for pyannote:3.3.2 and faster-whisper:1.1.0 (#936)
* chore: bump faster-whisper to 1.1.0

* chore: bump pyannote to 3.3.2

* feat: add multilingual option in load_model function

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 10:41:09 +01:00
7fdbd21fe3 feat: add support for faster-whisper 1.0.3 (#875)
---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 10:07:42 +01:00
3ff625c561 feat: update faster-whisper to 1.0.2 (#814)
* Update faster-whisper to 1.0.2 to enable model distil-large-v3

* feat: add hotwords option to default_asr_options

---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-31 09:41:22 +01:00
7307306a9d chore: bump version 2024-12-18 09:03:04 +01:00
3027cc32bc Update MANIFEST.in to include necessary files 2024-12-17 08:11:49 +01:00
9e4b1b4c49 fix: Force ctranslate to version 4.4.0
Force ctranslate to version 4.4.0 due to the libcudnn_ops_infer.so.8 issue:
https://github.com/SYSTRAN/faster-whisper/issues/729

Co-authored-by: Icaro Bombonato <ibombonatosites@gmail.com>
2024-12-16 13:30:08 +01:00
9b9e03c4cc feat: update Norwegian models (#687)
Updated Norwegian Bokmål and Norwegian Nynorsk models

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 11:08:48 +01:00
19eff8e79a feat: add new align models (#922)
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 11:06:43 +01:00
6f3bc5b7b8 Added Romanian phoneme-based ASR model (#791)
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2024-12-16 08:09:53 +01:00
9809336db6 Fix link in README.md 2024-12-16 08:04:59 +01:00
a898b3ba94 Remove typo in error message 2024-12-16 08:02:42 +01:00
c141074cbd Merge pull request #945 from m-bain/m-bain/local_model
move model to assets
2024-12-14 22:54:56 -06:00
a9e50ef0af move model to assets 2024-12-14 22:53:53 -06:00
161ae1f7ad Merge pull request #944 from m-bain/m-bain/local_model
local vad model
2024-12-14 22:34:38 -06:00
a83ddbdf9b local vad model 2024-12-14 22:16:43 -06:00
9e3a9e0e38 Merge pull request #852 from jan-panoch/main
Update alignment.py - added alignment for  sk and sl languages
2024-08-20 00:05:56 +08:00
3f339f9515 Update alignment.py - remove commented-out alignment modules for hr language 2024-08-09 13:00:12 +02:00
9a9b6171e6 Update alignment.py - trying another hr alignment 2024-08-08 08:37:55 +02:00
59b4d88d1d Update alignment.py - trying another hr alignment file 2024-08-08 08:29:11 +02:00
6f70aa6beb Update alignment.py - added croatian (hr) language 2024-08-08 08:10:55 +02:00
912920c591 Update alignment.py - added alignment for sk and sl languages 2024-08-07 10:05:17 +02:00
58f00339af BSD 2 LICENSE 2024-07-11 13:01:15 +04:00
f2da2f858e Update README.md 2024-03-20 15:47:18 +00:00
78dcfaab51 upgrade faster-whisper 2024-02-23 09:30:12 +00:00
d6562c26da Merge pull request #716 from cococig/fix/faster-whisper-from-pypi
fix: update faster-whisper dependencies
2024-02-22 16:51:06 +00:00
c313f4dd5c fix: update faster-whisper dependencies 2024-02-23 01:42:22 +09:00
bbaa2f0d1a update kwargs 2024-02-22 15:59:14 +00:00
e906be9688 Merge pull request #703 from victor-upmeet/large-v3-demo
Add Replicate large-v3 demo
2024-02-18 15:43:51 +00:00
fbbd07bece Merge pull request #669 from KossaiSbai/ks/supress-numeral-symbol-tokens-message
Get rid of numeral_symbol_tokens variable in printed message
2024-02-18 15:43:23 +00:00
d8c9196346 Add Replicate large-v3 demo 2024-02-18 12:17:11 +01:00
2686f74bc9 Get rid of numeral_symbol_tokens variable in printed message 2024-01-19 22:25:21 +00:00
8227807fa9 Delete build/lib/whisperx directory 2024-01-02 19:36:36 -07:00
59962a70be Merge pull request #646 from santialferez/diarize-patch-1
Update pyannote to v3.1.1 to fix a diarization problem (and diarize.py)
2024-01-03 02:35:53 +00:00
06e30b2a25 Merge pull request #654 from Swami-Abhinav/provide-custom-load-vad
Added option to load Custom VAD model to load model method
2024-01-01 17:38:30 +00:00
6bb2f1cd48 Added Vad custom option 2024-01-01 14:56:51 +05:30
f8cc46c6f7 Merge pull request #648 from canoalberto/main
Fixes --model_dir path
2023-12-28 21:23:42 +00:00
942c336b8f Fixes --model_dir path 2023-12-27 14:03:54 -05:00
8ae6416594 update setup.py to install pyannote.audio==3.1.1, update diarize.py to include num_speakers; to fix Issue #592 2023-12-26 13:01:49 +01:00
8540ff5985 Merge pull request #636 from NbAiLab/peregilk-patch-1
Adding Norwegian Bokmål and Norwegian Nynorsk
2023-12-19 15:55:20 +00:00
5dfbfcbdc0 Adding Norwegian Bokmål and Norwegian Nynorsk
Adding Wav2Vec2 models for Norwegian Bokmål and Norwegian Nynorsk. The models have been tested together with WhisperX and work well. For Bokmål I have added the 1B model, even though I see fairly little difference between it and the 300M model. For Norwegian Nynorsk only a 300M model exists. The quality of the Wav2Vec2 models is also reported here: https://arxiv.org/abs/2307.01672
2023-12-19 08:48:21 +01:00
1c7b1a87da Merge pull request #630 from mlopsengr/patch-1
Update README.md
2023-12-17 15:53:44 +00:00
9f23739f90 Update README.md
Demonstrates use of argument to save model to local path.
2023-12-15 13:46:32 +00:00
19ab91c5a6 Merge pull request #618 from gillens/main
Update README to correct speaker diarization version link
2023-12-10 17:35:42 -06:00
089cd5ab21 Merge pull request #585 from kurianbenoy/ml-asr
Add alignment model for Malayalam
2023-12-10 17:35:14 -06:00
2b7ab95ad6 Update README to Correct Speaker Diarization Version Link
Currently errors if the user only accepts the terms for the README's version 3.0 link; version 3.1 was introduced in pull request #586.
2023-12-07 12:48:21 -08:00
4553e0d4ed Merge pull request #617 from MahmoudAshraf97/main 2023-12-04 16:15:48 +00:00
f865dfe710 fix typo 2023-12-04 17:38:50 +03:00
4acbdd75be add "yue" to supported languages that was added along with Large-V3 2023-12-04 17:27:54 +03:00
e9c507ce5d Merge pull request #605 from M0HID/patch-1
fix link
2023-11-28 11:56:29 +00:00
a5dca2cc65 Merge pull request #603 from spbisc97/patch-1
pip compliance for git+ installs
2023-11-28 01:24:35 +00:00
8a8eeb33ee Update README.md 2023-11-27 17:15:28 +00:00
b4d7b1a422 pip compliance for git+ installs
Minimal change to let pip install requirements
2023-11-26 18:37:04 +01:00
5a16e59217 Merge pull request #599 from MahmoudAshraf97/main
support for `large-v3`
2023-11-26 12:34:16 +00:00
b4e4143e3b install faster-whisper using git as pypi is not updated anymore 2023-11-25 17:42:36 +00:00
4b05198eed bump faster-whisper to 0.10 2023-11-25 12:11:08 +00:00
71a5281bde support for large-v3 2023-11-25 12:09:00 +00:00
d97cdb7bcf Merge pull request #586 from remic33/main 2023-11-17 10:48:57 +00:00
20161935a1 feat: pass model to 3.1 in code 2023-11-17 11:12:16 +01:00
1d7f8ccbf1 feat: get rid of pyannote versioning and go to 3.1 2023-11-17 11:03:23 +01:00
5756b0fb13 Update alignment.py 2023-11-17 05:21:23 +05:30
aaaa3de810 Update alignment.py 2023-11-17 05:18:19 +05:30
ba30365344 Merge pull request #584 from DougTrajano/patch-1
Move load_model after WhisperModel
2023-11-16 12:09:21 +00:00
bd3aa03b6f Move load_model after WhisperModel 2023-11-16 08:59:28 -03:00
f5c544ff90 Merge pull request #581 from davidmartinrius/catalan_align_model
Add align model for catalan language.
2023-11-16 10:54:24 +00:00
7c2a9a8b7b Merge pull request #580 from kaka1909/main
Update asr.py and make the model parameter be used
2023-11-16 10:54:02 +00:00
9f41c49fe5 Add align model for catalan language. 2023-11-16 11:43:36 +01:00
48d651e5ea Update asr.py and make the model parameter be used 2023-11-16 15:29:24 +08:00
4ece2369d7 Merge pull request #556 from sorgfresser/remove-space-segment-align
no align based on space
2023-11-11 02:03:56 +00:00
52fbe5c26f Merge pull request #570 from hidenori-endo/main
Drop ffmpeg-python dependency and call ffmpeg directly.
2023-11-09 18:39:53 +00:00
6703d2774b Drop ffmpeg-python dependency 2023-11-10 03:26:47 +09:00
a2af569838 Merge pull request #554 from sorgfresser/fix-binarize-unbound
fix unboundlocalerror
2023-11-07 10:54:24 +00:00
0c7f32f55c no align based on space 2023-11-03 19:47:00 +01:00
6936dd6991 default t 2023-11-03 18:50:15 +01:00
6b1100a919 Merge pull request #549 from amolinasalazar/minor_fixes
Minor fixes for word options and subtitles
2023-10-31 12:26:47 -07:00
d4a600b568 REMOVE duplicated code 2023-10-31 18:55:50 +01:00
afd5ef1d58 FIX warnings for word options 2023-10-31 18:55:35 +01:00
dbeb8617f2 Merge pull request #521 from kaihe-stori/update-readme
Add a special note about Speaker-Diarization-3.0 in readme
2023-10-25 11:18:47 -07:00
c6fe379d9e Merge pull request #517 from jkukul/support-language-names-as-parameters
Support language names in `--language` parameter.
2023-10-25 11:16:30 -07:00
e9a6385d3c Merge pull request #541 from justinwlin/main
Update setup.py to download pyannote depending on platform
2023-10-25 11:14:11 -07:00
b522133340 Update setup.py to be adaptive to platform 2023-10-24 18:42:14 -04:00
49e0130e4e Merge pull request #531 from accessful-ai/main 2023-10-17 06:54:22 -07:00
d4ac9531d9 Update setup.py 2023-10-17 15:23:38 +02:00
66808f6147 Merge pull request #529 from MahmoudAshraf97/main 2023-10-16 10:53:18 -07:00
b69956d725 . 2023-10-16 20:43:37 +03:00
a150df4310 Merge pull request #527 from jkukul/pass-beam-size-to-fast-whisper 2023-10-15 07:15:13 -07:00
02c0323777 fix 2023-10-15 16:25:15 +03:00
14a7cab8eb Pass patience and beam_size to faster-whisper. 2023-10-14 13:51:29 +02:00
acf31b754f update readme 2023-10-11 22:56:38 -04:00
4cdce3b927 Merge pull request #518 from characat0/main
fix(diarize): key error on empty track
2023-10-10 12:54:35 -07:00
a5356509b6 fix(diarize): key error on empty track 2023-10-10 14:50:41 -05:00
1001a055db Support language names in --language. 2023-10-10 13:55:47 +02:00
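The `--language` change above accepts full language names as well as ISO codes. A minimal sketch of that normalization follows, with a deliberately tiny mapping; the real table comes from Whisper's language list and is much larger, and the function name here is an assumption:

```python
# Tiny illustrative excerpt; whisperX draws on Whisper's full language table.
LANGUAGES = {"en": "english", "de": "german", "yue": "cantonese"}
TO_CODE = {name: code for code, name in LANGUAGES.items()}

def normalize_language(value: str) -> str:
    value = value.lower()
    if value in LANGUAGES:   # already an ISO-style code
        return value
    if value in TO_CODE:     # a full language name
        return TO_CODE[value]
    raise ValueError(f"unsupported language: {value}")

print(normalize_language("German"))  # de
print(normalize_language("yue"))     # yue
```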
051047bb25 Merge pull request #510 from MahmoudAshraf97/main
fix minimum input length for torch wav2vec2 models
2023-10-05 15:31:08 -07:00
c1b821a08d fix list markdown 2023-10-05 15:14:29 -07:00
78e20a16a8 update links 2023-10-05 15:14:03 -07:00
be07c13f75 read does actually work... 2023-10-05 14:48:39 -07:00
8049dba2f7 fix minimum input length for torch wav2vec2 models 2023-10-06 00:41:23 +03:00
d077abdbdf Merge pull request #509 from valentt/patch-1
Update README.md
2023-10-05 14:13:20 -07:00
84423ca517 Update README.md
Added info that the Hugging Face token has to be a write token, because a read token doesn't work.
2023-10-05 19:14:28 +02:00
a22b8b009b Merge pull request #507 from compasspathways/fix/pass-vad-options
Fix: Allow vad options to be configurable by passing to FasterWhisperPipeline and merge_chunks.
2023-10-05 07:48:19 -07:00
79801167ac Fix: Allow vad options to be configurable by correctly passing down to FasterWhisperPipeline. 2023-10-05 10:06:34 -04:00
07fafa37b3 Merge pull request #494 from mvoggu/main
fix: ZeroDivisionError when --print_progress True
2023-09-27 07:46:06 -07:00
a0b6459c8b fix: ZeroDivisionError when --print_progress True 2023-09-27 20:10:43 +05:30
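A `ZeroDivisionError` like the one fixed above typically comes from computing a progress percentage over an empty work list. The guard below is only a sketch of that class of fix; the function name and signature are assumptions, not the actual patch:

```python
def progress_percent(done: int, total: int) -> float:
    # Guard the degenerate case (e.g. zero segments) instead of dividing by zero.
    if total == 0:
        return 0.0
    return 100.0 * done / total

print(progress_percent(0, 0))  # 0.0 rather than ZeroDivisionError
print(progress_percent(3, 4))  # 75.0
```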
2a11ce3ef0 Merge pull request #487 from piuy11/main
Update alignment.py
2023-09-26 14:17:46 -07:00
18abcf46ee Merge pull request #492 from remic33/pyannote3
Pyannote3
2023-09-26 14:16:57 -07:00
652aa24919 change pyannote version 2023-09-26 23:04:28 +02:00
b17908473d correct 3.0 pyannote weights 2023-09-26 17:18:20 +02:00
f137f31de6 Update alignment.py 2023-09-25 15:33:06 +09:00
e94b904308 Merge pull request #474 from sorgfresser/pin-faster-whisper 2023-09-19 16:53:42 -07:00
ffd6167b26 Merge pull request #473 from sorgfresser/fix-faster-whisper-threads 2023-09-19 16:53:34 -07:00
4c7ce14fed pin faster whisper 2023-09-14 13:19:11 +02:00
0ae0d49d1d add faster whisper threading 2023-09-14 11:47:51 +02:00
b1a98b78c9 Merge pull request #472 from darwintree/main
chore(writer): improve text display(ja etc) in json file
2023-09-10 08:37:39 -06:00
c6d9e6cb67 chore(writer): improve text display(ja etc) in json file 2023-09-10 22:02:47 +08:00
31f5233949 Merge pull request #459 from awerks/main
A solution to long subtitles and words without timestamps
2023-09-06 10:09:27 -06:00
2ca99ce909 A solution to long subtitles
Example usage:
subtitles_processor = SubtitlesProcessor(output["segments"], detected_language, max_line_length=50, min_char_length_splitter=35)
subtitles_processor.save("subtitles.srt", advanced_splitting=True)
2023-09-04 21:49:34 +02:00
15d9e08d3e Merge pull request #458 from remic33/correct_default_asr_options
fix: correct default_asr_options with new options (patch 0.8)
2023-09-04 09:22:16 -06:00
15451d0f1c fix: correct default_asr_options with new options (patch 0.8) 2023-09-04 17:08:19 +02:00
8c4a21b66d Merge pull request #440 from jim60105/main
chore(writer): Join words without spaces for ja, zh
2023-08-29 11:22:30 -06:00
5223de2a41 fix: UnboundLocalError: local variable 'align_language' referenced before assignment 2023-08-30 01:11:09 +08:00
f505702dc7 chore(writer): Join words without spaces for ja, zh
fix #248, fix #310
2023-08-30 01:11:09 +08:00
adf455a97c Merge pull request #445 from jim60105/add-merge-chunk-size-as-argument
feat: Add merge chunks chunk_size as arguments.
2023-08-29 10:05:14 -06:00
9647f60fca Merge branch 'main' into add-merge-chunk-size-as-argument 2023-08-29 10:05:05 -06:00
a8bfac6bef Merge pull request #427 from awerks/main
Update alignment.py
2023-08-29 10:03:46 -06:00
6d414e20e2 Merge pull request #438 from invisprints/fix-speaker-missing
fix missing speaker prefix
2023-08-29 10:03:06 -06:00
3c7b03935b Merge pull request #430 from dotgrid/dotgrid-docs-patch
Document --compute_type command line option
2023-08-29 10:02:51 -06:00
eb771cf56d feat: Add merge chunks chunk_size as arguments.
Suggest from https://github.com/m-bain/whisperX/issues/200#issuecomment-1666507780
2023-08-29 23:09:02 +08:00
cc81ab7db7 fix missing prefix
Fixed the missing speaker part when --highlight_words is enabled
2023-08-25 12:08:16 +08:00
ef965a03ed Merge pull request #431 from CaRniFeXeR/main
adds link to whisperX medium on replicate.com
2023-08-21 17:25:15 +01:00
6f2ff16aad Merge pull request #1 from CaRniFeXeR/CaRniFeXeR-replicate-models
adds link to whisperX medium on replicate and updates replicate badges…
2023-08-21 08:20:25 +08:00
81b12af321 adds link to whisperX medium on replicate and updates replicate badges in README.md 2023-08-21 08:16:46 +08:00
c1197c490e Document --compute_type command line option 2023-08-19 08:19:49 +01:00
4e28492dbd Update alignment.py 2023-08-17 14:57:53 +02:00
6cb7267dc2 Update alignment.py 2023-08-17 14:56:54 +02:00
abbb66b58e Update alignment.py 2023-08-17 14:53:53 +02:00
ea7bb91a56 Update asr.py 2023-08-17 14:49:57 +02:00
d2d840f06c Update utils.py 2023-08-17 14:45:23 +02:00
0a1137e41c Merge pull request #429 from sorgfresser/no-segments-writer
fix writer fail on segments 0
2023-08-17 13:20:38 +01:00
0767597bff fix writer fail on segments 0 2023-08-17 14:18:16 +02:00
cb3ed4ab9d Update transcribe.py 2023-08-16 16:22:29 +02:00
65688208c9 Update alignment.py 2023-08-16 16:18:00 +02:00
72685d0398 Update asr.py 2023-08-16 16:15:24 +02:00
1bb4839b0f Update alignment.py 2023-08-16 16:13:28 +02:00
4acb5b3abc Update asr.py 2023-08-16 16:11:46 +02:00
14e593f60b Update alignment.py 2023-08-16 16:08:25 +02:00
66da4b3eb7 Merge pull request #418 from Ayushi-Desynova/main-1
Update alignment.py
2023-08-10 12:14:08 +01:00
18d5fdc995 Add telugu language to alignment.py 2023-08-10 12:13:52 +01:00
423667f00b Update alignment.py 2023-08-09 17:08:56 +05:30
1b092de19a Merge pull request #395 from Joemgu7/main
Fix repeat transcription on different languages and proper suppress_numerals use
2023-08-02 13:44:27 +01:00
69a52b00c7 Merge pull request #400 from davidas1/fast-diarize
make diarization faster
2023-08-02 13:43:20 +01:00
9e3145cead more 2023-08-02 10:36:56 +03:00
577db33430 more 2023-08-02 10:35:20 +03:00
da6ed83dc9 more 2023-08-02 10:34:42 +03:00
7eb9692cb9 more 2023-08-02 10:32:02 +03:00
8de0e2af51 make diarization faster 2023-08-02 10:11:43 +03:00
225f6b4d69 fix suppress_numerals 2023-07-29 19:34:51 +02:00
864976af23 fix issue by resetting tokenizer 2023-07-29 18:56:33 +02:00
9d736dca1c add some warning if languages do not match 2023-07-29 18:20:59 +02:00
d87f6268d0 fix preset language 2023-07-29 18:13:36 +02:00
d80b98601b Merge pull request #255 from tijszwinkels/cuda-11.8
Suggest using pytorch-cuda 11.8 instead of 11.7
2023-07-25 00:29:08 +01:00
aa37509362 Merge branch 'main' into cuda-11.8 2023-07-25 00:28:53 +01:00
15b4c558c2 Merge pull request #352 from daanelson/replicate-demo
adding link to Replicate demo
2023-07-24 10:48:24 +01:00
54504a2be8 Merge pull request #374 from abCods/main
Add Urdu model support for alignment
2023-07-24 10:47:52 +01:00
8c0fee90d3 Update alignment.py 2023-07-24 10:47:41 +01:00
016f0293cd Merge pull request #378 from baer/patch-1
Remove torchvision from README
2023-07-24 10:47:14 +01:00
44daf50501 Merge pull request #382 from mabergerx/patch-1
Update transcribe.py -> small change in `batch_size` description
2023-07-24 10:46:55 +01:00
48e7caad77 Update transcribe.py -> small change in batch_size description
Changed the description of the `batch_size` parameter.
2023-07-24 11:45:38 +02:00
8673064658 Remove torchvision from README 2023-07-20 17:02:34 -07:00
e6ecbaa68f Remove spacing 2023-07-20 03:20:47 +05:00
e92325b7eb Remove the fix 2023-07-20 03:19:37 +05:00
eb712f3999 Rectify reference to the word 2023-07-20 02:54:06 +05:00
30eff5a01f Replace double quotes to single for JSON parsing 2023-07-20 02:32:37 +05:00
734ecc2844 Add Urdu model support for alignment 2023-07-17 19:29:41 +05:00
512ab1acf9 adding Replicate demo 2023-06-30 18:22:10 -07:00
befe2b242e torch 2+ 2023-06-07 22:43:29 +01:00
f9c5ff9f08 Merge pull request #309 from Ca-ressemble-a-du-fake/patch-1
Add Audacity export
2023-06-07 11:50:05 +01:00
d39c1b2319 add "aud" to output_format 2023-06-07 11:48:49 +01:00
b13778fefd make aud optional 2023-06-07 11:47:49 +01:00
076ff96eb2 Add Audacity export
This exports the transcript to a text file that can be imported directly into Audacity as a label file. This is useful for quickly checking the transcript-audio alignment.
2023-06-07 05:49:49 +02:00
0c84c26d92 Merge pull request #303 from m-bain/v3
Suppress numerals
2023-06-05 15:46:26 +01:00
d7f1d16f19 suppress numerals change logic 2023-06-05 15:44:17 +01:00
74a00eecd7 suppress numerals fix 2023-06-05 15:33:04 +01:00
b026407fd9 Merge branch 'v3' of https://github.com/m-bain/whisperX into v3
Conflicts:
	whisperx/asr.py
2023-06-05 15:30:02 +01:00
a323cff654 --suppress_numerals option, ensures non-numerical words, for wav2vec2 alignment 2023-06-05 15:27:42 +01:00
93ed6cfa93 interspeech 2023-06-01 16:54:16 +01:00
9797a67391 Merge pull request #294 from SohaibAnwaar/fix/typehint-bug-fix
fix: Bug  in type  hinting
2023-05-30 11:13:22 +01:00
5a4382ae4d fix: Bug in type hinting 2023-05-30 15:11:07 +05:00
ec6a110cdf Merge pull request #290 from m-bain/main
push contributions from main
2023-05-29 12:55:24 +01:00
8d8c027a92 Merge pull request #278 from Mr-Turtleeeee/add_align_for_vi
Add wav2vec model for Vietnamese
2023-05-29 12:54:37 +01:00
4cbd3030cc no sentence split on mr. mrs. dr... 2023-05-29 12:48:14 +01:00
1c528d1a3c Merge pull request #284 from prameshbajra/main 2023-05-27 11:19:13 +01:00
c65e7ba9b4 Merge pull request #280 from Thebys/patch-1 2023-05-27 11:18:27 +01:00
5a47f458ac Added download path parameter. 2023-05-27 11:38:54 +02:00
f1032bb40a VAD unequal stack size, remove debug change 2023-05-26 20:39:19 +01:00
bc8a03881a Merge pull request #281 from m-bain/v3
fix Unequal Stack Size VAD error
2023-05-26 20:37:57 +01:00
42b4909bc0 fix Unequal Stack Size VAD error 2023-05-26 20:36:03 +01:00
bb15d6b68e Add Czech alignment model
This PR adds the following Czech alignment model: https://huggingface.co/comodoro/wav2vec2-xls-r-300m-cs-250.

I have successfully tested this with several Czech audio recordings up to 3 hours long, and the results are satisfactory.

However, I have received the following warnings and I am not sure how relevant they are:
```
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file C:\Users\Thebys\.cache\torch\whisperx-vad-segmentation.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
```
2023-05-26 21:17:01 +02:00
23d405e1cf Merge branch 'main' into add_align_for_vi 2023-05-26 17:14:09 +01:00
17e2f7f859 Merge pull request #277 from Boulaouaney/add-Korean-alignment-model
added Korean wav2vec2 model
2023-05-26 17:12:47 +01:00
1d9d630fb9 added Korean wav2vec2 model 2023-05-26 20:33:16 +09:00
9c042c2d28 Add wav2vec model for Vietnamese 2023-05-26 16:46:55 +07:00
a23f2aa3f7 Merge pull request #269 from sorgfresser/transcribe_keywords
Add transcribe keywords
2023-05-21 12:08:44 +01:00
7c5468116f Merge branch 'm-bain:main' into transcribe_keywords 2023-05-20 16:03:40 +02:00
a1c705b3a7 fix tokenizer is None 2023-05-20 15:52:45 +02:00
29a5e0b236 Merge pull request #266 from sorgfresser/main
Add device_index option
2023-05-20 14:45:34 +01:00
715435db42 add tokenizer is None case 2023-05-20 15:42:21 +02:00
1fc965bc1a add task, language keyword to transcribe 2023-05-20 15:30:25 +02:00
74b98ebfaa ensure device_index not None 2023-05-20 13:11:30 +02:00
53396adb21 add device_index 2023-05-20 13:02:46 +02:00
63fb5fc46f Suggest using pytorch-cuda 11.8 instead of 11.7
This prevents CuFFT errors on newer cards such as the RTX 4090 and RTX 6000 Ada.

fixes #254
2023-05-16 12:07:09 +02:00
d8a2b4ffc9 Merge pull request #246 from m-bain/v3
V3
2023-05-13 12:18:09 +01:00
9ffb7e7a23 Merge branch 'v3' of https://github.com/m-bain/whisperX into v3
Conflicts:
	setup.py
2023-05-13 12:16:33 +01:00
fd8f1003cf add translate, fix word_timestamp error 2023-05-13 12:14:06 +01:00
46b416296f Merge pull request #123 from koldbrandt/danish_alignment
Danish alignment model
2023-05-09 23:10:24 +01:00
7642390d0a Merge branch 'main' into danish_alignment 2023-05-09 23:10:13 +01:00
8b05ad4dae Merge pull request #235 from sorgfresser/main
Add custom typing for results
2023-05-09 23:05:02 +01:00
5421f1d7ca remove v3 tag on pip install 2023-05-09 13:42:50 +01:00
91e959ec4f Merge branch 'm-bain:main' into main 2023-05-08 20:46:25 +02:00
eabf35dff0 Custom result types 2023-05-08 20:45:34 +02:00
4919ad21fc Merge pull request #233 from sorgfresser/main
Fix tuple unpacking
2023-05-08 19:05:47 +01:00
b50aafb17b Fix tuple unpacking 2023-05-08 20:03:42 +02:00
2efa136114 update python usage example 2023-05-08 17:20:38 +01:00
0b839f3f01 Update README.md 2023-05-07 20:36:08 +01:00
1caddfb564 Merge pull request #225 from m-bain/v3
V3
2023-05-07 20:31:16 +01:00
7ad554c64f Merge branch 'main' into v3 2023-05-07 20:30:57 +01:00
4603f010a5 update readme, setup, add option to return char_timestamps 2023-05-07 20:28:33 +01:00
24008aa1ed fix long segments, break into sentences using nltk, improve align logic, improve diarize (sentence-based) 2023-05-07 15:32:58 +01:00
07361ba1d7 add device to dia pipeline @sorgfresser 2023-05-05 11:53:51 +01:00
4e2ac4e4e9 torch2.0, remove compile for now, round times to 3 decimals 2023-05-04 20:38:13 +01:00
d2116b98ca Merge pull request #210 from sorgfresser/v3
Update pyannote and torch version
2023-05-04 20:32:06 +01:00
d8f0ef4a19 Set diarization device manually 2023-05-04 16:25:34 +02:00
1b62c61c71 Merge pull request #216 from aramlang/blank_id-fix
Enable Hebrew support
2023-05-04 01:13:23 +01:00
2d59eb9726 Add torch compile to log mel spectrogram 2023-05-03 23:17:44 +02:00
cb53661070 Enable Hebrew support 2023-05-03 11:26:12 -05:00
2a6830492c Fix pyannote to specific commit 2023-05-02 20:25:56 +02:00
da3aabe181 Merge branch 'm-bain:v3' into v3 2023-05-02 18:55:43 +02:00
067189248f Use pyannote develop branch and torch version 2 2023-05-02 18:44:43 +02:00
b666523004 add v3 pre-release comment, and v4 progress update 2023-05-02 15:10:40 +01:00
69e038cbc4 Merge pull request #209 from SohaibAnwaar/feat-dockerfile
feat: adding the docker file
2023-05-02 14:55:30 +01:00
9fb51412c0 Merge pull request #208 from arnavmehta7/patch-1 2023-05-02 10:55:13 +01:00
a693a779fa feat: adding the docker file 2023-05-02 13:28:20 +05:00
64ca208cc8 Fixed the bug where the word_start variable was not initialized. 2023-05-02 13:13:02 +05:30
5becc99e56 Version bump pyannote, pytorch 2023-05-01 13:47:41 +02:00
e24ca9e0a2 Merge pull request #205 from prashanthellina/v3-fix-diarization 2023-04-30 21:08:45 +01:00
601c91140f references #202, attempt to fix speaker diarization failing in v3 2023-04-30 17:33:24 +00:00
31a9ec7466 Merge pull request #204 from sorgfresser/v3 2023-04-30 18:29:46 +01:00
b9c8c5072b Pad language detection if audio is too short 2023-04-30 18:34:18 +02:00
a903e57cf1 Merge pull request #199 from thomasmol/v3 2023-04-29 23:35:42 +01:00
cb176a186e added num_workers to fix pickling error 2023-04-29 19:51:05 +02:00
5b85c5433f Update setup.py 2023-04-28 16:47:04 +01:00
cc7e168d2b add checkout command 2023-04-25 12:14:23 +01:00
db97f29678 update pip install 2023-04-25 11:19:23 +01:00
25be8210e5 add v3 tag for install 2023-04-25 10:07:34 +01:00
0efad26066 pass compute_type 2023-04-24 21:26:44 +01:00
2a29f0ec6a add compute types 2023-04-24 21:24:22 +01:00
558d980535 v3 init 2023-04-24 21:08:43 +01:00
d31f6e0b8a Merge branch 'm-bain:main' into danish_alignment 2023-03-06 10:52:47 +01:00
c8404d9805 added a danish alignment model 2023-03-04 13:20:40 +01:00
22 changed files with 2233 additions and 1348 deletions

.github/workflows/build-and-release.yml

@@ -0,0 +1,35 @@
name: Build and release
on:
release:
types: [published]
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.9"
- name: Install dependencies
run: |
python -m pip install build
- name: Build wheels
run: python -m build --wheel
- name: Release to Github
uses: softprops/action-gh-release@v2
with:
files: dist/*
- name: Publish package to PyPi
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}


@@ -0,0 +1,32 @@
name: Python Compatibility Test
on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch: # Allows manual triggering from GitHub UI
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install package
run: |
python -m pip install --upgrade pip
pip install .
- name: Test import
run: |
python -c "import whisperx; print('Successfully imported whisperx')"

.github/workflows/tmp.yml

@@ -0,0 +1,35 @@
name: Python Compatibility Test (PyPi)
on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch: # Allows manual triggering from GitHub UI
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install package
run: |
pip install whisperx
- name: Print packages
run: |
pip list
- name: Test import
run: |
python -c "import whisperx; print('Successfully imported whisperx')"

.gitignore

@@ -1,2 +1,171 @@
-whisperx.egg-info/
-**/__pycache__/
+# Byte-compiled / optimized / DLL files
+__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# PyPI configuration file
.pypirc

LICENSE

@@ -1,27 +1,24 @@
BSD 2-Clause License

Copyright (c) 2024, Max Bain

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

MANIFEST.in

@@ -1,4 +1,3 @@
include whisperx/assets/*
include LICENSE
include requirements.txt

README.md

@@ -13,36 +13,36 @@
        <img src="https://img.shields.io/github/license/m-bain/whisperX.svg"
            alt="GitHub license">
    </a>
    <a href="https://arxiv.org/abs/2303.00747">
        <img src="http://img.shields.io/badge/Arxiv-2303.00747-B31B1B.svg"
            alt="ArXiv paper">
    </a>
    <a href="https://twitter.com/intent/tweet?text=&url=https%3A%2F%2Fgithub.com%2Fm-bain%2FwhisperX">
        <img src="https://img.shields.io/twitter/url/https/github.com/m-bain/whisperX.svg?style=social" alt="Twitter">
    </a>
</p>

<img width="1216" align="center" alt="whisperx-arch" src="https://raw.githubusercontent.com/m-bain/whisperX/refs/heads/main/figures/pipeline.png">

<!-- <p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy + quality via forced phoneme alignment and voice-activity based batching for fast inference.</p> -->

<!-- <h2 align="left", id="what-is-it">What is it 🔎</h2> -->

This repository provides fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.

- ⚡️ Batched inference for 70x realtime transcription using whisper large-v2
- 🪶 [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend, requires <8GB gpu memory for large-v2 with beam_size=5
- 🎯 Accurate word-level timestamps using wav2vec2 alignment
- 👯 Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (speaker ID labels)
- 🗣 VAD preprocessing, reduces hallucination & batching with no WER degradation

**Whisper** is an ASR model [developed by OpenAI](https://github.com/openai/whisper), trained on a large dataset of diverse audio. Whilst it does produce highly accurate transcriptions, the corresponding timestamps are at the utterance level, not per word, and can be inaccurate by several seconds. OpenAI's whisper does not natively support batching.

**Phoneme-Based ASR** A suite of models finetuned to recognise the smallest unit of speech distinguishing one word from another, e.g. the element p in "tap". A popular example model is [wav2vec2.0](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
@@ -50,58 +50,88 @@ This repository refines the timestamps of openAI's Whisper model via forced alignment

**Voice Activity Detection (VAD)** is the detection of the presence or absence of human speech.

**Speaker Diarization** is the process of partitioning an audio stream containing human speech into homogeneous segments according to the identity of each speaker.

<h2 align="left", id="highlights">New🚨</h2>

- 1st place at [Ego4d transcription challenge](https://eval.ai/web/challenges/challenge-page/1637/leaderboard/3931/WER) 🏆
- _WhisperX_ accepted at INTERSPEECH 2023
- v3 transcript segment-per-sentence: using nltk sent_tokenize for better subtitling & better diarization
- v3 released, 70x speed-up open-sourced. Using batched whisper with [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend!
- v2 released, code cleanup, imports whisper library. VAD filtering is now turned on by default, as in the paper.
- Paper drop🎓👨‍🏫! Please see our [ArXiv preprint](https://arxiv.org/abs/2303.00747) for benchmarking and details of WhisperX. We also introduce more efficient batch inference resulting in large-v2 with *60-70x REAL TIME speed.
<h2 align="left" id="setup">Setup ⚙️</h2>
Tested for PyTorch 2.0, Python 3.10 (use other versions at your own risk!)

GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed on the system. Please refer to the [CTranslate2 documentation](https://opennmt.net/CTranslate2/installation.html).

### 1. Create Python3.10 environment

`conda create --name whisperx python=3.10`

`conda activate whisperx`

### 2. Install PyTorch, e.g. for Linux and Windows CUDA11.8:

`conda install pytorch==2.0.0 torchaudio==2.0.0 pytorch-cuda=11.8 -c pytorch -c nvidia`

See other methods [here.](https://pytorch.org/get-started/previous-versions/#v200)

### 3. Install WhisperX

You have several installation options:

#### Option A: Stable Release (recommended)

Install the latest stable version from PyPI:

```bash
pip install whisperx
```

#### Option B: Development Version

Install the latest development version directly from GitHub (may be unstable):

```bash
pip install git+https://github.com/m-bain/whisperx.git
```

If already installed, update to the most recent commit:

```bash
pip install git+https://github.com/m-bain/whisperx.git --upgrade
```

#### Option C: Development Mode

If you wish to modify the package, clone and install in editable mode:

```bash
git clone https://github.com/m-bain/whisperX.git
cd whisperX
pip install -e .
```

> **Note**: The development version may contain experimental features and bugs. Use the stable PyPI release for production environments.

You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
### Speaker Diarization

To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)

> **Note**<br>
> As of Oct 11, 2023, there is a known issue regarding slow performance with pyannote/Speaker-Diarization-3.0 in whisperX. It is due to dependency conflicts between faster-whisper and pyannote-audio 3.0.0. Please see [this issue](https://github.com/m-bain/whisperX/issues/499) for more details and potential workarounds.
<h2 align="left" id="example">Usage 💬 (command line)</h2>
### English

Run whisper on the example segment (using default params, whisper small). Add `--highlight_words True` to visualise word timings in the .srt file.

    whisperx examples/sample01.wav

Result using *WhisperX* with forced alignment to wav2vec2.0 large:

https://user-images.githubusercontent.com/36994049/208253969-7e35fe2a-7541-434a-ae91-8e919540555d.mp4
@@ -110,6 +140,20 @@ Compare this to original whisper out the box, where many transcriptions are out
https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov
For increased timestamp accuracy, at the cost of higher GPU memory, use bigger models (a bigger alignment model was not found to be that helpful, see paper), e.g.

    whisperx examples/sample01.wav --model large-v2 --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --batch_size 4

To label the transcript with speaker IDs (set number of speakers if known e.g. `--min_speakers 2` `--max_speakers 2`):

    whisperx examples/sample01.wav --model large-v2 --diarize --highlight_words True

To run on CPU instead of GPU (and for running on Mac OS X):

    whisperx examples/sample01.wav --compute_type int8
### Other languages

The phoneme ASR alignment model is *language-specific*, for tested languages these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22).
@@ -119,7 +163,7 @@ Currently default models provided for `{en, fr, de, es, it, ja, zh, nl, uk, pt}`
#### E.g. German

    whisperx --model large-v2 --language de examples/sample_de_01.wav

https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov
@@ -130,72 +174,119 @@ See more examples in other languages [here](EXAMPLES.md).
```python
import whisperx
import gc

device = "cuda"
audio_file = "audio.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs
```
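The segments printed above are plain Python dicts (`start`, `end`, `text`, and, after diarization, `speaker`), so they are easy to post-process. As an illustrative sketch only (this helper is not part of the whisperx API; the CLI ships its own output writers), a minimal SRT serialiser for that segment shape could look like:

```python
def to_srt(segments):
    """Serialise segment dicts ({'start', 'end', 'text', optional 'speaker'}) to SRT."""
    def ts(seconds):
        # SRT timestamps use the form HH:MM:SS,mmm
        ms = round(seconds * 1000)
        h, ms = divmod(ms, 3_600_000)
        m, ms = divmod(ms, 60_000)
        s, ms = divmod(ms, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    blocks = []
    for i, seg in enumerate(segments, start=1):
        speaker = f"[{seg['speaker']}]: " if "speaker" in seg else ""
        blocks.append(f"{i}\n{ts(seg['start'])} --> {ts(seg['end'])}\n{speaker}{seg['text'].strip()}")
    return "\n\n".join(blocks) + "\n"

print(to_srt([{"start": 0.0, "end": 1.5, "text": " hello world", "speaker": "SPEAKER_00"}]))
```

Writing `to_srt(result["segments"])` to a `.srt` file then gives a subtitle track with speaker labels.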
## Demos 🚀

[![Replicate (large-v3](https://img.shields.io/static/v1?label=Replicate+WhisperX+large-v3&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/victor-upmeet/whisperx)
[![Replicate (large-v2](https://img.shields.io/static/v1?label=Replicate+WhisperX+large-v2&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/daanelson/whisperx)
[![Replicate (medium)](https://img.shields.io/static/v1?label=Replicate+WhisperX+medium&message=Demo+%26+Cloud+API&color=blue)](https://replicate.com/carnifexer/whisperx)

If you don't have access to your own GPUs, use the links above to try out WhisperX.

<h2 align="left" id="whisper-mod">Technical Details 👷‍♂️</h2>

For specific details on the batching and alignment, the effect of VAD, as well as the chosen alignment model, see the preprint [paper](https://www.robots.ox.ac.uk/~vgg/publications/2023/Bain23/bain23.pdf).
To reduce GPU memory requirements, try any of the following (2. & 3. can affect quality):

1. Reduce the batch size, e.g. `--batch_size 4`
2. Use a smaller ASR model, e.g. `--model base`
3. Use a lighter compute type, e.g. `--compute_type int8`

Transcription differences from openai's whisper:

1. Transcription without timestamps. To enable single-pass batching, whisper inference is performed with `--without_timestamps True`, ensuring one forward pass per sample in the batch. However, this can cause discrepancies with the default whisper output.
2. VAD-based segment transcription, unlike the buffered transcription of openai's. In the WhisperX paper we show this reduces WER, and enables accurate batched inference.
3. `--condition_on_prev_text` is set to `False` by default (reduces hallucination).
<h2 align="left" id="limitations">Limitations ⚠️</h2>
- Transcript words which do not contain characters in the alignment model's dictionary, e.g. "2014." or "£13.60", cannot be aligned and therefore are not given a timing.
- Overlapping speech is not handled particularly well by whisper nor whisperx.
- Diarization is far from perfect.
- A language-specific wav2vec2 model is needed.
<h2 align="left" id="contribute">Contribute 🧑‍🏫</h2>
If you are multilingual, a major way you can contribute to this project is to find phoneme models on huggingface (or train your own) and test them on speech for the target language. If the results look good, send a pull request and some examples showing its success.

Bug finding and pull requests are also highly appreciated to keep this project going, since it's already diverging from the original research scope.
<h2 align="left" id="coming-soon">TODO 🗓</h2>
* [x] Multilingual init
* [x] Automatic align model selection based on language detection
* [x] Python usage
* [x] Incorporating speaker diarization
* [x] Model flush, for low gpu mem resources
* [x] Faster-whisper backend
* [x] Add max-line etc. see (openai's whisper utils.py)
* [x] Sentence-level segments (nltk toolbox)
* [x] Improve alignment logic
* [ ] update examples with diarization and word highlighting
* [ ] Subtitle .ass output <- bring this back (removed in v3)
* [ ] Add benchmarking code (TEDLIUM for spd/WER & word segmentation)
* [ ] Allow silero-vad as alternative VAD option
* [ ] Improve diarization (word level). *Harder than first thought...*
<h2 align="left" id="contact">Contact/Support 📇</h2>

Contact maxhbain@gmail.com for queries.
<a href="https://www.buymeacoffee.com/maxhbain" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
@@ -204,12 +295,19 @@ Contact maxhbain@gmail.com for queries and licensing / early access to a model A
This work, and my PhD, is supported by the [VGG (Visual Geometry Group)](https://www.robots.ox.ac.uk/~vgg/) and the University of Oxford.

Of course, this builds on [openAI's whisper](https://github.com/openai/whisper).
Borrows important alignment code from [PyTorch tutorial on forced alignment](https://pytorch.org/tutorials/intermediate/forced_alignment_with_torchaudio_tutorial.html)
Valuable VAD & Diarization Models from [pyannote audio](https://github.com/pyannote/pyannote-audio)
Great backend from [faster-whisper](https://github.com/guillaumekln/faster-whisper) and [CTranslate2](https://github.com/OpenNMT/CTranslate2)

Those who have [supported this work financially](https://www.buymeacoffee.com/maxhbain) 🙏

Finally, thanks to the OS [contributors](https://github.com/m-bain/whisperX/graphs/contributors) of this project, keeping it going and identifying bugs.
<h2 align="left" id="cite">Citation</h2>

If you use this in your research, please cite the paper:
@@ -217,40 +315,7 @@ If you use this in your research, please cite the paper:
@article{bain2022whisperx,
    title={WhisperX: Time-Accurate Speech Transcription of Long-Form Audio},
    author={Bain, Max and Huh, Jaesung and Han, Tengda and Zisserman, Andrew},
    journal={INTERSPEECH 2023},
    year={2023}
}
```
as well the following works, used in each stage of the pipeline:
```bibtex
@article{radford2022robust,
title={Robust speech recognition via large-scale weak supervision},
author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
journal={arXiv preprint arXiv:2212.04356},
year={2022}
}
```
```bibtex
@article{baevski2020wav2vec,
title={wav2vec 2.0: A framework for self-supervised learning of speech representations},
author={Baevski, Alexei and Zhou, Yuhao and Mohamed, Abdelrahman and Auli, Michael},
journal={Advances in neural information processing systems},
volume={33},
pages={12449--12460},
year={2020}
}
```
```bibtex
@inproceedings{bredin2020pyannote,
title={Pyannote. audio: neural building blocks for speaker diarization},
author={Bredin, Herv{\'e} and Yin, Ruiqing and Coria, Juan Manuel and Gelly, Gregory and Korshunov, Pavel and Lavechin, Marvin and Fustes, Diego and Titeux, Hadrien and Bouaziz, Wassim and Gill, Marie-Philippe},
booktitle={ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={7124--7128},
year={2020},
organization={IEEE}
}
```

requirements.txt

@@ -1,10 +1,8 @@
torch>=2
torchaudio>=2
faster-whisper==1.1.0
ctranslate2<4.5.0
transformers
pandas
setuptools>=65
nltk

setup.py

@@ -1,28 +1,33 @@
import os

import pkg_resources
from setuptools import find_packages, setup

with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="whisperx",
    py_modules=["whisperx"],
    version="3.3.1",
    description="Time-Accurate Automatic Speech Recognition using Whisper.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.9, <3.13",
    author="Max Bain",
    url="https://github.com/m-bain/whisperx",
    license="BSD-2-Clause",
    packages=find_packages(exclude=["tests*"]),
    install_requires=[
        str(r)
        for r in pkg_resources.parse_requirements(
            open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
        )
    ]
    + ["pyannote.audio==3.3.2"],
    entry_points={
        "console_scripts": ["whisperx=whisperx.transcribe:cli"],
    },
    include_package_data=True,
    extras_require={"dev": ["pytest"]},
)

whisperx/SubtitlesProcessor.py

@@ -0,0 +1,227 @@
import math
from typing import TextIO

from .conjunctions import get_conjunctions, get_comma


def normal_round(n):
    if n - math.floor(n) < 0.5:
        return math.floor(n)
    return math.ceil(n)
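`normal_round` exists because Python's built-in `round` uses banker's rounding, where ties go to the nearest even integer, while subtitle splitting wants plain half-up behaviour. Restating the helper standalone for a runnable comparison:

```python
import math

def normal_round(n):
    # round half up, unlike the built-in round(), which rounds ties to even
    if n - math.floor(n) < 0.5:
        return math.floor(n)
    return math.ceil(n)

print(round(2.5), normal_round(2.5))  # built-in gives 2, half-up gives 3
```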
def format_timestamp(seconds: float, is_vtt: bool = False):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    separator = '.' if is_vtt else ','
    hours_marker = f"{hours:02d}:"
    return (
        f"{hours_marker}{minutes:02d}:{seconds:02d}{separator}{milliseconds:03d}"
    )
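For example, `format_timestamp` emits SRT-style `HH:MM:SS,mmm` by default and switches to the `.` separator for WebVTT. Restated standalone (minus the assert) for a self-contained check:

```python
def format_timestamp(seconds: float, is_vtt: bool = False):
    # restated from the module above so the example runs on its own
    milliseconds = round(seconds * 1000.0)
    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000
    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000
    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000
    separator = '.' if is_vtt else ','
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}{separator}{milliseconds:03d}"

print(format_timestamp(3661.5))               # 01:01:01,500
print(format_timestamp(3661.5, is_vtt=True))  # 01:01:01.500
```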
class SubtitlesProcessor:
def __init__(self, segments, lang, max_line_length = 45, min_char_length_splitter = 30, is_vtt = False):
self.comma = get_comma(lang)
self.conjunctions = set(get_conjunctions(lang))
self.segments = segments
self.lang = lang
self.max_line_length = max_line_length
self.min_char_length_splitter = min_char_length_splitter
self.is_vtt = is_vtt
complex_script_languages = ['th', 'lo', 'my', 'km', 'am', 'ko', 'ja', 'zh', 'ti', 'ta', 'te', 'kn', 'ml', 'hi', 'ne', 'mr', 'ar', 'fa', 'ur', 'ka']
if self.lang in complex_script_languages:
self.max_line_length = 30
self.min_char_length_splitter = 20
def estimate_timestamp_for_word(self, words, i, next_segment_start_time=None):
k = 0.25
has_prev_end = i > 0 and 'end' in words[i - 1]
has_next_start = i < len(words) - 1 and 'start' in words[i + 1]
if has_prev_end:
words[i]['start'] = words[i - 1]['end']
if has_next_start:
words[i]['end'] = words[i + 1]['start']
else:
if next_segment_start_time:
words[i]['end'] = next_segment_start_time if next_segment_start_time - words[i - 1]['end'] <= 1 else next_segment_start_time - 0.5
else:
words[i]['end'] = words[i]['start'] + len(words[i]['word']) * k
elif has_next_start:
words[i]['start'] = words[i + 1]['start'] - len(words[i]['word']) * k
words[i]['end'] = words[i + 1]['start']
else:
if next_segment_start_time:
words[i]['start'] = next_segment_start_time - 1
words[i]['end'] = next_segment_start_time - 0.5
else:
words[i]['start'] = 0
words[i]['end'] = 0
def process_segments(self, advanced_splitting=True):
subtitles = []
for i, segment in enumerate(self.segments):
next_segment_start_time = self.segments[i + 1]['start'] if i + 1 < len(self.segments) else None
if advanced_splitting:
split_points = self.determine_advanced_split_points(segment, next_segment_start_time)
subtitles.extend(self.generate_subtitles_from_split_points(segment, split_points, next_segment_start_time))
else:
words = segment['words']
for i, word in enumerate(words):
if 'start' not in word or 'end' not in word:
self.estimate_timestamp_for_word(words, i, next_segment_start_time)
subtitles.append({
'start': segment['start'],
'end': segment['end'],
'text': segment['text']
})
return subtitles
def determine_advanced_split_points(self, segment, next_segment_start_time=None):
split_points = []
last_split_point = 0
char_count = 0
words = segment.get('words', segment['text'].split())
add_space = 0 if self.lang in ['zh', 'ja'] else 1
total_char_count = sum(len(word['word']) if isinstance(word, dict) else len(word) + add_space for word in words)
char_count_after = total_char_count
for i, word in enumerate(words):
word_text = word['word'] if isinstance(word, dict) else word
word_length = len(word_text) + add_space
char_count += word_length
char_count_after -= word_length
char_count_before = char_count - word_length
if isinstance(word, dict) and ('start' not in word or 'end' not in word):
self.estimate_timestamp_for_word(words, i, next_segment_start_time)
if char_count >= self.max_line_length:
midpoint = normal_round((last_split_point + i) / 2)
if char_count_before >= self.min_char_length_splitter:
split_points.append(midpoint)
last_split_point = midpoint + 1
char_count = sum(len(words[j]['word']) if isinstance(words[j], dict) else len(words[j]) + add_space for j in range(last_split_point, i + 1))
elif word_text.endswith(self.comma) and char_count_before >= self.min_char_length_splitter and char_count_after >= self.min_char_length_splitter:
split_points.append(i)
last_split_point = i + 1
char_count = 0
elif word_text.lower() in self.conjunctions and char_count_before >= self.min_char_length_splitter and char_count_after >= self.min_char_length_splitter:
split_points.append(i - 1)
last_split_point = i
char_count = word_length
return split_points
def generate_subtitles_from_split_points(self, segment, split_points, next_start_time=None):
subtitles = []
words = segment.get('words', segment['text'].split())
total_word_count = len(words)
total_time = segment['end'] - segment['start']
elapsed_time = segment['start']
prefix = ' ' if self.lang not in ['zh', 'ja'] else ''
start_idx = 0
for split_point in split_points:
fragment_words = words[start_idx:split_point + 1]
current_word_count = len(fragment_words)
if isinstance(fragment_words[0], dict):
start_time = fragment_words[0]['start']
end_time = fragment_words[-1]['end']
next_start_time_for_word = words[split_point + 1]['start'] if split_point + 1 < len(words) else None
if next_start_time_for_word and (next_start_time_for_word - end_time) <= 0.8:
end_time = next_start_time_for_word
else:
fragment = prefix.join(fragment_words).strip()
current_duration = (current_word_count / total_word_count) * total_time
start_time = elapsed_time
end_time = elapsed_time + current_duration
elapsed_time += current_duration
subtitles.append({
'start': start_time,
'end': end_time,
'text': fragment if not isinstance(fragment_words[0], dict) else prefix.join(word['word'] for word in fragment_words)
})
start_idx = split_point + 1
# Handle the last fragment
if start_idx < len(words):
fragment_words = words[start_idx:]
current_word_count = len(fragment_words)
if isinstance(fragment_words[0], dict):
start_time = fragment_words[0]['start']
end_time = fragment_words[-1]['end']
else:
fragment = prefix.join(fragment_words).strip()
current_duration = (current_word_count / total_word_count) * total_time
start_time = elapsed_time
end_time = elapsed_time + current_duration
if next_start_time and (next_start_time - end_time) <= 0.8:
end_time = next_start_time
subtitles.append({
'start': start_time,
'end': end_time if end_time is not None else segment['end'],
'text': fragment if not isinstance(fragment_words[0], dict) else prefix.join(word['word'] for word in fragment_words)
})
return subtitles
def save(self, filename="subtitles.srt", advanced_splitting=True):
subtitles = self.process_segments(advanced_splitting)
def write_subtitle(file, idx, start_time, end_time, text):
file.write(f"{idx}\n")
file.write(f"{start_time} --> {end_time}\n")
file.write(text + "\n\n")
with open(filename, 'w', encoding='utf-8') as file:
if self.is_vtt:
file.write("WEBVTT\n\n")
if advanced_splitting:
for idx, subtitle in enumerate(subtitles, 1):
start_time = format_timestamp(subtitle['start'], self.is_vtt)
end_time = format_timestamp(subtitle['end'], self.is_vtt)
text = subtitle['text'].strip()
write_subtitle(file, idx, start_time, end_time, text)
return len(subtitles)
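For reference, the `format_timestamp` helper in the new file can be exercised on its own; a minimal standalone sketch (logic copied from the function above, unchanged) showing the SRT comma vs. WebVTT period separators:

```python
def format_timestamp(seconds: float, is_vtt: bool = False) -> str:
    # Mirrors the helper above: SRT uses "HH:MM:SS,mmm", WebVTT "HH:MM:SS.mmm".
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)
    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000
    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000
    secs = milliseconds // 1_000
    milliseconds -= secs * 1_000
    separator = "." if is_vtt else ","
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{separator}{milliseconds:03d}"

print(format_timestamp(3661.5))              # SRT style: 01:01:01,500
print(format_timestamp(3661.5, is_vtt=True)) # WebVTT style: 01:01:01.500
```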


@@ -1,3 +1,4 @@
-from .transcribe import transcribe, transcribe_with_vad
+from .transcribe import load_model
 from .alignment import load_align_model, align
-from .vad import load_vad_model
+from .audio import load_audio
+from .diarize import assign_word_speakers, DiarizationPipeline


@@ -2,16 +2,22 @@
 Forced Alignment with Whisper
 C. Max Bain
 """
+from dataclasses import dataclass
+from typing import Iterable, Optional, Union, List
 import numpy as np
 import pandas as pd
-from typing import List, Union, Iterator, TYPE_CHECKING
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import torchaudio
 import torch
-from dataclasses import dataclass
+import torchaudio
-from whisper.audio import SAMPLE_RATE, load_audio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from .utils import interpolate_nans
+from .audio import SAMPLE_RATE, load_audio
+from .utils import interpolate_nans
+from .types import AlignedTranscriptionResult, SingleSegment, SingleAlignedSegment, SingleWordSegment
+import nltk
+from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
+PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof']
 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
@@ -30,6 +36,7 @@ DEFAULT_ALIGN_MODELS_HF = {
     "uk": "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm",
     "pt": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
     "ar": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
+    "cs": "comodoro/wav2vec2-xls-r-300m-cs-250",
     "ru": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
     "pl": "jonatasgrosman/wav2vec2-large-xlsr-53-polish",
     "hu": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
@@ -37,10 +44,28 @@ DEFAULT_ALIGN_MODELS_HF = {
     "fa": "jonatasgrosman/wav2vec2-large-xlsr-53-persian",
     "el": "jonatasgrosman/wav2vec2-large-xlsr-53-greek",
     "tr": "mpoyraz/wav2vec2-xls-r-300m-cv7-turkish",
+    "da": "saattrupdan/wav2vec2-xls-r-300m-ftspeech",
+    "he": "imvladikon/wav2vec2-xls-r-300m-hebrew",
+    "vi": 'nguyenvulebinh/wav2vec2-base-vi',
+    "ko": "kresnik/wav2vec2-large-xlsr-korean",
+    "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
+    "te": "anuragshas/wav2vec2-large-xlsr-53-telugu",
+    "hi": "theainerd/Wav2Vec2-large-xlsr-hindi",
+    "ca": "softcatala/wav2vec2-large-xlsr-catala",
+    "ml": "gvs/wav2vec2-large-xlsr-malayalam",
+    "no": "NbAiLab/nb-wav2vec2-1b-bokmaal-v2",
+    "nn": "NbAiLab/nb-wav2vec2-1b-nynorsk",
+    "sk": "comodoro/wav2vec2-xls-r-300m-sk-cv8",
+    "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
+    "hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
+    "ro": "gigant/romanian-wav2vec2",
+    "eu": "stefan-it/wav2vec2-large-xlsr-53-basque",
+    "gl": "ifrz/wav2vec2-large-xlsr-galician",
+    "ka": "xsway/wav2vec2-large-xlsr-georgian",
 }
-def load_align_model(language_code, device, model_name=None, model_dir=None):
+def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None):
     if model_name is None:
         # use default model
         if language_code in DEFAULT_ALIGN_MODELS_TORCH:
@@ -60,8 +85,8 @@ def load_align_model(language_code, device, model_name=None, model_dir=None):
         align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
     else:
         try:
-            processor = Wav2Vec2Processor.from_pretrained(model_name)
+            processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir)
-            align_model = Wav2Vec2ForCTC.from_pretrained(model_name)
+            align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir)
         except Exception as e:
             print(e)
             print(f"Error loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 models")
@@ -77,49 +102,20 @@ def load_align_model(language_code, device, model_name=None, model_dir=None):
 def align(
-    transcript: Iterator[dict],
+    transcript: Iterable[SingleSegment],
     model: torch.nn.Module,
     align_model_metadata: dict,
     audio: Union[str, np.ndarray, torch.Tensor],
     device: str,
-    extend_duration: float = 0.0,
-    start_from_previous: bool = True,
     interpolate_method: str = "nearest",
-):
+    return_char_alignments: bool = False,
+    print_progress: bool = False,
+    combined_progress: bool = False,
+) -> AlignedTranscriptionResult:
     """
-    Force align phoneme recognition predictions to known transcription
+    Align phoneme recognition predictions to known transcription.
-    Parameters
-    ----------
-    transcript: Iterator[dict]
-        The Whisper model instance
-    model: torch.nn.Module
-        Alignment model (wav2vec2)
-    audio: Union[str, np.ndarray, torch.Tensor]
-        The path to the audio file to open, or the audio waveform
-    device: str
-        cuda device
-    diarization: pd.DataFrame {'start': List[float], 'end': List[float], 'speaker': List[float]}
-        diarization segments with speaker labels.
-    extend_duration: float
-        Amount to pad input segments by. If not using vad--filter then recommended to use 2 seconds
-        If the gzip compression ratio is above this value, treat as failed
-    interpolate_method: str ["nearest", "linear", "ignore"]
-        Method to assign timestamps to non-aligned words. Words are not able to be aligned when none of the characters occur in the align model dictionary.
-        "nearest" copies timestamp of nearest word within the segment. "linear" is linear interpolation. "drop" removes that word from output.
-    Returns
-    -------
-    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
-    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
     """
     if not torch.is_tensor(audio):
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -133,42 +129,27 @@ def align(
     model_lang = align_model_metadata["language"]
     model_type = align_model_metadata["type"]
-    aligned_segments = []
+    # 1. Preprocess to keep only characters in dictionary
+    total_segments = len(transcript)
-    prev_t2 = 0
-    char_segments_arr = {
-        "segment-idx": [],
-        "subsegment-idx": [],
-        "word-idx": [],
-        "char": [],
-        "start": [],
-        "end": [],
-        "score": [],
-    }
     for sdx, segment in enumerate(transcript):
-        while True:
-            segment_align_success = False
         # strip spaces at beginning / end, but keep track of the amount.
+        if print_progress:
+            base_progress = ((sdx + 1) / total_segments) * 100
+            percent_complete = (50 + base_progress / 2) if combined_progress else base_progress
+            print(f"Progress: {percent_complete:.2f}%...")
         num_leading = len(segment["text"]) - len(segment["text"].lstrip())
         num_trailing = len(segment["text"]) - len(segment["text"].rstrip())
-        transcription = segment["text"]
+        text = segment["text"]
+        # TODO: convert number tokenizer / symbols to phonetic words for alignment.
+        # e.g. "$300" -> "three hundred dollars"
+        # currently "$300" is ignored since no characters present in the phonetic dictionary
         # split into words
         if model_lang not in LANGUAGES_WITHOUT_SPACES:
-            per_word = transcription.split(" ")
+            per_word = text.split(" ")
         else:
-            per_word = transcription
+            per_word = text
-        # first check that characters in transcription can be aligned (they are contained in align model"s dictionary)
         clean_char, clean_cdx = [], []
-        for cdx, char in enumerate(transcription):
+        for cdx, char in enumerate(text):
             char_ = char.lower()
             # wav2vec2 models use "|" character to represent spaces
             if model_lang not in LANGUAGES_WITHOUT_SPACES:
@@ -177,7 +158,7 @@ def align(
             # ignore whitespace at beginning and end of transcript
             if cdx < num_leading:
                 pass
-            elif cdx > len(transcription) - num_trailing - 1:
+            elif cdx > len(text) - num_trailing - 1:
                 pass
             elif char_ in model_dictionary.keys():
                 clean_char.append(char_)
@@ -188,40 +169,67 @@ def align(
             if any([c in model_dictionary.keys() for c in wrd]):
                 clean_wdx.append(wdx)
-        # if no characters are in the dictionary, then we skip this segment...
-        if len(clean_char) == 0:
+        punkt_param = PunktParameters()
+        punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
+        sentence_splitter = PunktSentenceTokenizer(punkt_param)
+        sentence_spans = list(sentence_splitter.span_tokenize(text))
+        segment["clean_char"] = clean_char
+        segment["clean_cdx"] = clean_cdx
+        segment["clean_wdx"] = clean_wdx
+        segment["sentence_spans"] = sentence_spans
+    aligned_segments: List[SingleAlignedSegment] = []
+    # 2. Get prediction matrix from alignment model & align
+    for sdx, segment in enumerate(transcript):
+        t1 = segment["start"]
+        t2 = segment["end"]
+        text = segment["text"]
+        aligned_seg: SingleAlignedSegment = {
+            "start": t1,
+            "end": t2,
+            "text": text,
+            "words": [],
+        }
+        if return_char_alignments:
+            aligned_seg["chars"] = []
+        # check we can align
+        if len(segment["clean_char"]) == 0:
             print(f'Failed to align segment ("{segment["text"]}"): no characters in this segment found in model dictionary, resorting to original...')
-            break
+            aligned_segments.append(aligned_seg)
+            continue
-        transcription_cleaned = "".join(clean_char)
-        tokens = [model_dictionary[c] for c in transcription_cleaned]
-        # we only pad if not using VAD filtering
-        if "seg_text" not in segment:
-            # pad according original timestamps
-            t1 = max(segment["start"] - extend_duration, 0)
-            t2 = min(segment["end"] + extend_duration, MAX_DURATION)
-            # use prev_t2 as current t1 if it"s later
-            if start_from_previous and t1 < prev_t2:
-                t1 = prev_t2
-        # check if timestamp range is still valid
         if t1 >= MAX_DURATION:
-            print("Failed to align segment: original start time longer than audio duration, skipping...")
+            print(f'Failed to align segment ("{segment["text"]}"): original start time longer than audio duration, skipping...')
-            break
+            aligned_segments.append(aligned_seg)
+            continue
-        if t2 - t1 < 0.02:
-            print("Failed to align segment: duration smaller than 0.02s time precision")
-            break
+        text_clean = "".join(segment["clean_char"])
+        tokens = [model_dictionary[c] for c in text_clean]
         f1 = int(t1 * SAMPLE_RATE)
         f2 = int(t2 * SAMPLE_RATE)
-        # TODO: Probably can get some speedup gain with batched inference here
         waveform_segment = audio[:, f1:f2]
+        # Handle the minimum input length for wav2vec2 models
+        if waveform_segment.shape[-1] < 400:
+            lengths = torch.as_tensor([waveform_segment.shape[-1]]).to(device)
+            waveform_segment = torch.nn.functional.pad(
+                waveform_segment, (0, 400 - waveform_segment.shape[-1])
+            )
+        else:
+            lengths = None
         with torch.inference_mode():
             if model_type == "torchaudio":
-                emissions, _ = model(waveform_segment.to(device))
+                emissions, _ = model(waveform_segment.to(device), lengths=lengths)
             elif model_type == "huggingface":
                 emissions = model(waveform_segment.to(device)).logits
             else:
@@ -230,206 +238,124 @@ def align(
             emission = emissions[0].cpu().detach()
-        trellis = get_trellis(emission, tokens)
+        blank_id = 0
-        path = backtrack(trellis, emission, tokens)
+        for char, code in model_dictionary.items():
+            if char == '[pad]' or char == '<pad>':
+                blank_id = code
+        trellis = get_trellis(emission, tokens, blank_id)
+        path = backtrack(trellis, emission, tokens, blank_id)
         if path is None:
             print(f'Failed to align segment ("{segment["text"]}"): backtrack failed, resorting to original...')
-            break
+            aligned_segments.append(aligned_seg)
+            continue
-        char_segments = merge_repeats(path, transcription_cleaned)
+        char_segments = merge_repeats(path, text_clean)
-        # word_segments = merge_words(char_segments)
-        # sub-segments
+        duration = t2 -t1
-        if "seg-text" not in segment:
-            segment["seg-text"] = [transcription]
-        seg_lens = [0] + [len(x) for x in segment["seg-text"]]
-        seg_lens_cumsum = list(np.cumsum(seg_lens))
-        sub_seg_idx = 0
-        wdx = 0
-        duration = t2 - t1
         ratio = duration * waveform_segment.size(0) / (trellis.size(0) - 1)
-        for cdx, char in enumerate(transcription + " "):
-            is_last = False
-            if cdx == len(transcription):
-                break
-            elif cdx+1 == len(transcription):
-                is_last = True
+        # assign timestamps to aligned characters
+        char_segments_arr = []
+        word_idx = 0
+        for cdx, char in enumerate(text):
            start, end, score = None, None, None
-            if cdx in clean_cdx:
+            if cdx in segment["clean_cdx"]:
-                char_seg = char_segments[clean_cdx.index(cdx)]
+                char_seg = char_segments[segment["clean_cdx"].index(cdx)]
-                start = char_seg.start * ratio + t1
+                start = round(char_seg.start * ratio + t1, 3)
-                end = char_seg.end * ratio + t1
+                end = round(char_seg.end * ratio + t1, 3)
-                score = char_seg.score
+                score = round(char_seg.score, 3)
-            char_segments_arr["char"].append(char)
+            char_segments_arr.append(
-            char_segments_arr["start"].append(start)
+                {
-            char_segments_arr["end"].append(end)
+                    "char": char,
-            char_segments_arr["score"].append(score)
+                    "start": start,
-            char_segments_arr["word-idx"].append(wdx)
+                    "end": end,
-            char_segments_arr["segment-idx"].append(sdx)
+                    "score": score,
-            char_segments_arr["subsegment-idx"].append(sub_seg_idx)
+                    "word-idx": word_idx,
+                }
+            )
-            # word-level info
+            # increment word_idx, nltk word tokenization would probably be more robust here, but us space for now...
             if model_lang in LANGUAGES_WITHOUT_SPACES:
-                # character == word
+                word_idx += 1
-                wdx += 1
+            elif cdx == len(text) - 1 or text[cdx+1] == " ":
-            elif is_last or transcription[cdx+1] == " " or cdx == seg_lens_cumsum[sub_seg_idx+1] - 1:
+                word_idx += 1
-                wdx += 1
-                if is_last or cdx == seg_lens_cumsum[sub_seg_idx+1] - 1:
-                    wdx = 0
-                    sub_seg_idx += 1
-            prev_t2 = segment["end"]
-            segment_align_success = True
-            # end while True loop
-            break
-        # reset prev_t2 due to drifting issues
-        if not segment_align_success:
-            prev_t2 = 0
         char_segments_arr = pd.DataFrame(char_segments_arr)
-    not_space = char_segments_arr["char"] != " "
-    per_seg_grp = char_segments_arr.groupby(["segment-idx", "subsegment-idx"], as_index = False)
+        aligned_subsegments = []
-    char_segments_arr = per_seg_grp.apply(lambda x: x.reset_index(drop = True)).reset_index()
+        # assign sentence_idx to each character index
-    per_word_grp = char_segments_arr[not_space].groupby(["segment-idx", "subsegment-idx", "word-idx"])
+        char_segments_arr["sentence-idx"] = None
-    per_subseg_grp = char_segments_arr[not_space].groupby(["segment-idx", "subsegment-idx"])
+        for sdx, (sstart, send) in enumerate(segment["sentence_spans"]):
-    per_seg_grp = char_segments_arr[not_space].groupby(["segment-idx"])
+            curr_chars = char_segments_arr.loc[(char_segments_arr.index >= sstart) & (char_segments_arr.index <= send)]
-    char_segments_arr["local-char-idx"] = char_segments_arr.groupby(["segment-idx", "subsegment-idx"]).cumcount()
+            char_segments_arr.loc[(char_segments_arr.index >= sstart) & (char_segments_arr.index <= send), "sentence-idx"] = sdx
-    per_word_grp = char_segments_arr[not_space].groupby(["segment-idx", "subsegment-idx", "word-idx"])
+            # regroup
-    word_segments_arr = {}
+            sentence_text = text[sstart:send]
+            sentence_start = curr_chars["start"].min()
+            end_chars = curr_chars[curr_chars["char"] != ' ']
+            sentence_end = end_chars["end"].max()
+            sentence_words = []
-    # start of word is first char with a timestamp
+            for word_idx in curr_chars["word-idx"].unique():
-    word_segments_arr["start"] = per_word_grp["start"].min().values
+                word_chars = curr_chars.loc[curr_chars["word-idx"] == word_idx]
-    # end of word is last char with a timestamp
+                word_text = "".join(word_chars["char"].tolist()).strip()
-    word_segments_arr["end"] = per_word_grp["end"].max().values
+                if len(word_text) == 0:
-    # score of word is mean (excluding nan)
+                    continue
-    word_segments_arr["score"] = per_word_grp["score"].mean().values
-    word_segments_arr["segment-text-start"] = per_word_grp["local-char-idx"].min().astype(int).values
+                # dont use space character for alignment
-    word_segments_arr["segment-text-end"] = per_word_grp["local-char-idx"].max().astype(int).values+1
+                word_chars = word_chars[word_chars["char"] != " "]
-    word_segments_arr = pd.DataFrame(word_segments_arr)
-    word_segments_arr[["segment-idx", "subsegment-idx", "word-idx"]] = per_word_grp["local-char-idx"].min().reset_index()[["segment-idx", "subsegment-idx", "word-idx"]].astype(int)
+                word_start = word_chars["start"].min()
-    segments_arr = {}
+                word_end = word_chars["end"].max()
-    segments_arr["start"] = per_subseg_grp["start"].min().reset_index()["start"]
+                word_score = round(word_chars["score"].mean(), 3)
-    segments_arr["end"] = per_subseg_grp["end"].max().reset_index()["end"]
-    segments_arr = pd.DataFrame(segments_arr)
-    segments_arr[["segment-idx", "subsegment-idx-start"]] = per_subseg_grp["start"].min().reset_index()[["segment-idx", "subsegment-idx"]]
-    segments_arr["subsegment-idx-end"] = segments_arr["subsegment-idx-start"] + 1
-    # interpolate missing words / sub-segments
+                # -1 indicates unalignable
-    if interpolate_method != "ignore":
+                word_segment = {"word": word_text}
-        wrd_subseg_grp = word_segments_arr.groupby(["segment-idx", "subsegment-idx"], group_keys=False)
-        wrd_seg_grp = word_segments_arr.groupby(["segment-idx"], group_keys=False)
-        # we still know which word timestamps are interpolated because their score == nan
-        word_segments_arr["start"] = wrd_subseg_grp['start'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
-        word_segments_arr["end"] = wrd_subseg_grp['end'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
-        word_segments_arr["start"] = wrd_seg_grp['start'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
+                if not np.isnan(word_start):
-        word_segments_arr["end"] = wrd_seg_grp['end'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
+                    word_segment["start"] = word_start
+                if not np.isnan(word_end):
+                    word_segment["end"] = word_end
+                if not np.isnan(word_score):
+                    word_segment["score"] = word_score
-        sub_seg_grp = segments_arr.groupby(["segment-idx"], group_keys=False)
+                sentence_words.append(word_segment)
-        segments_arr['start'] = sub_seg_grp['start'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
-        segments_arr['end'] = sub_seg_grp['end'].apply(lambda group: interpolate_nans(group, method=interpolate_method))
-    # merge words & subsegments which are missing times
+                aligned_subsegments.append({
-        word_grp = word_segments_arr.groupby(["segment-idx", "subsegment-idx", "end"])
+                    "text": sentence_text,
+                    "start": sentence_start,
+                    "end": sentence_end,
+                    "words": sentence_words,
+                })
-        word_segments_arr["segment-text-start"] = word_grp["segment-text-start"].transform(min)
+                if return_char_alignments:
-        word_segments_arr["segment-text-end"] = word_grp["segment-text-end"].transform(max)
+                    curr_chars = curr_chars[["char", "start", "end", "score"]]
-        word_segments_arr.drop_duplicates(subset=["segment-idx", "subsegment-idx", "end"], inplace=True)
+                    curr_chars.fillna(-1, inplace=True)
+                    curr_chars = curr_chars.to_dict("records")
+                    curr_chars = [{key: val for key, val in char.items() if val != -1} for char in curr_chars]
+                    aligned_subsegments[-1]["chars"] = curr_chars
-        seg_grp_dup = segments_arr.groupby(["segment-idx", "start", "end"])
+        aligned_subsegments = pd.DataFrame(aligned_subsegments)
-        segments_arr["subsegment-idx-start"] = seg_grp_dup["subsegment-idx-start"].transform(min)
+        aligned_subsegments["start"] = interpolate_nans(aligned_subsegments["start"], method=interpolate_method)
-        segments_arr["subsegment-idx-end"] = seg_grp_dup["subsegment-idx-end"].transform(max)
+        aligned_subsegments["end"] = interpolate_nans(aligned_subsegments["end"], method=interpolate_method)
-        segments_arr.drop_duplicates(subset=["segment-idx", "subsegment-idx-start", "subsegment-idx-end"], inplace=True)
+        # concatenate sentences with same timestamps
-    else:
+        agg_dict = {"text": " ".join, "words": "sum"}
-        word_segments_arr.dropna(inplace=True)
+        if model_lang in LANGUAGES_WITHOUT_SPACES:
-        segments_arr.dropna(inplace=True)
+            agg_dict["text"] = "".join
+        if return_char_alignments:
+            agg_dict["chars"] = "sum"
+        aligned_subsegments= aligned_subsegments.groupby(["start", "end"], as_index=False).agg(agg_dict)
+        aligned_subsegments = aligned_subsegments.to_dict('records')
+        aligned_segments += aligned_subsegments
-    # if some segments still have missing timestamps (usually because all numerals / symbols), then use original timestamps...
+    # create word_segments list
-    segments_arr['start'].fillna(pd.Series([x['start'] for x in transcript]), inplace=True)
+    word_segments: List[SingleWordSegment] = []
-    segments_arr['end'].fillna(pd.Series([x['end'] for x in transcript]), inplace=True)
+    for segment in aligned_segments:
-    segments_arr['subsegment-idx-start'].fillna(0, inplace=True)
+        word_segments += segment["words"]
-    segments_arr['subsegment-idx-end'].fillna(1, inplace=True)
-    aligned_segments = []
-    aligned_segments_word = []
-    word_segments_arr.set_index(["segment-idx", "subsegment-idx"], inplace=True)
-    char_segments_arr.set_index(["segment-idx", "subsegment-idx", "word-idx"], inplace=True)
-    for sdx, srow in segments_arr.iterrows():
-        seg_idx = int(srow["segment-idx"])
-        sub_start = int(srow["subsegment-idx-start"])
-        sub_end = int(srow["subsegment-idx-end"])
-        seg = transcript[seg_idx]
-        text = "".join(seg["seg-text"][sub_start:sub_end])
-        wseg = word_segments_arr.loc[seg_idx].loc[sub_start:sub_end-1]
-        wseg["start"].fillna(srow["start"], inplace=True)
-        wseg["end"].fillna(srow["end"], inplace=True)
-        wseg["segment-text-start"].fillna(0, inplace=True)
-        wseg["segment-text-end"].fillna(len(text)-1, inplace=True)
-        cseg = char_segments_arr.loc[seg_idx].loc[sub_start:sub_end-1]
-        # fixes bug for single segment in transcript
-        cseg['segment-text-start'] = cseg['level_1'] if 'level_1' in cseg else 0
-        cseg['segment-text-end'] = cseg['level_1'] + 1 if 'level_1' in cseg else 1
-        if 'level_1' in cseg: del cseg['level_1']
-        if 'level_0' in cseg: del cseg['level_0']
-        cseg.reset_index(inplace=True)
-        aligned_segments.append(
-            {
-                "start": srow["start"],
-                "end": srow["end"],
-                "text": text,
-                "word-segments": wseg,
-                "char-segments": cseg
-            }
-        )
-        def get_raw_text(word_row):
-            return seg["seg-text"][word_row.name][int(word_row["segment-text-start"]):int(word_row["segment-text-end"])+1]
-        wdx = 0
-        curr_text = get_raw_text(wseg.iloc[wdx])
-        if len(wseg) > 1:
-            for _, wrow in wseg.iloc[1:].iterrows():
-                if wrow['start'] != wseg.iloc[wdx]['start']:
-                    aligned_segments_word.append(
-                        {
-                            "text": curr_text.strip(),
-                            "start": wseg.iloc[wdx]["start"],
-                            "end": wseg.iloc[wdx]["end"],
-                        }
-                    )
-                    curr_text = ""
-                curr_text += " " + get_raw_text(wrow)
-                wdx += 1
-        aligned_segments_word.append(
-            {
-                "text": curr_text.strip(),
-                "start": wseg.iloc[wdx]["start"],
-                "end": wseg.iloc[wdx]["end"]
-            }
-        )
-    return {"segments": aligned_segments, "word_segments": aligned_segments_word}
+    return {"segments": aligned_segments, "word_segments": word_segments}
 """
 source: https://pytorch.org/tutorials/intermediate/forced_alignment_with_torchaudio_tutorial.html
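The rewritten `align()` assigns a word index to every character so per-word timestamps can be grouped afterwards: for space-delimited languages the index advances at the last character of each word, while for languages without spaces ("ja", "zh") every character counts as its own word. A standalone sketch of that indexing rule (simplified from the loop in the diff; the `char_is_word` flag stands in for the `LANGUAGES_WITHOUT_SPACES` check and is an illustrative name, not part of the codebase):

```python
def char_word_indices(text: str, char_is_word: bool = False) -> list:
    # char_is_word=True mimics languages without spaces, where each
    # character is treated as a word of its own.
    indices = []
    word_idx = 0
    for cdx, char in enumerate(text):
        indices.append(word_idx)
        if char_is_word:
            word_idx += 1
        elif cdx == len(text) - 1 or text[cdx + 1] == " ":
            # advance at the last character of each space-delimited word
            word_idx += 1
    return indices

print(char_word_indices("ab cd"))  # [0, 0, 1, 1, 1]
```

Note that the space character inherits the index of the following word; as in the diff, space rows are filtered out later (`word_chars[word_chars["char"] != " "]`) before min/max timestamps are taken.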


@@ -1,433 +1,397 @@
-import os
 import warnings
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import List, NamedTuple, Optional, Union
+from dataclasses import replace
+import ctranslate2
+import faster_whisper
 import numpy as np
 import torch
-import tqdm
+from faster_whisper.tokenizer import Tokenizer
-import ffmpeg
+from faster_whisper.transcribe import TranscriptionOptions, get_ctranslate2_storage
-from whisper.audio import (
+from transformers import Pipeline
-    FRAMES_PER_SECOND,
+from transformers.pipelines.pt_utils import PipelineIterator
-    HOP_LENGTH,
-    N_FRAMES,
-    N_SAMPLES,
-    SAMPLE_RATE,
-    CHUNK_LENGTH,
-    log_mel_spectrogram,
-    pad_or_trim,
-    load_audio
-)
-from whisper.decoding import DecodingOptions, DecodingResult
-from whisper.timing import add_word_timestamps
-from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
-from whisper.utils import (
-    exact_div,
-    format_timestamp,
-    make_safe,
-)
-if TYPE_CHECKING:
+from .audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram
-    from whisper.model import Whisper
+from .types import SingleSegment, TranscriptionResult
-from .vad import VoiceActivitySegmentation, load_vad_model, merge_chunks
+from .vad import merge_chunks
-def transcribe(
+def find_numeral_symbol_tokens(tokenizer):
-    model: "Whisper",
+    numeral_symbol_tokens = []
-    audio: Union[str, np.ndarray, torch.Tensor] = None,
+    for i in range(tokenizer.eot):
-    mel: np.ndarray = None,
+        token = tokenizer.decode([i]).removeprefix(" ")
-    verbose: Optional[bool] = None,
+        has_numeral_symbol = any(c in "0123456789%" for c in token)
-    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+        if has_numeral_symbol:
-    compression_ratio_threshold: Optional[float] = 2.4,
+            numeral_symbol_tokens.append(i)
-    logprob_threshold: Optional[float] = -1.0,
+    return numeral_symbol_tokens
-    no_speech_threshold: Optional[float] = 0.6,
-    condition_on_previous_text: bool = True,
-    initial_prompt: Optional[str] = None,
-    word_timestamps: bool = False,
-    prepend_punctuations: str = "\"'“¿([{-",
-    append_punctuations: str = "\"'.。,!?::”)]}、",
-    **decode_options,
-):
-    """
-    Transcribe an audio file using Whisper.
-    We redefine the Whisper transcribe function to allow mel input (for sequential slicing of audio)
-    Parameters
+class WhisperModel(faster_whisper.WhisperModel):
-    ----------
+    '''
-    model: Whisper
+    FasterWhisperModel provides batched inference for faster-whisper.
-        The Whisper model instance
+    Currently only works in non-timestamp mode and fixed prompt for all samples in batch.
+    '''
-    audio: Union[str, np.ndarray, torch.Tensor]
+    def generate_segment_batched(
-        The path to the audio file to open, or the audio waveform
+        self,
+        features: np.ndarray,
-    mel: np.ndarray
+        tokenizer: Tokenizer,
-        Mel spectrogram of audio segment.
+        options: TranscriptionOptions,
+        encoder_output=None,
-    verbose: bool
-        Whether to display the text being decoded to the console. If True, displays all the details,
-        If False, displays minimal details. If None, does not display anything
-    temperature: Union[float, Tuple[float, ...]]
-        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
-        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
-    compression_ratio_threshold: float
-        If the gzip compression ratio is above this value, treat as failed
-    logprob_threshold: float
-        If the average log probability over sampled tokens is below this value, treat as failed
-    no_speech_threshold: float
-        If the no_speech probability is higher than this value AND the average log probability
-        over sampled tokens is below `logprob_threshold`, consider the segment as silent
-    condition_on_previous_text: bool
-        if True, the previous output of the model is provided as a prompt for the next window;
-        disabling may make the text inconsistent across windows, but the model becomes less prone to
-        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
-    word_timestamps: bool
-        Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
-        and include the timestamps for each word in each segment.
prepend_punctuations: str
If word_timestamps is True, merge these punctuation symbols with the next word
append_punctuations: str
If word_timestamps is True, merge these punctuation symbols with the previous word
initial_prompt: Optional[str]
Optional text to provide as a prompt for the first window. This can be used to provide, or
"prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
to make it more likely to predict those word correctly.
decode_options: dict
Keyword arguments to construct `DecodingOptions` instances
Returns
-------
A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
the spoken language ("language"), which is detected when `decode_options["language"]` is None.
"""
dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
if model.device == torch.device("cpu"):
if torch.cuda.is_available():
warnings.warn("Performing inference on CPU when CUDA is available")
if dtype == torch.float16:
warnings.warn("FP16 is not supported on CPU; using FP32 instead")
dtype = torch.float32
if dtype == torch.float32:
decode_options["fp16"] = False
# Pad 30-seconds of silence to the input audio, for slicing
if mel is None:
if audio is None:
raise ValueError("Transcribe needs either audio or mel as input, currently both are none.")
mel = log_mel_spectrogram(audio, padding=N_SAMPLES)
content_frames = mel.shape[-1] - N_FRAMES
if decode_options.get("language", None) is None:
if not model.is_multilingual:
decode_options["language"] = "en"
else:
if verbose:
print(
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
)
mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
_, probs = model.detect_language(mel_segment)
decode_options["language"] = max(probs, key=probs.get)
if verbose is not None:
print(
f"Detected language: {LANGUAGES[decode_options['language']].title()}"
)
language: str = decode_options["language"]
task: str = decode_options.get("task", "transcribe")
tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
if word_timestamps and task == "translate":
warnings.warn("Word-level timestamps on translations may not be reliable.")
def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
temperatures = (
[temperature] if isinstance(temperature, (int, float)) else temperature
)
decode_result = None
for t in temperatures:
kwargs = {**decode_options}
if t > 0:
# disable beam_size and patience when t > 0
kwargs.pop("beam_size", None)
kwargs.pop("patience", None)
else:
# disable best_of when t == 0
kwargs.pop("best_of", None)
options = DecodingOptions(**kwargs, temperature=t)
decode_result = model.decode(segment, options)
needs_fallback = False
if (
compression_ratio_threshold is not None
and decode_result.compression_ratio > compression_ratio_threshold
): ):
needs_fallback = True # too repetitive batch_size = features.shape[0]
if (
logprob_threshold is not None
and decode_result.avg_logprob < logprob_threshold
):
needs_fallback = True # average log probability is too low
if not needs_fallback:
break
return decode_result
seek = 0
input_stride = exact_div(
N_FRAMES, model.dims.n_audio_ctx
) # mel frames per output token: 2
time_precision = (
input_stride * HOP_LENGTH / SAMPLE_RATE
) # time per output token: 0.02 (seconds)
all_tokens = [] all_tokens = []
all_segments = []
prompt_reset_since = 0 prompt_reset_since = 0
if options.initial_prompt is not None:
if initial_prompt is not None: initial_prompt = " " + options.initial_prompt.strip()
initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) initial_prompt_tokens = tokenizer.encode(initial_prompt)
all_tokens.extend(initial_prompt_tokens) all_tokens.extend(initial_prompt_tokens)
else: previous_tokens = all_tokens[prompt_reset_since:]
initial_prompt_tokens = [] prompt = self.get_prompt(
tokenizer,
previous_tokens,
without_timestamps=options.without_timestamps,
prefix=options.prefix,
)
def new_segment( encoder_output = self.encode(features)
*, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
max_initial_timestamp_index = int(
round(options.max_initial_timestamp / self.time_precision)
)
result = self.model.generate(
encoder_output,
[prompt] * batch_size,
beam_size=options.beam_size,
patience=options.patience,
length_penalty=options.length_penalty,
max_length=self.max_length,
suppress_blank=options.suppress_blank,
suppress_tokens=options.suppress_tokens,
)
tokens_batch = [x.sequences_ids[0] for x in result]
def decode_batch(tokens: List[List[int]]) -> str:
res = []
for tk in tokens:
res.append([token for token in tk if token < tokenizer.eot])
# text_tokens = [token for token in tokens if token < self.eot]
return tokenizer.tokenizer.decode_batch(res)
text = decode_batch(tokens_batch)
return text
def encode(self, features: np.ndarray) -> ctranslate2.StorageView:
# When the model is running on multiple GPUs, the encoder output should be moved
# to the CPU since we don't know which GPU will handle the next job.
to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
# unsqueeze if batch size = 1
if len(features.shape) == 2:
features = np.expand_dims(features, 0)
features = get_ctranslate2_storage(features)
return self.model.encode(features, to_cpu=to_cpu)
class FasterWhisperPipeline(Pipeline):
"""
Huggingface Pipeline wrapper for FasterWhisperModel.
"""
# TODO:
# - add support for timestamp mode
# - add support for custom inference kwargs
def __init__(
self,
model: WhisperModel,
vad: VoiceActivitySegmentation,
vad_params: dict,
options: TranscriptionOptions,
tokenizer: Optional[Tokenizer] = None,
device: Union[int, str, "torch.device"] = -1,
framework="pt",
language: Optional[str] = None,
suppress_numerals: bool = False,
**kwargs,
): ):
tokens = tokens.tolist() self.model = model
text_tokens = [token for token in tokens if token < tokenizer.eot] self.tokenizer = tokenizer
return { self.options = options
"seek": seek, self.preset_language = language
"start": start, self.suppress_numerals = suppress_numerals
"end": end, self._batch_size = kwargs.pop("batch_size", None)
"text": tokenizer.decode(text_tokens), self._num_workers = 1
"tokens": tokens, self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
"temperature": result.temperature, self.call_count = 0
"avg_logprob": result.avg_logprob, self.framework = framework
"compression_ratio": result.compression_ratio, if self.framework == "pt":
"no_speech_prob": result.no_speech_prob, if isinstance(device, torch.device):
} self.device = device
elif isinstance(device, str):
self.device = torch.device(device)
# show the progress bar when verbose is False (if True, transcribed text will be printed) elif device < 0:
with tqdm.tqdm( self.device = torch.device("cpu")
total=content_frames, unit="frames", disable=verbose is not False
) as pbar:
while seek < content_frames:
time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
mel_segment = mel[:, seek : seek + N_FRAMES]
segment_size = min(N_FRAMES, content_frames - seek)
segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
decode_options["prompt"] = all_tokens[prompt_reset_since:]
result: DecodingResult = decode_with_fallback(mel_segment)
tokens = torch.tensor(result.tokens)
if no_speech_threshold is not None:
# no voice activity check
should_skip = result.no_speech_prob > no_speech_threshold
if (
logprob_threshold is not None
and result.avg_logprob > logprob_threshold
):
# don't skip if the logprob is high enough, despite the no_speech_prob
should_skip = False
if should_skip:
seek += segment_size # fast-forward to the next segment boundary
continue
previous_seek = seek
current_segments = []
timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
consecutive.add_(1)
if len(consecutive) > 0:
# if the output contains two consecutive timestamp tokens
slices = consecutive.tolist()
if single_timestamp_ending:
slices.append(len(tokens))
last_slice = 0
for current_slice in slices:
sliced_tokens = tokens[last_slice:current_slice]
start_timestamp_pos = (
sliced_tokens[0].item() - tokenizer.timestamp_begin
)
end_timestamp_pos = (
sliced_tokens[-1].item() - tokenizer.timestamp_begin
)
# clamp end-time to at least be 1 frame after start-time
end_timestamp_pos = max(end_timestamp_pos, start_timestamp_pos + time_precision)
current_segments.append(
new_segment(
start=time_offset + start_timestamp_pos * time_precision,
end=time_offset + end_timestamp_pos * time_precision,
tokens=sliced_tokens,
result=result,
)
)
last_slice = current_slice
if single_timestamp_ending:
# single timestamp at the end means no speech after the last timestamp.
seek += segment_size
else: else:
# otherwise, ignore the unfinished segment and seek to the last timestamp self.device = torch.device(f"cuda:{device}")
last_timestamp_pos = (
tokens[last_slice - 1].item() - tokenizer.timestamp_begin
)
seek += last_timestamp_pos * input_stride
else: else:
duration = segment_duration self.device = device
timestamps = tokens[timestamp_tokens.nonzero().flatten()]
if ( super(Pipeline, self).__init__()
len(timestamps) > 0 self.vad_model = vad
and timestamps[-1].item() != tokenizer.timestamp_begin self._vad_params = vad_params
def _sanitize_parameters(self, **kwargs):
preprocess_kwargs = {}
if "tokenizer" in kwargs:
preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
return preprocess_kwargs, {}, {}
def preprocess(self, audio):
audio = audio['inputs']
model_n_mels = self.model.feat_kwargs.get("feature_size")
features = log_mel_spectrogram(
audio,
n_mels=model_n_mels if model_n_mels is not None else 80,
padding=N_SAMPLES - audio.shape[0],
)
return {'inputs': features}
def _forward(self, model_inputs):
outputs = self.model.generate_segment_batched(model_inputs['inputs'], self.tokenizer, self.options)
return {'text': outputs}
def postprocess(self, model_outputs):
return model_outputs
def get_iterator(
self,
inputs,
num_workers: int,
batch_size: int,
preprocess_params: dict,
forward_params: dict,
postprocess_params: dict,
): ):
# no consecutive timestamps but it has a timestamp; use the last one. dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
last_timestamp_pos = ( if "TOKENIZERS_PARALLELISM" not in os.environ:
timestamps[-1].item() - tokenizer.timestamp_begin os.environ["TOKENIZERS_PARALLELISM"] = "false"
# TODO hack by collating feature_extractor and image_processor
def stack(items):
return {'inputs': torch.stack([x['inputs'] for x in items])}
dataloader = torch.utils.data.DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=stack)
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
return final_iterator
def transcribe(
self,
audio: Union[str, np.ndarray],
batch_size: Optional[int] = None,
num_workers=0,
language: Optional[str] = None,
task: Optional[str] = None,
chunk_size=30,
print_progress=False,
combined_progress=False,
verbose=False,
) -> TranscriptionResult:
if isinstance(audio, str):
audio = load_audio(audio)
def data(audio, segments):
for seg in segments:
f1 = int(seg['start'] * SAMPLE_RATE)
f2 = int(seg['end'] * SAMPLE_RATE)
# print(f2-f1)
yield {'inputs': audio[f1:f2]}
vad_segments = self.vad_model({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": SAMPLE_RATE})
vad_segments = merge_chunks(
vad_segments,
chunk_size,
onset=self._vad_params["vad_onset"],
offset=self._vad_params["vad_offset"],
) )
duration = last_timestamp_pos * time_precision if self.tokenizer is None:
language = language or self.detect_language(audio)
current_segments.append( task = task or "transcribe"
new_segment( self.tokenizer = Tokenizer(
start=time_offset, self.model.hf_tokenizer,
end=time_offset + duration, self.model.model.is_multilingual,
tokens=tokens, task=task,
result=result, language=language,
) )
) else:
seek += segment_size language = language or self.tokenizer.language_code
task = task or self.tokenizer.task
if not condition_on_previous_text or result.temperature > 0.5: if task != self.tokenizer.task or language != self.tokenizer.language_code:
# do not feed the prompt tokens if a high temperature was used self.tokenizer = Tokenizer(
prompt_reset_since = len(all_tokens) self.model.hf_tokenizer,
self.model.model.is_multilingual,
if word_timestamps: task=task,
add_word_timestamps(
segments=current_segments,
model=model,
tokenizer=tokenizer,
mel=mel_segment,
num_frames=segment_size,
prepend_punctuations=prepend_punctuations,
append_punctuations=append_punctuations,
)
word_end_timestamps = [
w["end"] for s in current_segments for w in s["words"]
]
if not single_timestamp_ending and len(word_end_timestamps) > 0:
seek_shift = round(
(word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
)
if seek_shift > 0:
seek = previous_seek + seek_shift
if verbose:
for segment in current_segments:
start, end, text = segment["start"], segment["end"], segment["text"]
line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
print(make_safe(line))
# if a segment is instantaneous or does not contain text, clear it
for i, segment in enumerate(current_segments):
if segment["start"] == segment["end"] or segment["text"].strip() == "":
segment["text"] = ""
segment["tokens"] = []
segment["words"] = []
all_segments.extend(
[
{"id": i, **segment}
for i, segment in enumerate(
current_segments, start=len(all_segments)
)
]
)
all_tokens.extend(
[token for segment in current_segments for token in segment["tokens"]]
)
# update progress bar
pbar.update(min(content_frames, seek) - previous_seek)
return dict(
text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
segments=all_segments,
language=language, language=language,
) )
if self.suppress_numerals:
previous_suppress_tokens = self.options.suppress_tokens
numeral_symbol_tokens = find_numeral_symbol_tokens(self.tokenizer)
print(f"Suppressing numeral and symbol tokens")
new_suppressed_tokens = numeral_symbol_tokens + self.options.suppress_tokens
new_suppressed_tokens = list(set(new_suppressed_tokens))
self.options = replace(self.options, suppress_tokens=new_suppressed_tokens)
def transcribe_with_vad( segments: List[SingleSegment] = []
model: "Whisper", batch_size = batch_size or self._batch_size
audio: str, total_segments = len(vad_segments)
vad_pipeline, for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
mel = None, if print_progress:
verbose: Optional[bool] = None, base_progress = ((idx + 1) / total_segments) * 100
**kwargs percent_complete = base_progress / 2 if combined_progress else base_progress
): print(f"Progress: {percent_complete:.2f}%...")
""" text = out['text']
Transcribe per VAD segment if batch_size in [0, 1, None]:
""" text = text[0]
vad_segments = vad_pipeline(audio)
# if not torch.is_tensor(audio):
# if isinstance(audio, str):
audio = load_audio(audio)
audio = torch.from_numpy(audio)
prev = 0
output = {"segments": []}
# merge segments to approx 30s inputs to make whisper most appropraite
vad_segments = merge_chunks(vad_segments, chunk_size=CHUNK_LENGTH)
if len(vad_segments) == 0:
return output
print(">>Performing transcription...")
for sdx, seg_t in enumerate(vad_segments):
if verbose: if verbose:
print(f"~~ Transcribing VAD chunk: ({format_timestamp(seg_t['start'])} --> {format_timestamp(seg_t['end'])}) ~~") print(f"Transcript: [{round(vad_segments[idx]['start'], 3)} --> {round(vad_segments[idx]['end'], 3)}] {text}")
seg_f_start, seg_f_end = int(seg_t["start"] * SAMPLE_RATE), int(seg_t["end"] * SAMPLE_RATE) segments.append(
local_f_start, local_f_end = seg_f_start - prev, seg_f_end - prev
audio = audio[local_f_start:] # seek forward
seg_audio = audio[:local_f_end-local_f_start] # seek forward
prev = seg_f_start
local_mel = log_mel_spectrogram(seg_audio, padding=N_SAMPLES)
# need to pad
result = transcribe(model, audio, mel=local_mel, verbose=verbose, **kwargs)
seg_t["text"] = result["text"]
output["segments"].append(
{ {
"start": seg_t["start"], "text": text,
"end": seg_t["end"], "start": round(vad_segments[idx]['start'], 3),
"language": result["language"], "end": round(vad_segments[idx]['end'], 3)
"text": result["text"],
"seg-text": [x["text"] for x in result["segments"]],
"seg-start": [x["start"] for x in result["segments"]],
"seg-end": [x["end"] for x in result["segments"]],
} }
) )
output["language"] = output["segments"][0]["language"] # revert the tokenizer if multilingual inference is enabled
if self.preset_language is None:
self.tokenizer = None
return output # revert suppressed tokens if suppress_numerals is enabled
if self.suppress_numerals:
self.options = replace(self.options, suppress_tokens=previous_suppress_tokens)
return {"segments": segments, "language": language}
def detect_language(self, audio: np.ndarray) -> str:
if audio.shape[0] < N_SAMPLES:
print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
model_n_mels = self.model.feat_kwargs.get("feature_size")
segment = log_mel_spectrogram(audio[: N_SAMPLES],
n_mels=model_n_mels if model_n_mels is not None else 80,
padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
encoder_output = self.model.encode(segment)
results = self.model.model.detect_language(encoder_output)
language_token, language_probability = results[0][0]
language = language_token[2:-2]
print(f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio...")
return language
def load_model(
whisper_arch: str,
device: str,
device_index=0,
compute_type="float16",
asr_options: Optional[dict] = None,
language: Optional[str] = None,
vad_model: Optional[VoiceActivitySegmentation] = None,
vad_options: Optional[dict] = None,
model: Optional[WhisperModel] = None,
task="transcribe",
download_root: Optional[str] = None,
local_files_only=False,
threads=4,
) -> FasterWhisperPipeline:
"""Load a Whisper model for inference.
Args:
whisper_arch - The name of the Whisper model to load.
device - The device to load the model on.
compute_type - The compute type to use for the model.
options - A dictionary of options to use for the model.
language - The language of the model. (use English for now)
model - The WhisperModel instance to use.
download_root - The root directory to download the model to.
local_files_only - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
threads - The number of cpu threads to use per worker, e.g. will be multiplied by num workers.
Returns:
A Whisper pipeline.
"""
if whisper_arch.endswith(".en"):
language = "en"
model = model or WhisperModel(whisper_arch,
device=device,
device_index=device_index,
compute_type=compute_type,
download_root=download_root,
local_files_only=local_files_only,
cpu_threads=threads)
if language is not None:
tokenizer = Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
else:
print("No language specified, language will be first be detected for each audio file (increases inference time).")
tokenizer = None
default_asr_options = {
"beam_size": 5,
"best_of": 5,
"patience": 1,
"length_penalty": 1,
"repetition_penalty": 1,
"no_repeat_ngram_size": 0,
"temperatures": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.6,
"condition_on_previous_text": False,
"prompt_reset_on_temperature": 0.5,
"initial_prompt": None,
"prefix": None,
"suppress_blank": True,
"suppress_tokens": [-1],
"without_timestamps": True,
"max_initial_timestamp": 0.0,
"word_timestamps": False,
"prepend_punctuations": "\"'“¿([{-",
"append_punctuations": "\"'.。,!?::”)]}、",
"multilingual": model.model.is_multilingual,
"suppress_numerals": False,
"max_new_tokens": None,
"clip_timestamps": None,
"hallucination_silence_threshold": None,
"hotwords": None,
}
if asr_options is not None:
default_asr_options.update(asr_options)
suppress_numerals = default_asr_options["suppress_numerals"]
del default_asr_options["suppress_numerals"]
default_asr_options = TranscriptionOptions(**default_asr_options)
default_vad_options = {
"vad_onset": 0.500,
"vad_offset": 0.363
}
if vad_options is not None:
default_vad_options.update(vad_options)
if vad_model is not None:
vad_model = vad_model
else:
vad_model = load_vad_model(torch.device(device), use_auth_token=None, **default_vad_options)
return FasterWhisperPipeline(
model=model,
vad=vad_model,
options=default_asr_options,
tokenizer=tokenizer,
language=language,
suppress_numerals=suppress_numerals,
vad_params=default_vad_options,
)
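`transcribe()` above temporarily patches the immutable `TranscriptionOptions` object with `dataclasses.replace` when `suppress_numerals` is set, and restores the original token list on exit. A minimal sketch of that copy-on-write pattern on a stand-in dataclass (`Options` here is illustrative, not the faster-whisper type):

```python
from dataclasses import dataclass, replace

# Stand-in for faster-whisper's options object (illustrative only).
@dataclass(frozen=True)
class Options:
    beam_size: int
    suppress_tokens: list

opts = Options(beam_size=5, suppress_tokens=[-1])

# Patch: build a copy with extra suppressed tokens; the original stays intact,
# so it can simply be swapped back in afterwards.
numeral_tokens = [100, 101]
patched = replace(opts, suppress_tokens=sorted(set(opts.suppress_tokens + numeral_tokens)))

print(patched.suppress_tokens)  # [-1, 100, 101]
print(opts.suppress_tokens)     # [-1]
```

Because `replace` returns a new instance, reverting is just reassigning the saved original, which is exactly what the pipeline does after the batch loop.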

(two binary files not shown)

whisperx/audio.py — new file (+159 lines)
@@ -0,0 +1,159 @@
import os
import subprocess
from functools import lru_cache
from typing import Optional, Union
import numpy as np
import torch
import torch.nn.functional as F
from .utils import exact_div
# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions have stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
"""
Open an audio file and read as mono waveform, resampling as necessary
Parameters
----------
file: str
The audio file to open
sr: int
The sample rate to resample the audio if necessary
Returns
-------
A NumPy array containing the audio waveform, in float32 dtype.
"""
try:
# Launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI to be installed.
cmd = [
"ffmpeg",
"-nostdin",
"-threads",
"0",
"-i",
file,
"-f",
"s16le",
"-ac",
"1",
"-acodec",
"pcm_s16le",
"-ar",
str(sr),
"-",
]
out = subprocess.run(cmd, capture_output=True, check=True).stdout
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
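`load_audio` asks ffmpeg for raw little-endian signed 16-bit PCM, then rescales it by 1/32768 into floats in [-1.0, 1.0). The same conversion, sketched with only the standard library for illustration (no NumPy):

```python
import struct

def pcm_s16le_to_float(raw: bytes) -> list:
    """Decode little-endian signed 16-bit PCM bytes into floats,
    mirroring np.frombuffer(out, np.int16).astype(np.float32) / 32768.0."""
    count = len(raw) // 2
    samples = struct.unpack("<%dh" % count, raw)  # "<h" = little-endian int16
    return [s / 32768.0 for s in samples]

# 0x0000 -> 0, 0x8000 -> -32768 (int16 minimum, full-scale negative)
print(pcm_s16le_to_float(b"\x00\x00\x00\x80"))  # [0.0, -1.0]
```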
def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
"""
Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
"""
if torch.is_tensor(array):
if array.shape[axis] > length:
array = array.index_select(
dim=axis, index=torch.arange(length, device=array.device)
)
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
pad_widths[axis] = (0, length - array.shape[axis])
array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
else:
if array.shape[axis] > length:
array = array.take(indices=range(length), axis=axis)
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
pad_widths[axis] = (0, length - array.shape[axis])
array = np.pad(array, pad_widths)
return array
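`pad_or_trim` guarantees that every chunk exactly fills the encoder's fixed window. A pure-Python sketch of the same 1-D behavior (illustrative; the implementation above also handles tensors and arbitrary axes):

```python
def pad_or_trim_list(samples, length):
    """Trim to `length`, or right-pad with zeros, so every chunk
    matches the fixed window size the encoder expects."""
    if len(samples) > length:
        return samples[:length]
    return samples + [0] * (length - len(samples))

print(pad_or_trim_list([1, 2, 3, 4, 5], 3))  # [1, 2, 3]
print(pad_or_trim_list([1, 2], 4))           # [1, 2, 0, 0]
```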
@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
"""
Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
Allows decoupling the librosa dependency; saved using:
np.savez_compressed(
"mel_filters.npz",
mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
)
"""
assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
with np.load(
os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
) as f:
return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
def log_mel_spectrogram(
audio: Union[str, np.ndarray, torch.Tensor],
n_mels: int,
padding: int = 0,
device: Optional[Union[str, torch.device]] = None,
):
"""
Compute the log-Mel spectrogram of the given audio.
Parameters
----------
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
n_mels: int
The number of Mel-frequency filters, only 80 and 128 are supported
padding: int
Number of zero samples to pad to the right
device: Optional[Union[str, torch.device]]
If given, the audio tensor is moved to this device before STFT
Returns
-------
torch.Tensor, shape = (n_mels, n_frames)
A Tensor that contains the Mel spectrogram
"""
if not torch.is_tensor(audio):
if isinstance(audio, str):
audio = load_audio(audio)
audio = torch.from_numpy(audio)
if device is not None:
audio = audio.to(device)
if padding > 0:
audio = F.pad(audio, (0, padding))
window = torch.hann_window(N_FFT).to(audio.device)
stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2
filters = mel_filters(audio.device, n_mels)
mel_spec = filters @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
log_spec = (log_spec + 4.0) / 4.0
return log_spec
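The last three lines of `log_mel_spectrogram` clamp the mel power spectrum at 1e-10 before the log, floor every value at 8 log-decades below the maximum, and rescale into roughly [-1, 1]. A scalar sketch of that normalization (the helper name is illustrative):

```python
import math

def normalize_log_mel(powers):
    """Sketch of the final normalization in log_mel_spectrogram:
    log10 with a 1e-10 floor, clamp to (max - 8), then map via (v + 4) / 4."""
    logs = [math.log10(max(p, 1e-10)) for p in powers]
    top = max(logs)
    logs = [max(v, top - 8.0) for v in logs]       # dynamic-range floor
    return [(v + 4.0) / 4.0 for v in logs]          # rescale into ~[-1, 1]

# A power of 1.0 (log10 = 0) maps to 1.0; zero power hits the -8 floor.
print(normalize_log_mel([1.0, 0.0]))  # [1.0, -1.0]
```

The floor matters in practice: without it, silent frames would dominate the scale with arbitrarily negative log values.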

whisperx/conjunctions.py — new file (+47 lines)
@@ -0,0 +1,47 @@
# conjunctions.py
from typing import Set
conjunctions_by_language = {
'en': {'and', 'whether', 'or', 'as', 'but', 'so', 'for', 'nor', 'which', 'yet', 'although', 'since', 'unless', 'when', 'while', 'because', 'if', 'how', 'that', 'than', 'who', 'where', 'what', 'near', 'before', 'after', 'across', 'through', 'until', 'once', 'whereas', 'even', 'both', 'either', 'neither', 'though'},
'fr': {'et', 'ou', 'mais', 'parce', 'bien', 'pendant', 'quand', '', 'comme', 'si', 'que', 'avant', 'après', 'aussitôt', 'jusquà', 'à', 'malgré', 'donc', 'tant', 'puisque', 'ni', 'soit', 'bien', 'encore', 'dès', 'lorsque'},
'de': {'und', 'oder', 'aber', 'weil', 'obwohl', 'während', 'wenn', 'wo', 'wie', 'dass', 'bevor', 'nachdem', 'sobald', 'bis', 'außer', 'trotzdem', 'also', 'sowie', 'indem', 'weder', 'sowohl', 'zwar', 'jedoch'},
'es': {'y', 'o', 'pero', 'porque', 'aunque', 'sin', 'mientras', 'cuando', 'donde', 'como', 'si', 'que', 'antes', 'después', 'tan', 'hasta', 'a', 'a', 'por', 'ya', 'ni', 'sino'},
'it': {'e', 'o', 'ma', 'perché', 'anche', 'mentre', 'quando', 'dove', 'come', 'se', 'che', 'prima', 'dopo', 'appena', 'fino', 'a', 'nonostante', 'quindi', 'poiché', '', 'ossia', 'cioè'},
'ja': {'そして', 'または', 'しかし', 'なぜなら', 'もし', 'それとも', 'だから', 'それに', 'なのに', 'そのため', 'かつ', 'それゆえに', 'ならば', 'もしくは', 'ため'},
'zh': {'', '', '但是', '因为', '任何', '', '虽然', '而且', '所以', '如果', '除非', '尽管', '既然', '即使', '只要', '直到', '然后', '因此', '不但', '而是', '不过'},
'nl': {'en', 'of', 'maar', 'omdat', 'hoewel', 'terwijl', 'wanneer', 'waar', 'zoals', 'als', 'dat', 'voordat', 'nadat', 'zodra', 'totdat', 'tenzij', 'ondanks', 'dus', 'zowel', 'noch', 'echter', 'toch'},
'uk': {'та', 'або', 'але', 'тому', 'хоча', 'поки', 'бо', 'коли', 'де', 'як', 'якщо', 'що', 'перш', 'після', 'доки', 'незважаючи', 'тому', 'ані'},
'pt': {'e', 'ou', 'mas', 'porque', 'embora', 'enquanto', 'quando', 'onde', 'como', 'se', 'que', 'antes', 'depois', 'assim', 'até', 'a', 'apesar', 'portanto', '', 'pois', 'nem', 'senão'},
'ar': {'و', 'أو', 'لكن', 'لأن', 'مع', 'بينما', 'عندما', 'حيث', 'كما', 'إذا', 'الذي', 'قبل', 'بعد', 'فور', 'حتى', 'إلا', 'رغم', 'لذلك', 'بما'},
'cs': {'a', 'nebo', 'ale', 'protože', 'ačkoli', 'zatímco', 'když', 'kde', 'jako', 'pokud', 'že', 'než', 'poté', 'jakmile', 'dokud', 'pokud ne', 'navzdory', 'tak', 'stejně', 'ani', 'tudíž'},
'ru': {'и', 'или', 'но', 'потому', 'хотя', 'пока', 'когда', 'где', 'как', 'если', 'что', 'перед', 'после', 'несмотря', 'таким', 'также', 'ни', 'зато'},
'pl': {'i', 'lub', 'ale', 'ponieważ', 'chociaż', 'podczas', 'kiedy', 'gdzie', 'jak', 'jeśli', 'że', 'zanim', 'po', 'jak tylko', 'dopóki', 'chyba', 'pomimo', 'więc', 'tak', 'ani', 'czyli'},
'hu': {'és', 'vagy', 'de', 'mert', 'habár', 'míg', 'amikor', 'ahol', 'ahogy', 'ha', 'hogy', 'mielőtt', 'miután', 'amint', 'amíg', 'hacsak', 'ellenére', 'tehát', 'úgy', 'sem', 'vagyis'},
'fi': {'ja', 'tai', 'mutta', 'koska', 'vaikka', 'kun', 'missä', 'kuten', 'jos', 'että', 'ennen', 'sen jälkeen', 'heti', 'kunnes', 'ellei', 'huolimatta', 'siis', 'sekä', 'eikä', 'vaan'},
'fa': {'و', 'یا', 'اما', 'چون', 'اگرچه', 'در حالی', 'وقتی', 'کجا', 'چگونه', 'اگر', 'که', 'قبل', 'پس', 'به محض', 'تا زمانی', 'مگر', 'با وجود', 'پس', 'همچنین', 'نه'},
'el': {'και', 'ή', 'αλλά', 'επειδή', 'αν', 'ενώ', 'όταν', 'όπου', 'όπως', 'αν', 'που', 'προτού', 'αφού', 'μόλις', 'μέχρι', 'εκτός', 'παρά', 'έτσι', 'όπως', 'ούτε', 'δηλαδή'},
'tr': {'ve', 'veya', 'ama', 'çünkü', 'her ne', 'iken', 'nerede', 'nasıl', 'eğer', 'ki', 'önce', 'sonra', 'hemen', 'kadar', 'rağmen', 'hem', 'ne', 'yani'},
'da': {'og', 'eller', 'men', 'fordi', 'selvom', 'mens', 'når', 'hvor', 'som', 'hvis', 'at', 'før', 'efter', 'indtil', 'medmindre', 'således', 'ligesom', 'hverken', 'altså'},
'he': {'ו', 'או', 'אבל', 'כי', 'אף', 'בזמן', 'כאשר', 'היכן', 'כיצד', 'אם', 'ש', 'לפני', 'אחרי', 'ברגע', 'עד', 'אלא', 'למרות', 'לכן', 'כמו', 'לא', 'אז'},
'vi': {'', 'hoặc', 'nhưng', 'bởi', 'mặc', 'trong', 'khi', '', 'như', 'nếu', 'rằng', 'trước', 'sau', 'ngay', 'cho', 'trừ', 'mặc', '', 'giống', 'cũng', 'tức'},
'ko': {'그리고', '또는','그런데','그래도', '이나', '결국', '마지막으로', '마찬가지로', '반면에', '아니면', '거나', '또는', '그럼에도', '그렇기', '때문에', '덧붙이자면', '게다가', '그러나', '', '그래서', '', '한다면', '하지만', '무엇', '왜냐하면', '비록', '동안', '언제', '어디서', '어떻게', '만약', '', '전에', '후에', '즉시', '까지', '아니라면', '불구하고', '따라서', '같은', ''},
'ur': {'اور', 'یا', 'مگر', 'کیونکہ', 'اگرچہ', 'جبکہ', 'جب', 'کہاں', 'کس طرح', 'اگر', 'کہ', 'سے پہلے', 'کے بعد', 'جیسے ہی', 'تک', 'اگر نہیں تو', 'کے باوجود', 'اس لئے', 'جیسے', 'نہ'},
'hi': {'और', 'या', 'पर', 'तो', '', 'फिर', 'हालांकि', 'चूंकि', 'अगर', 'कैसे', 'वह', 'से', 'जो', 'जहां', 'क्या', 'नजदीक', 'पहले', 'बाद', 'के', 'पार', 'माध्यम', 'तक', 'एक', 'जबकि', 'यहां', 'तक', 'दोनों', 'या', '', 'हालांकि'}
}
commas_by_language = {
'ja': '',
'zh': '',
'fa': '،',
'ur': '،'
}
def get_conjunctions(lang_code: str) -> Set[str]:
    return conjunctions_by_language.get(lang_code, set())


def get_comma(lang_code: str) -> str:
    return commas_by_language.get(lang_code, ",")
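The two lookup helpers can be exercised in isolation. The sketch below copies a few entries from the tables above (abridged for brevity) to show the fallback behaviour for unknown language codes:

```python
from typing import Set

# Abridged copies of the lookup tables above, for illustration only.
conjunctions_by_language = {
    'tr': {'ve', 'veya', 'ama'},
    'da': {'og', 'eller', 'men'},
}
commas_by_language = {
    'fa': '،',
    'ur': '،',
}

def get_conjunctions(lang_code: str) -> Set[str]:
    # Unknown languages fall back to an empty set rather than raising.
    return conjunctions_by_language.get(lang_code, set())

def get_comma(lang_code: str) -> str:
    # Most languages use the ASCII comma; only a few override it.
    return commas_by_language.get(lang_code, ",")

print(get_conjunctions('tr'))  # abridged Turkish set
print(get_comma('fa'))         # Arabic comma
print(get_comma('en'))         # default ","
```

The `.get(..., default)` pattern keeps callers simple: sentence-splitting code never has to special-case languages without an entry.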

whisperx/diarize.py

@ -1,73 +1,82 @@
import numpy as np
import pandas as pd
from pyannote.audio import Pipeline
from typing import Optional, Union
import torch

from .audio import load_audio, SAMPLE_RATE
from .types import TranscriptionResult, AlignedTranscriptionResult


class DiarizationPipeline:
    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.1",
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        if isinstance(device, str):
            device = torch.device(device)
        self.model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token).to(device)

    def __call__(
        self,
        audio: Union[str, np.ndarray],
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df


def assign_word_speakers(
    diarize_df: pd.DataFrame,
    transcript_result: Union[AlignedTranscriptionResult, TranscriptionResult],
    fill_nearest=False,
) -> dict:
    transcript_segments = transcript_result["segments"]
    for seg in transcript_segments:
        # assign speaker to segment (if any)
        diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'], seg['start'])
        diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
        # remove no hit, otherwise we look for closest (even negative intersection...)
        if not fill_nearest:
            dia_tmp = diarize_df[diarize_df['intersection'] > 0]
        else:
            dia_tmp = diarize_df
        if len(dia_tmp) > 0:
            # sum over speakers
            speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
            seg["speaker"] = speaker

        # assign speaker to words
        if 'words' in seg:
            for word in seg['words']:
                if 'start' in word:
                    diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(diarize_df['start'], word['start'])
                    diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'], word['start'])
                    # remove no hit
                    if not fill_nearest:
                        dia_tmp = diarize_df[diarize_df['intersection'] > 0]
                    else:
                        dia_tmp = diarize_df
                    if len(dia_tmp) > 0:
                        # sum over speakers
                        speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
                        word["speaker"] = speaker

    return transcript_result


class Segment:
    def __init__(self, start, end, speaker=None):
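The core of `assign_word_speakers` is a maximum-temporal-overlap vote: each segment or word is assigned the diarization speaker whose turns overlap it the most. This can be isolated into a small, self-contained sketch; the toy `diarize_df` and the `pick_speaker` helper name are illustrative, not part of whisperX:

```python
import numpy as np
import pandas as pd

# Toy diarization output: two speakers with known time ranges (seconds).
diarize_df = pd.DataFrame({
    'start':   [0.0, 5.0],
    'end':     [5.0, 10.0],
    'speaker': ['SPEAKER_00', 'SPEAKER_01'],
})

def pick_speaker(diarize_df: pd.DataFrame, start: float, end: float):
    # Temporal intersection of [start, end] with every diarization turn,
    # mirroring the overlap computation in assign_word_speakers above.
    intersection = np.minimum(diarize_df['end'], end) - np.maximum(diarize_df['start'], start)
    hits = diarize_df.assign(intersection=intersection)
    hits = hits[hits['intersection'] > 0]  # discard turns with no overlap
    if len(hits) == 0:
        return None
    # The speaker with the largest summed overlap wins.
    return hits.groupby('speaker')['intersection'].sum().sort_values(ascending=False).index[0]

print(pick_speaker(diarize_df, 1.0, 2.0))  # fully inside SPEAKER_00's turn
print(pick_speaker(diarize_df, 4.0, 7.0))  # straddles both; SPEAKER_01 overlaps more
```

Summing overlaps per speaker (rather than taking the single best-overlapping turn) makes the assignment robust when one utterance spans several short turns by the same speaker.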

whisperx/transcribe.py

@ -1,37 +1,39 @@
import argparse
import gc
import os
import warnings

import numpy as np
import torch

from .alignment import align, load_align_model
from .asr import load_model
from .audio import load_audio
from .diarize import DiarizationPipeline, assign_word_speakers
from .types import AlignedTranscriptionResult, TranscriptionResult
from .utils import (
    LANGUAGES,
    TO_LANGUAGE_CODE,
    get_writer,
    optional_float,
    optional_int,
    str2bool,
)


def cli():
    # fmt: off
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--device_index", default=0, type=int, help="device index to use for FasterWhisper inference")
    parser.add_argument("--batch_size", default=8, type=int, help="the preferred batch size for inference")
    parser.add_argument("--compute_type", default="float16", type=str, choices=["float16", "float32", "int8"], help="compute type for computation")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json", "aud"], help="format of the output file; if not specified, all available formats will be produced")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")

@ -39,28 +41,29 @@ def cli():
    # alignment params
    parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
    parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
    parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")
    parser.add_argument("--return_char_alignments", action='store_true', help="Return character-level alignments in the output json file")

    # vad params
    parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
    parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
    parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.")

    # diarization params
    parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
    parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers in audio file")
    parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers in audio file")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=1.0, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=1.0, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--suppress_numerals", action="store_true", help="whether to suppress numeric symbols and currency symbols during sampling, since wav2vec2 cannot align them correctly")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

@ -69,63 +72,69 @@ def cli():
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of lines in a segment")
    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(not possible with --no_align) underline each word as it is spoken in srt and vtt")
    parser.add_argument("--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")

    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")

    parser.add_argument("--print_progress", type=str2bool, default=False, help="if True, progress will be printed in transcribe() and align() methods.")
    # fmt: on

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    batch_size: int = args.pop("batch_size")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    output_format: str = args.pop("output_format")
    device: str = args.pop("device")
    device_index: int = args.pop("device_index")
    compute_type: str = args.pop("compute_type")
    verbose: bool = args.pop("verbose")
    # model_flush: bool = args.pop("model_flush")
    os.makedirs(output_dir, exist_ok=True)

    align_model: str = args.pop("align_model")
    interpolate_method: str = args.pop("interpolate_method")
    no_align: bool = args.pop("no_align")
    task: str = args.pop("task")
    if task == "translate":
        # translation cannot be aligned
        no_align = True
    return_char_alignments: bool = args.pop("return_char_alignments")

    hf_token: str = args.pop("hf_token")
    vad_onset: float = args.pop("vad_onset")
    vad_offset: float = args.pop("vad_offset")
    chunk_size: int = args.pop("chunk_size")

    diarize: bool = args.pop("diarize")
    min_speakers: int = args.pop("min_speakers")
    max_speakers: int = args.pop("max_speakers")
    print_progress: bool = args.pop("print_progress")

    if args["language"] is not None:
        args["language"] = args["language"].lower()
        if args["language"] not in LANGUAGES:
            if args["language"] in TO_LANGUAGE_CODE:
                args["language"] = TO_LANGUAGE_CODE[args["language"]]
            else:
                raise ValueError(f"Unsupported language: {args['language']}")

    if model_name.endswith(".en") and args["language"] != "en":
        if args["language"] is not None:
            warnings.warn(
                f"{model_name} is an English-only model but received '{args['language']}'; using English instead."
            )
        args["language"] = "en"
    align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified

    temperature = args.pop("temperature")
    if (increment := args.pop("temperature_increment_on_fallback")) is not None:

@ -133,42 +142,56 @@ def cli():
    else:
        temperature = [temperature]

    faster_whisper_threads = 4
    if (threads := args.pop("threads")) > 0:
        torch.set_num_threads(threads)
        faster_whisper_threads = threads

    asr_options = {
        "beam_size": args.pop("beam_size"),
        "patience": args.pop("patience"),
        "length_penalty": args.pop("length_penalty"),
        "temperatures": temperature,
        "compression_ratio_threshold": args.pop("compression_ratio_threshold"),
        "log_prob_threshold": args.pop("logprob_threshold"),
        "no_speech_threshold": args.pop("no_speech_threshold"),
        "condition_on_previous_text": False,
        "initial_prompt": args.pop("initial_prompt"),
        "suppress_tokens": [int(x) for x in args.pop("suppress_tokens").split(",")],
        "suppress_numerals": args.pop("suppress_numerals"),
    }

    writer = get_writer(output_format, output_dir)
    word_options = ["highlight_words", "max_line_count", "max_line_width"]
    if no_align:
        for option in word_options:
            if args[option]:
                parser.error(f"--{option} not possible with --no_align")
    if args["max_line_count"] and not args["max_line_width"]:
        warnings.warn("--max_line_count has no effect without --max_line_width")
    writer_args = {arg: args.pop(arg) for arg in word_options}

    # Part 1: VAD & ASR Loop
    results = []
    tmp_results = []
    # model = load_model(model_name, device=device, download_root=model_dir)
    model = load_model(model_name, device=device, device_index=device_index, download_root=model_dir, compute_type=compute_type, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset}, task=task, threads=faster_whisper_threads)

    for audio_path in args.pop("audio"):
        audio = load_audio(audio_path)
        # >> VAD & ASR
        print(">>Performing transcription...")
        result: TranscriptionResult = model.transcribe(
            audio,
            batch_size=batch_size,
            chunk_size=chunk_size,
            print_progress=print_progress,
            verbose=verbose,
        )
        results.append((result, audio_path))

    # Unload Whisper and VAD
    del model
    gc.collect()
    torch.cuda.empty_cache()

@ -176,19 +199,33 @@ def cli():
    if not no_align:
        tmp_results = results
        results = []
        align_model, align_metadata = load_align_model(align_language, device, model_name=align_model)
        for result, audio_path in tmp_results:
            # >> Align
            if len(tmp_results) > 1:
                input_audio = audio_path
            else:
                # lazily load audio from part 1
                input_audio = audio

            if align_model is not None and len(result["segments"]) > 0:
                if result.get("language", "en") != align_metadata["language"]:
                    # load new language
                    print(f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language...")
                    align_model, align_metadata = load_align_model(result["language"], device)
                print(">>Performing alignment...")
                result: AlignedTranscriptionResult = align(
                    result["segments"],
                    align_model,
                    align_metadata,
                    input_audio,
                    device,
                    interpolate_method=interpolate_method,
                    return_char_alignments=return_char_alignments,
                    print_progress=print_progress,
                )
            results.append((result, audio_path))

        # Unload align model
        del align_model

@ -200,21 +237,17 @@ def cli():
        if hf_token is None:
            print("Warning, no --hf_token used, needs to be saved in environment variable, otherwise will throw error loading diarization model...")
        tmp_results = results
        print(">>Performing diarization...")
        results = []
        diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)
        for result, input_audio_path in tmp_results:
            diarize_segments = diarize_model(input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers)
            result = assign_word_speakers(diarize_segments, result)
            results.append((result, input_audio_path))

    # >> Write
    for result, audio_path in results:
        result["language"] = align_language
        writer(result, audio_path, writer_args)


if __name__ == "__main__":
    cli()
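`cli()` expands a single `--temperature` plus `--temperature_increment_on_fallback` into the list handed to `asr_options["temperatures"]`. The stand-alone sketch below reproduces the typical Whisper-family expansion with `np.arange`; the `expand_temperatures` helper name is illustrative, not part of whisperX:

```python
import numpy as np

def expand_temperatures(temperature: float, increment):
    # With an increment, build a fallback schedule from `temperature` up to 1.0;
    # with no increment, sampling uses the single given temperature.
    if increment is not None:
        return tuple(np.arange(temperature, 1.0 + 1e-6, increment))
    return (temperature,)

print(expand_temperatures(0.0, 0.2))   # schedule from greedy (0.0) up to 1.0
print(expand_temperatures(0.5, None))  # single-temperature decoding
```

Decoding first runs at the lowest temperature; only when a window trips `compression_ratio_threshold` or `logprob_threshold` does it retry at the next, hotter value in the schedule.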

whisperx/types.py Normal file

@ -0,0 +1,58 @@
from typing import TypedDict, Optional, List


class SingleWordSegment(TypedDict):
    """
    A single word of a speech.
    """
    word: str
    start: float
    end: float
    score: float


class SingleCharSegment(TypedDict):
    """
    A single char of a speech.
    """
    char: str
    start: float
    end: float
    score: float


class SingleSegment(TypedDict):
    """
    A single segment (up to multiple sentences) of a speech.
    """
    start: float
    end: float
    text: str


class SingleAlignedSegment(TypedDict):
    """
    A single segment (up to multiple sentences) of a speech with word alignment.
    """
    start: float
    end: float
    text: str
    words: List[SingleWordSegment]
    chars: Optional[List[SingleCharSegment]]


class TranscriptionResult(TypedDict):
    """
    A list of segments and word segments of a speech.
    """
    segments: List[SingleSegment]
    language: str


class AlignedTranscriptionResult(TypedDict):
    """
    A list of segments and word segments of a speech.
    """
    segments: List[SingleAlignedSegment]
    word_segments: List[SingleWordSegment]
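Since `TypedDict` classes are plain dictionaries at runtime, results conforming to these types can be built with ordinary dict literals. A minimal sketch using two of the types above (redeclared here so the snippet is self-contained):

```python
from typing import TypedDict, List

class SingleSegment(TypedDict):
    """A single segment (up to multiple sentences) of a speech."""
    start: float
    end: float
    text: str

class TranscriptionResult(TypedDict):
    """A list of segments of a speech."""
    segments: List[SingleSegment]
    language: str

# TypedDict adds no runtime validation; it only informs static type checkers.
result: TranscriptionResult = {
    "segments": [{"start": 0.0, "end": 1.5, "text": "hello world"}],
    "language": "en",
}
print(result["segments"][0]["text"])  # hello world
```

This is why the pipeline can pass results between `transcribe`, `align`, and `assign_word_speakers` as ordinary dicts while still getting key-level checking from tools like mypy.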

whisperx/utils.py

@ -1,280 +1,333 @@
import json
import os import os
import re
import sys
import zlib import zlib
from typing import Callable, TextIO, Iterator, Tuple from typing import Callable, Optional, TextIO
import pandas as pd
import numpy as np
def interpolate_nans(x, method='nearest'): LANGUAGES = {
if x.notnull().sum() > 1: "en": "english",
return x.interpolate(method=method).ffill().bfill() "zh": "chinese",
"de": "german",
"es": "spanish",
"ru": "russian",
"ko": "korean",
"fr": "french",
"ja": "japanese",
"pt": "portuguese",
"tr": "turkish",
"pl": "polish",
"ca": "catalan",
"nl": "dutch",
"ar": "arabic",
"sv": "swedish",
"it": "italian",
"id": "indonesian",
"hi": "hindi",
"fi": "finnish",
"vi": "vietnamese",
"he": "hebrew",
"uk": "ukrainian",
"el": "greek",
"ms": "malay",
"cs": "czech",
"ro": "romanian",
"da": "danish",
"hu": "hungarian",
"ta": "tamil",
"no": "norwegian",
"th": "thai",
"ur": "urdu",
"hr": "croatian",
"bg": "bulgarian",
"lt": "lithuanian",
"la": "latin",
"mi": "maori",
"ml": "malayalam",
"cy": "welsh",
"sk": "slovak",
"te": "telugu",
"fa": "persian",
"lv": "latvian",
"bn": "bengali",
"sr": "serbian",
"az": "azerbaijani",
"sl": "slovenian",
"kn": "kannada",
"et": "estonian",
"mk": "macedonian",
"br": "breton",
"eu": "basque",
"is": "icelandic",
"hy": "armenian",
"ne": "nepali",
"mn": "mongolian",
"bs": "bosnian",
"kk": "kazakh",
"sq": "albanian",
"sw": "swahili",
"gl": "galician",
"mr": "marathi",
"pa": "punjabi",
"si": "sinhala",
"km": "khmer",
"sn": "shona",
"yo": "yoruba",
"so": "somali",
"af": "afrikaans",
"oc": "occitan",
"ka": "georgian",
"be": "belarusian",
"tg": "tajik",
"sd": "sindhi",
"gu": "gujarati",
"am": "amharic",
"yi": "yiddish",
"lo": "lao",
"uz": "uzbek",
"fo": "faroese",
"ht": "haitian creole",
"ps": "pashto",
"tk": "turkmen",
"nn": "nynorsk",
"mt": "maltese",
"sa": "sanskrit",
"lb": "luxembourgish",
"my": "myanmar",
"bo": "tibetan",
"tl": "tagalog",
"mg": "malagasy",
"as": "assamese",
"tt": "tatar",
"haw": "hawaiian",
"ln": "lingala",
"ha": "hausa",
"ba": "bashkir",
"jw": "javanese",
"su": "sundanese",
"yue": "cantonese",
}
# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
**{language: code for code, language in LANGUAGES.items()},
"burmese": "my",
"valencian": "ca",
"flemish": "nl",
"haitian": "ht",
"letzeburgesch": "lb",
"pushto": "ps",
"panjabi": "pa",
"moldavian": "ro",
"moldovan": "ro",
"sinhalese": "si",
"castilian": "es",
}
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
system_encoding = sys.getdefaultencoding()
if system_encoding != "utf-8":

    def make_safe(string):
        # replaces any character not representable using the system default encoding with an '?',
        # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
        return string.encode(system_encoding, errors="replace").decode(system_encoding)

else:

    def make_safe(string):
        # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
        return string


def exact_div(x, y):
    assert x % y == 0
    return x // y


def str2bool(string):
    str2val = {"True": True, "False": False}
    if string in str2val:
        return str2val[string]
    else:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
def optional_int(string):
    return None if string == "None" else int(string)


def optional_float(string):
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))


def format_timestamp(
    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return (
        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    )
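As a sanity check, the timestamp math above can be exercised standalone. The function is re-stated here (with `divmod` in place of the explicit subtract steps, which is arithmetically identical) so the snippet runs on its own:

```python
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    # Same arithmetic as the module's format_timestamp: split total milliseconds
    # into h/m/s/ms and only emit the hour field when needed.
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)
    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    seconds, milliseconds = divmod(milliseconds, 1_000)
    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"

print(format_timestamp(3661.5))                   # 01:01:01.500
print(format_timestamp(2.0, decimal_marker=","))  # 00:02,000 (SRT-style comma)
```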
class ResultWriter:
    extension: str

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    def __call__(self, result: dict, audio_path: str, options: dict):
        audio_basename = os.path.basename(audio_path)
        audio_basename = os.path.splitext(audio_basename)[0]
        output_path = os.path.join(
            self.output_dir, audio_basename + "." + self.extension
        )

        with open(output_path, "w", encoding="utf-8") as f:
            self.write_result(result, file=f, options=options)

    def write_result(self, result: dict, file: TextIO, options: dict):
        raise NotImplementedError


class WriteTXT(ResultWriter):
    extension: str = "txt"

    def write_result(self, result: dict, file: TextIO, options: dict):
        for segment in result["segments"]:
            speaker = segment.get("speaker")
            text = segment["text"].strip()
            if speaker is not None:
                print(f"[{speaker}]: {text}", file=file, flush=True)
            else:
                print(text, file=file, flush=True)
class SubtitlesWriter(ResultWriter):
    always_include_hours: bool
    decimal_marker: str

    def iterate_result(self, result: dict, options: dict):
        raw_max_line_width: Optional[int] = options["max_line_width"]
        max_line_count: Optional[int] = options["max_line_count"]
        highlight_words: bool = options["highlight_words"]
        max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
        preserve_segments = max_line_count is None or raw_max_line_width is None

        if len(result["segments"]) == 0:
            return

        def iterate_subtitles():
            line_len = 0
            line_count = 1
            # the next subtitle to yield (a list of word timings with whitespace)
            subtitle: list[dict] = []
            times = []
            last = result["segments"][0]["start"]
            for segment in result["segments"]:
                for i, original_timing in enumerate(segment["words"]):
                    timing = original_timing.copy()
                    long_pause = not preserve_segments
                    if "start" in timing:
                        long_pause = long_pause and timing["start"] - last > 3.0
                    else:
                        long_pause = False
                    has_room = line_len + len(timing["word"]) <= max_line_width
                    seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
                    if line_len > 0 and has_room and not long_pause and not seg_break:
                        # line continuation
                        line_len += len(timing["word"])
                    else:
                        # new line
                        timing["word"] = timing["word"].strip()
                        if (
                            len(subtitle) > 0
                            and max_line_count is not None
                            and (long_pause or line_count >= max_line_count)
                            or seg_break
                        ):
                            # subtitle break
                            yield subtitle, times
                            subtitle = []
                            times = []
                            line_count = 1
                        elif line_len > 0:
                            # line break
                            line_count += 1
                            timing["word"] = "\n" + timing["word"]
                        line_len = len(timing["word"].strip())
                    subtitle.append(timing)
                    times.append((segment["start"], segment["end"], segment.get("speaker")))
                    if "start" in timing:
                        last = timing["start"]
            if len(subtitle) > 0:
                yield subtitle, times

        if "words" in result["segments"][0]:
            for subtitle, _ in iterate_subtitles():
                sstart, ssend, speaker = _[0]
                subtitle_start = self.format_timestamp(sstart)
                subtitle_end = self.format_timestamp(ssend)
                if result["language"] in LANGUAGES_WITHOUT_SPACES:
                    subtitle_text = "".join([word["word"] for word in subtitle])
                else:
                    subtitle_text = " ".join([word["word"] for word in subtitle])
                has_timing = any(["start" in word for word in subtitle])

                # add [$SPEAKER_ID]: to each subtitle if speaker is available
                prefix = ""
                if speaker is not None:
                    prefix = f"[{speaker}]: "

                if highlight_words and has_timing:
                    last = subtitle_start
                    all_words = [timing["word"] for timing in subtitle]
                    for i, this_word in enumerate(subtitle):
                        if "start" in this_word:
                            start = self.format_timestamp(this_word["start"])
                            end = self.format_timestamp(this_word["end"])
                            if last != start:
                                yield last, start, prefix + subtitle_text

                            yield start, end, prefix + " ".join(
                                [
                                    re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
                                    if j == i
                                    else word
                                    for j, word in enumerate(all_words)
                                ]
                            )
                            last = end
                else:
                    yield subtitle_start, subtitle_end, prefix + subtitle_text
        else:
            for segment in result["segments"]:
                segment_start = self.format_timestamp(segment["start"])
                segment_end = self.format_timestamp(segment["end"])
                segment_text = segment["text"].strip().replace("-->", "->")
                if "speaker" in segment:
                    segment_text = f"[{segment['speaker']}]: {segment_text}"
                yield segment_start, segment_end, segment_text

    def format_timestamp(self, seconds: float):
        return format_timestamp(
            seconds=seconds,
            always_include_hours=self.always_include_hours,
            decimal_marker=self.decimal_marker,
        )
class WriteVTT(SubtitlesWriter):
    extension: str = "vtt"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(self, result: dict, file: TextIO, options: dict):
        print("WEBVTT\n", file=file)
        for start, end, text in self.iterate_result(result, options):
            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)


class WriteSRT(SubtitlesWriter):
    extension: str = "srt"
    always_include_hours: bool = True
    decimal_marker: str = ","

    def write_result(self, result: dict, file: TextIO, options: dict):
        for i, (start, end, text) in enumerate(
            self.iterate_result(result, options), start=1
        ):
            print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)


class WriteTSV(ResultWriter):
    """
    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>

    Using integer milliseconds as start and end times means there's no chance of interference from
    an environment setting a language encoding that causes the decimal in a floating point number
    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(self, result: dict, file: TextIO, options: dict):
        print("start", "end", "text", sep="\t", file=file)
        for segment in result["segments"]:
            print(round(1000 * segment["start"]), file=file, end="\t")
            print(round(1000 * segment["end"]), file=file, end="\t")
            print(segment["text"].strip().replace("\t", " "), file=file, flush=True)


class WriteAudacity(ResultWriter):
    """
    Write a transcript to a text file that Audacity can import as labels.
    The extension used is "aud" to distinguish it from the txt file produced by WriteTXT.
    Yet this is not an Audacity project but only a label file!

    Please note: Audacity uses seconds in timestamps, not milliseconds!
    Also there is no header expected.
    If a speaker is provided, it is prepended to the text between double square brackets [[]].
    """

    extension: str = "aud"

    def write_result(self, result: dict, file: TextIO, options: dict):
        ARROW = "\t"  # Audacity label files are tab-separated
        for segment in result["segments"]:
            print(segment["start"], file=file, end=ARROW)
            print(segment["end"], file=file, end=ARROW)
            print(
                (("[[" + segment["speaker"] + "]]") if "speaker" in segment else "")
                + segment["text"].strip().replace("\t", " "),
                file=file,
                flush=True,
            )


class WriteJSON(ResultWriter):
    extension: str = "json"

    def write_result(self, result: dict, file: TextIO, options: dict):
        json.dump(result, file, ensure_ascii=False)


def get_writer(
    output_format: str, output_dir: str
) -> Callable[[dict, TextIO, dict], None]:
    writers = {
        "txt": WriteTXT,
        "vtt": WriteVTT,
        "srt": WriteSRT,
        "tsv": WriteTSV,
        "json": WriteJSON,
    }
    optional_writers = {
        "aud": WriteAudacity,
    }

    if output_format == "all":
        all_writers = [writer(output_dir) for writer in writers.values()]

        def write_all(result: dict, file: TextIO, options: dict):
            for writer in all_writers:
                writer(result, file, options)

        return write_all

    if output_format in optional_writers:
        return optional_writers[output_format](output_dir)
    return writers[output_format](output_dir)


def interpolate_nans(x, method='nearest'):
    if x.notnull().sum() > 1:
        return x.interpolate(method=method).ffill().bfill()
    else:
        return x.ffill().bfill()
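A quick illustration of `interpolate_nans` on a pandas Series, assuming pandas is installed. An explicit `method="linear"` is used here so the result does not depend on scipy's nearest-neighbour tie-breaking; the function is re-stated so the snippet is self-contained:

```python
import pandas as pd

def interpolate_nans(x, method='nearest'):
    # Same logic as the module's interpolate_nans: interpolate when at least
    # two known points exist, otherwise just forward/backward fill.
    if x.notnull().sum() > 1:
        return x.interpolate(method=method).ffill().bfill()
    else:
        return x.ffill().bfill()

s = pd.Series([0.0, None, 2.0, None, 4.0])
print(interpolate_nans(s, method="linear").tolist())  # [0.0, 1.0, 2.0, 3.0, 4.0]
```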


@@ -1,50 +1,47 @@
import hashlib
import os
import urllib
from typing import Callable, Optional, Text, Union

import numpy as np
import pandas as pd
import torch
from pyannote.audio import Model
from pyannote.audio.core.io import AudioFile
from pyannote.audio.pipelines import VoiceActivityDetection
from pyannote.audio.pipelines.utils import PipelineModel
from pyannote.core import Annotation, Segment, SlidingWindowFeature
from tqdm import tqdm

from .diarize import Segment as SegmentX

# deprecated
VAD_SEGMENTATION_URL = "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"


def load_vad_model(device, vad_onset=0.500, vad_offset=0.363, use_auth_token=None, model_fp=None):
    model_dir = torch.hub._get_torch_home()

    vad_dir = os.path.dirname(os.path.abspath(__file__))

    os.makedirs(model_dir, exist_ok=True)
    if model_fp is None:
        # Dynamically resolve the path to the model file
        model_fp = os.path.join(vad_dir, "assets", "pytorch_model.bin")
        model_fp = os.path.abspath(model_fp)  # Ensure the path is absolute
    else:
        model_fp = os.path.abspath(model_fp)  # Ensure any provided path is absolute

    # Check if the resolved model file exists
    if not os.path.exists(model_fp):
        raise FileNotFoundError(f"Model file not found at {model_fp}")

    if os.path.exists(model_fp) and not os.path.isfile(model_fp):
        raise RuntimeError(f"{model_fp} exists and is not a regular file")

    model_bytes = open(model_fp, "rb").read()
    if hashlib.sha256(model_bytes).hexdigest() != VAD_SEGMENTATION_URL.split('/')[-2]:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
        )

    vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token)
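The checksum guard above follows a standard integrity-check pattern. A self-contained sketch of that pattern, using made-up payload bytes rather than the real model file:

```python
import hashlib

def verify_sha256(data: bytes, expected_hex: str) -> None:
    # Raise if the payload does not hash to the expected SHA256 digest,
    # mirroring the guard in load_vad_model above.
    digest = hashlib.sha256(data).hexdigest()
    if digest != expected_hex:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match. "
            "Please retry loading the model."
        )

payload = b"example model weights"       # hypothetical stand-in for model_bytes
good = hashlib.sha256(payload).hexdigest()
verify_sha256(payload, good)             # passes silently
```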
@@ -141,13 +138,12 @@ class Binarize:
            is_active = k_scores[0] > self.onset
            curr_scores = [k_scores[0]]
            curr_timestamps = [start]
            t = start
            for t, y in zip(timestamps[1:], k_scores[1:]):
                # currently active
                if is_active:
                    curr_duration = t - start
                    if curr_duration > self.max_duration:
                        search_after = len(curr_scores) // 2
                        # divide segment
                        min_score_div_idx = search_after + np.argmin(curr_scores[search_after:])
@@ -165,14 +161,14 @@ class Binarize:
                        is_active = False
                        curr_scores = []
                        curr_timestamps = []
                # currently inactive
                else:
                    # switching from inactive to active
                    if y > self.onset:
                        start = t
                        is_active = True
                curr_scores.append(y)
                curr_timestamps.append(t)

            # if active at the end, add final region
            if is_active:
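The onset/offset hysteresis driving `Binarize` can be illustrated with a toy version: a region opens when the score rises above `onset` and closes only when it falls below the lower `offset` threshold. This sketch uses made-up scores and integer frame indices, and ignores the `max_duration` splitting handled above; it is not the pyannote implementation:

```python
def binarize(scores, onset=0.5, offset=0.35):
    # Hysteresis thresholding: activate above `onset`, deactivate below `offset`.
    segments = []
    is_active = scores[0] > onset
    start = 0
    for t, y in enumerate(scores[1:], start=1):
        if is_active:
            if y < offset:
                segments.append((start, t))
                is_active = False
        elif y > onset:
            start = t
            is_active = True
    # if active at the end, add final region
    if is_active:
        segments.append((start, len(scores)))
    return segments

print(binarize([0.1, 0.7, 0.8, 0.2, 0.1, 0.6, 0.9]))  # [(1, 3), (5, 7)]
```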
@@ -261,7 +257,12 @@ def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_
    active_segs = pd.DataFrame([x['segment'] for x in active['content']])
    return active_segs
def merge_chunks(
    segments,
    chunk_size,
    onset: float = 0.5,
    offset: Optional[float] = None,
):
    """
    Merge operation described in paper
    """
@@ -271,7 +272,7 @@ def merge_chunks(segments, chunk_size):
    speaker_idxs = []

    assert chunk_size > 0
    binarize = Binarize(max_duration=chunk_size, onset=onset, offset=offset)
    segments = binarize(segments)
    segments_list = []
    for speech_turn in segments.get_timeline():