Mirror of https://github.com/m-bain/whisperX.git (synced 2025-07-01 18:17:27 -04:00)

Compare commits: 16 commits
4916192246
cbdac53e87
940a223219
a0eb31019b
b08ad67a72
c18f9f979b
948b3e368b
e9ac5b63bc
90b45459d9
81c4af96a6
1c6d9327bc
0fdb55d317
51da22771f
15ad5bf7df
7fdbd21fe3
3ff625c561
.github/workflows/build-and-release.yml (vendored, new file, 35 lines)

@@ -0,0 +1,35 @@
+name: Build and release
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install build
+
+      - name: Build wheels
+        run: python -m build --wheel
+
+      - name: Release to Github
+        uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
+
+      - name: Publish package to PyPi
+        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
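For reference, a minimal local sketch of what the "Build wheels" step above does, assuming the `build` package is installed in the current environment; the resulting wheel lands in `dist/`, which is what the release and PyPI steps upload.

```python
# Local equivalent of the workflow's build step (a sketch, not part of the repo).
import subprocess
import sys

# Same command the workflow runs; wheels are written to dist/.
subprocess.run([sys.executable, "-m", "build", "--wheel"], check=True)
```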
.github/workflows/python-compatibility.yml (vendored, new file, 32 lines)

@@ -0,0 +1,32 @@
+name: Python Compatibility Test
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch: # Allows manual triggering from GitHub UI
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install package
+        run: |
+          python -m pip install --upgrade pip
+          pip install .
+
+      - name: Test import
+        run: |
+          python -c "import whisperx; print('Successfully imported whisperx')"
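A rough local equivalent of the matrix check above, assuming whisperx is already installed in the active environment; the version range matches the `python_requires=">=3.9, <3.13"` constraint added in setup.py below.

```python
# Sketch of the compatibility smoke test the workflow runs (assumes whisperx is installed).
import sys

if not ((3, 9) <= sys.version_info[:2] <= (3, 12)):
    raise SystemExit("whisperX targets Python >=3.9,<3.13")

import whisperx  # the workflow treats a clean import as success
print("Successfully imported whisperx")
```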
.gitignore (vendored, 172 lines changed)

@@ -1,3 +1,171 @@
whisperx.egg-info/
**/__pycache__/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# PyPI configuration file
+.pypirc
MANIFEST.in

@@ -1,6 +1,3 @@
include whisperx/assets/*
-include whisperx/assets/gpt2/*
-include whisperx/assets/multilingual/*
-include whisperx/normalizers/english.json
include LICENSE
include requirements.txt
README.md (39 lines changed)

@@ -23,7 +23,7 @@
</p>


-<img width="1216" align="center" alt="whisperx-arch" src="figures/pipeline.png">
+<img width="1216" align="center" alt="whisperx-arch" src="https://raw.githubusercontent.com/m-bain/whisperX/refs/heads/main/figures/pipeline.png">


<!-- <p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy + quality via forced phoneme alignment and voice-activity based batching for fast inference.</p> -->

@@ -80,21 +80,40 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed.

See other methods [here.](https://pytorch.org/get-started/previous-versions/#v200)

-### 3. Install this repo
+### 3. Install WhisperX

-`pip install git+https://github.com/m-bain/whisperx.git`
+You have several installation options:

-If already installed, update package to most recent commit
+#### Option A: Stable Release (recommended)
+Install the latest stable version from PyPI:

-`pip install git+https://github.com/m-bain/whisperx.git --upgrade`

-If wishing to modify this package, clone and install in editable mode:
+```bash
+pip install whisperx
+```
-$ git clone https://github.com/m-bain/whisperX.git
-$ cd whisperX
-$ pip install -e .

+#### Option B: Development Version
+Install the latest development version directly from GitHub (may be unstable):

+```bash
+pip install git+https://github.com/m-bain/whisperx.git
+```

+If already installed, update to the most recent commit:

+```bash
+pip install git+https://github.com/m-bain/whisperx.git --upgrade
+```

+#### Option C: Development Mode
+If you wish to modify the package, clone and install in editable mode:
+```bash
+git clone https://github.com/m-bain/whisperX.git
+cd whisperX
+pip install -e .
+```

+> **Note**: The development version may contain experimental features and bugs. Use the stable PyPI release for production environments.

You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.

### Speaker Diarization
requirements.txt

@@ -1,7 +1,7 @@
torch>=2
torchaudio>=2
-faster-whisper==1.0.0
-ctranslate2==4.4.0
+faster-whisper==1.1.0
+ctranslate2<4.5.0
transformers
pandas
setuptools>=65
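A quick way to confirm which versions actually resolved after installing, useful because `ctranslate2` is now capped rather than pinned exactly; a minimal sketch using the standard library:

```python
# Sketch: report the installed versions of the re-pinned dependencies.
from importlib.metadata import version

for pkg in ("faster-whisper", "ctranslate2"):
    # Expect faster-whisper 1.1.0 and any ctranslate2 release below 4.5.0.
    print(pkg, version(pkg))
```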
setup.py (15 lines changed)

@@ -1,19 +1,22 @@
import os
-import platform

import pkg_resources
from setuptools import find_packages, setup

+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
setup(
    name="whisperx",
    py_modules=["whisperx"],
-    version="3.2.0",
+    version="3.3.0",
    description="Time-Accurate Automatic Speech Recognition using Whisper.",
-    readme="README.md",
-    python_requires=">=3.8",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    python_requires=">=3.9, <3.13",
    author="Max Bain",
    url="https://github.com/m-bain/whisperx",
-    license="MIT",
+    license="BSD-2-Clause",
    packages=find_packages(exclude=["tests*"]),
    install_requires=[
        str(r)
@@ -21,7 +24,7 @@ setup(
            open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
        )
    ]
-    + [f"pyannote.audio==3.1.1"],
+    + [f"pyannote.audio==3.3.2"],
    entry_points={
        "console_scripts": ["whisperx=whisperx.transcribe:cli"],
    },
whisperx/alignment.py

@@ -85,8 +85,8 @@ def load_align_model(language_code, device, model_name=None, model_dir=None):
        align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
    else:
        try:
-            processor = Wav2Vec2Processor.from_pretrained(model_name)
-            align_model = Wav2Vec2ForCTC.from_pretrained(model_name)
+            processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir)
+            align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir)
        except Exception as e:
            print(e)
            print(f"Error loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 models")
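The new `cache_dir=model_dir` pass-through means a caller can control where Hugging Face alignment models are cached. A minimal usage sketch, assuming the public `whisperx.load_align_model` API shown in the hunk header and a hypothetical cache path:

```python
# Sketch: load a wav2vec2 alignment model, caching it under a chosen directory.
import whisperx

align_model, metadata = whisperx.load_align_model(
    language_code="en",
    device="cpu",
    model_dir="/tmp/whisperx_align_cache",  # hypothetical cache location
)
```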
whisperx/asr.py

@@ -171,7 +171,7 @@ class FasterWhisperPipeline(Pipeline):
        return final_iterator

    def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, chunk_size=30, print_progress = False, combined_progress=False
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, chunk_size=30, print_progress = False, combined_progress=False, verbose=False
    ) -> TranscriptionResult:
        if isinstance(audio, str):
            audio = load_audio(audio)

@@ -223,6 +223,8 @@ class FasterWhisperPipeline(Pipeline):
            text = out['text']
            if batch_size in [0, 1, None]:
                text = text[0]
+            if verbose:
+                print(f"Transcript: [{round(vad_segments[idx]['start'], 3)} --> {round(vad_segments[idx]['end'], 3)}] {text}")
            segments.append(
                {
                    "text": text,
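With the added `verbose` flag, `transcribe` can print each segment together with its VAD window as it is produced. A hedged usage sketch based on the signature above; the model name, device, and audio path are placeholders:

```python
# Sketch: enable per-segment printing during batched transcription.
import whisperx

model = whisperx.load_model("large-v2", device="cuda", compute_type="float16")
audio = whisperx.load_audio("audio.wav")  # placeholder input file
result = model.transcribe(audio, batch_size=16, verbose=True)
print(result["segments"][0]["text"])
```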
@@ -267,6 +269,7 @@ def load_model(whisper_arch,
               model : Optional[WhisperModel] = None,
               task="transcribe",
               download_root=None,
+               local_files_only=False,
               threads=4):
    '''Load a Whisper model for inference.
    Args:
@@ -277,6 +280,7 @@ def load_model(whisper_arch,
        language: str - The language of the model. (use English for now)
        model: Optional[WhisperModel] - The WhisperModel instance to use.
        download_root: Optional[str] - The root directory to download the model to.
+        local_files_only: bool - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
        threads: int - The number of cpu threads to use per worker, e.g. will be multiplied by num workers.
    Returns:
        A Whisper pipeline.
@@ -290,6 +294,7 @@ def load_model(whisper_arch,
                         device_index=device_index,
                         compute_type=compute_type,
                         download_root=download_root,
+                         local_files_only=local_files_only,
                         cpu_threads=threads)
    if language is not None:
        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
@@ -319,10 +324,12 @@ def load_model(whisper_arch,
        "word_timestamps": False,
        "prepend_punctuations": "\"'“¿([{-",
        "append_punctuations": "\"'.。,,!!??::”)]}、",
+        "multilingual": model.model.is_multilingual,
        "suppress_numerals": False,
        "max_new_tokens": None,
        "clip_timestamps": None,
        "hallucination_silence_threshold": None,
+        "hotwords": None,
    }

    if asr_options is not None:
@@ -354,4 +361,4 @@ def load_model(whisper_arch,
        language=language,
        suppress_numerals=suppress_numerals,
        vad_params=default_vad_options,
-    )
+    )
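The default ASR options now include `multilingual` and `hotwords`, matching newer faster-whisper releases, and the `if asr_options is not None:` branch shown above applies user-supplied options on top of these defaults. A hedged sketch of overriding a couple of them; the values are purely illustrative:

```python
# Sketch: override selected entries of default_asr_options at load time.
import whisperx

model = whisperx.load_model(
    "large-v2",
    device="cuda",
    compute_type="float16",
    asr_options={
        "suppress_numerals": True,  # illustrative override of a default shown above
        "hotwords": "WhisperX",     # illustrative biasing string forwarded to faster-whisper
    },
)
```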
whisperx/transcribe.py

@@ -87,6 +87,7 @@ def cli():
    device: str = args.pop("device")
    device_index: int = args.pop("device_index")
    compute_type: str = args.pop("compute_type")
+    verbose: bool = args.pop("verbose")

    # model_flush: bool = args.pop("model_flush")
    os.makedirs(output_dir, exist_ok=True)

@@ -173,7 +174,7 @@ def cli():
        audio = load_audio(audio_path)
        # >> VAD & ASR
        print(">>Performing transcription...")
-        result = model.transcribe(audio, batch_size=batch_size, chunk_size=chunk_size, print_progress=print_progress)
+        result = model.transcribe(audio, batch_size=batch_size, chunk_size=chunk_size, print_progress=print_progress, verbose=verbose)
        results.append((result, audio_path))

    # Unload Whisper and VAD