support for large-v3

MahmoudAshraf97
2023-11-25 12:09:00 +00:00
parent d97cdb7bcf
commit 71a5281bde
3 changed files with 11 additions and 5 deletions


@@ -140,7 +140,12 @@ class FasterWhisperPipeline(Pipeline):
     def preprocess(self, audio):
         audio = audio['inputs']
-        features = log_mel_spectrogram(audio, padding=N_SAMPLES - audio.shape[0])
+        model_n_mels = self.model.feat_kwargs.get("feature_size")
+        features = log_mel_spectrogram(
+            audio,
+            n_mels=model_n_mels if model_n_mels is not None else 80,
+            padding=N_SAMPLES - audio.shape[0],
+        )
         return {'inputs': features}
 
     def _forward(self, model_inputs):
@@ -240,7 +245,9 @@ class FasterWhisperPipeline(Pipeline):
     def detect_language(self, audio: np.ndarray):
         if audio.shape[0] < N_SAMPLES:
             print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
+        model_n_mels = self.model.feat_kwargs.get("feature_size")
         segment = log_mel_spectrogram(audio[: N_SAMPLES],
+                                      n_mels=model_n_mels if model_n_mels is not None else 80,
                                       padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
         encoder_output = self.model.encode(segment)
         results = self.model.model.detect_language(encoder_output)
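Both hunks resolve the mel bin count the same way: read "feature_size" from the model's feature-extractor settings and fall back to 80 when it is absent (large-v3 checkpoints use a 128-bin filterbank; earlier Whisper models use 80). A minimal sketch of that fallback, where resolve_n_mels is a hypothetical helper and the dicts stand in for the library's actual feat_kwargs object:

def resolve_n_mels(feat_kwargs: dict) -> int:
    # Same fallback as above: prefer the model-reported feature size,
    # default to the classic 80-bin filterbank otherwise.
    model_n_mels = feat_kwargs.get("feature_size")
    return model_n_mels if model_n_mels is not None else 80

print(resolve_n_mels({"feature_size": 128}))  # large-v3-style config -> 128
print(resolve_n_mels({}))                     # older checkpoints -> 80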

Binary file not shown.
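This binary is presumably whisperx/assets/mel_filters.npz, since the loader in audio.py below reads from that path and now accepts 128 mel bins. A hedged sketch of how such an archive could be regenerated to carry both filterbanks, with key names following the mel_80 example in the docstring below:

import librosa
import numpy as np

# Precompute both supported filterbanks and store them under keys that
# match the "mel_80" naming shown in the mel_filters docstring.
np.savez_compressed(
    "mel_filters.npz",
    mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
    mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
)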


@@ -12,7 +12,6 @@ from .utils import exact_div
 # hard-coded audio hyperparameters
 SAMPLE_RATE = 16000
 N_FFT = 400
-N_MELS = 80
 HOP_LENGTH = 160
 CHUNK_LENGTH = 30
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
@@ -93,7 +92,7 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
 @lru_cache(maxsize=None)
-def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+def mel_filters(device, n_mels: int) -> torch.Tensor:
     """
     load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
     Allows decoupling librosa dependency; saved using:
@@ -103,7 +102,7 @@ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
         mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
     )
     """
-    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
+    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
     with np.load(
         os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
     ) as f:
@@ -112,7 +111,7 @@ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
 def log_mel_spectrogram(
     audio: Union[str, np.ndarray, torch.Tensor],
-    n_mels: int = N_MELS,
+    n_mels: int,
     padding: int = 0,
     device: Optional[Union[str, torch.device]] = None,
 ):
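With the N_MELS default gone, every caller must now pass n_mels explicitly. A quick usage check of the changed signature, assuming the module is importable as whisperx.audio:

import numpy as np
from whisperx.audio import N_SAMPLES, log_mel_spectrogram

audio = np.zeros(N_SAMPLES, dtype=np.float32)      # 30 seconds of silence
features = log_mel_spectrogram(audio, n_mels=128)  # use 80 for pre-large-v3 models
print(features.shape)  # expected: torch.Size([128, 3000])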