From 9b8b4c3671ba609e47d020d2d065192259a6080f Mon Sep 17 00:00:00 2001
From: Luke Southam
Date: Sat, 10 Dec 2022 23:29:03 +0000
Subject: [PATCH] initial commit

---
 .vscode/settings.json |   7 +++
 README                |   4 ++
 setup.py              |  43 +++++++++++++++
 whispercpp.pxd        | 120 ++++++++++++++++++++++++++++++++++++++++++
 whispercpp.pyx        | 103 ++++++++++++++++++++++++++++++++++++
 5 files changed, 277 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 README
 create mode 100644 setup.py
 create mode 100644 whispercpp.pxd
 create mode 100644 whispercpp.pyx

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..bc6d20b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "workbench.colorCustomizations": {
+        "activityBar.background": "#053239",
+        "titleBar.activeBackground": "#074750",
+        "titleBar.activeForeground": "#F2FCFE"
+    }
+}
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..6b827cf
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+Python bindings for whisper.cpp
+===============================
+
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e84b4fb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,43 @@
+"""Build script for the whisper.cpp Cython bindings."""
+# NOTE(review): distutils is deprecated (PEP 632) and removed in Python 3.12,
+# and it does not implement ``install_requires``; migrating to
+# ``setuptools.setup`` is the likely fix, flagged here rather than applied.
+from distutils.core import setup
+from Cython.Build import cythonize
+import numpy
+import os
+import platform
+import sys
+
+
+def _append_env_flags(name, flags):
+    """Append *flags* to environment variable *name*, keeping any user value."""
+    current = os.environ.get(name, '').strip()
+    os.environ[name] = (current + ' ' + flags) if current else flags
+
+
+if sys.platform == 'darwin':
+    # Build ggml against Apple's Accelerate framework.
+    _append_env_flags('CFLAGS', '-DGGML_USE_ACCELERATE')
+    _append_env_flags('CXXFLAGS', '-DGGML_USE_ACCELERATE')
+    _append_env_flags('LDFLAGS', '-framework Accelerate')
+elif platform.machine().lower() in ('x86_64', 'amd64', 'i386', 'i686', 'x86'):
+    # These SIMD switches exist only for x86 targets; compilers for other
+    # architectures (e.g. ARM) reject them, so they must not be set there.
+    _append_env_flags('CFLAGS', '-mavx -mavx2 -mfma -mf16c')
+    _append_env_flags('CXXFLAGS', '-mavx -mavx2 -mfma -mf16c')
+
+
+setup(
+    name='whispercpp',
+    version='1.0',
+    description='Python Bindings for whisper.cpp',
+    author='Luke Southam',
+    author_email='luke@devthe.com',
+    ext_modules = cythonize("whispercpp.pyx"),
+    include_dirs = ['./whisper.cpp/', numpy.get_include()],
+    install_requires=[
+        'numpy',
+        'ffmpeg-python',
+    ],
+)
diff --git a/whispercpp.pxd b/whispercpp.pxd
new file mode 100644
index 0000000..f941ca1
--- /dev/null
+++ b/whispercpp.pxd
@@ -0,0 +1,120 @@
+#!python
+# cython: language_level=3
+# distutils: libraries=['whisper']
+
+
+from libc.stdint cimport (
+    int64_t
+)
+
+cdef:
+    int WHISPER_SAMPLE_RATE = 16000
+    int WHISPER_N_FFT = 400
+    int WHISPER_N_MEL = 80
+    int WHISPER_HOP_LENGTH = 160
+    int WHISPER_CHUNK_SIZE = 30
+    int SAMPLE_RATE = 16000
+    char* TEST_FILE = b'test.wav'
+    char* DEFAULT_MODEL = b'ggml-tiny.bin'
+    char* LANGUAGE = b'fr'
+
+cdef extern from "whisper.h" nogil:
+    enum whisper_sampling_strategy:
+        WHISPER_SAMPLING_GREEDY = 0,
+        WHISPER_SAMPLING_BEAM_SEARCH,
+    ctypedef bint _Bool
+    ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
+    ctypedef _Bool whisper_encoder_begin_callback(whisper_context*, void*)
+    ctypedef int whisper_token
+    ctypedef struct whisper_token_data:
+        whisper_token id
+        whisper_token tid
+        float p
+        float pt
+        float ptsum
+        int64_t t0
+        int64_t t1
+        float vlen
+    ctypedef struct whisper_context:
+        pass
+    ctypedef struct anon_2:
+        int n_past
+    ctypedef struct anon_3:
+        int n_past
+        int beam_width
+        int n_best
+    ctypedef struct whisper_full_params:
+        int strategy
+        int n_threads
+        int n_max_text_ctx
+        int offset_ms
+        int duration_ms
+        _Bool translate
+        _Bool no_context
+        _Bool single_segment
+        _Bool print_special
+        _Bool print_progress
+        _Bool print_realtime
+        _Bool print_timestamps
+        _Bool token_timestamps
+        float thold_pt
+        float thold_ptsum
+        int max_len
+        int max_tokens
+        _Bool speed_up
+        int audio_ctx
+        whisper_token* prompt_tokens
+        int prompt_n_tokens
+        char* language
+        anon_2 greedy
+        anon_3 beam_search
+        whisper_new_segment_callback new_segment_callback
+        void* new_segment_callback_user_data
+        whisper_encoder_begin_callback encoder_begin_callback
+        void* encoder_begin_callback_user_data
+    whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
+    cdef whisper_context* whisper_init(char*)
+    cdef void whisper_free(whisper_context*)
+    cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
+    cdef int whisper_set_mel(whisper_context*, float*, int, int)
+    cdef int whisper_encode(whisper_context*, int, int)
+    cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
+    cdef whisper_token_data whisper_sample_best(whisper_context*)
+    cdef whisper_token whisper_sample_timestamp(whisper_context*)
+    cdef int whisper_lang_id(char*)
+    cdef int whisper_n_len(whisper_context*)
+    cdef int whisper_n_vocab(whisper_context*)
+    cdef int whisper_n_text_ctx(whisper_context*)
+    cdef int whisper_is_multilingual(whisper_context*)
+    cdef float* whisper_get_probs(whisper_context*)
+    # Unknown CtypesSpecial name='c_char_p'
+    cdef whisper_token whisper_token_eot(whisper_context*)
+    cdef whisper_token whisper_token_sot(whisper_context*)
+    cdef whisper_token whisper_token_prev(whisper_context*)
+    cdef whisper_token whisper_token_solm(whisper_context*)
+    cdef whisper_token whisper_token_not(whisper_context*)
+    cdef whisper_token whisper_token_beg(whisper_context*)
+    cdef whisper_token whisper_token_translate()
+    cdef whisper_token whisper_token_transcribe()
+    cdef void whisper_print_timings(whisper_context*)
+    cdef void whisper_reset_timings(whisper_context*)
+    # Unsupported base Klass='CtypesEnum'
+    cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
+    cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
+    cdef int whisper_full_n_segments(whisper_context*)
+    cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
+    cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
+    # Unknown CtypesSpecial name='c_char_p'
+    cdef int whisper_full_n_tokens(whisper_context*, int)
+    # Unknown CtypesSpecial name='c_char_p'
+    cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
+    cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
+    cdef float whisper_full_get_token_p(whisper_context*, int, int)
+    const char* whisper_print_system_info()
+    const char* whisper_full_get_segment_text(whisper_context*, int)
+
+
+ctypedef struct audio_data:
+    float* frames;
+    int n_frames;
+
diff --git a/whispercpp.pyx b/whispercpp.pyx
new file mode 100644
index 0000000..67c455a
--- /dev/null
+++ b/whispercpp.pyx
@@ -0,0 +1,103 @@
+#!python
+# cython: language_level=3
+# distutils: language = c++
+# distutils: sources= ./whisper.cpp/whisper.cpp ./whisper.cpp/ggml.c
+
+import ffmpeg
+import numpy as np
+cimport numpy as cnp
+
+cdef int SAMPLE_RATE = 16000
+cdef char* TEST_FILE = b'test.wav'
+cdef char* DEFAULT_MODEL = b'ggml-tiny.bin'
+cdef char* LANGUAGE = b'fr'
+
+
+cdef cnp.ndarray load_audio(bytes file, int sr = SAMPLE_RATE):
+    """Decode *file* with ffmpeg into mono float32 PCM sampled at *sr* Hz.
+
+    The samples are returned as a NumPy array rather than as a raw
+    ``audio_data`` pointer: the array owns the buffer, so a pointer taken
+    from a local array would dangle as soon as this function returned.
+    Callers must keep the returned array alive while native code reads it.
+
+    Errors raised by ``ffmpeg`` while running the command propagate unchanged.
+    """
+    out = (
+        ffmpeg.input(file, threads=0)
+        .output(
+            "-", format="s16le",
+            acodec="pcm_s16le",
+            ac=1, ar=sr
+        )
+        .run(
+            cmd=["ffmpeg", "-nostdin"],
+            capture_stdout=True,
+            capture_stderr=True
+        )
+    )[0]
+
+    # Scale signed 16-bit samples by 2**15 so they become floats in [-1, 1).
+    return (
+        np.frombuffer(out, np.int16)
+        .flatten()
+        .astype(np.float32)
+    ) / pow(2, 15)
+
+
+cdef whisper_full_params default_params():
+    """Return greedy-sampling parameters with this module's defaults applied."""
+    cdef whisper_full_params params = whisper_full_default_params(
+        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+    )
+    params.print_realtime = True
+    params.print_progress = True
+    params.translate = False
+    params.language = LANGUAGE
+    return params
+
+
+cdef class Whisper:
+    """Owns a ``whisper_context`` and transcribes audio files with it."""
+    cdef whisper_context * ctx
+    cdef whisper_full_params params
+
+    def __init__(self, char* model=DEFAULT_MODEL):
+        """Load the ggml model at path *model*.
+
+        Raises:
+            RuntimeError: if ``whisper_init`` returns NULL.
+        """
+        # Calling __init__ again must not leak the context loaded earlier.
+        if self.ctx != NULL:
+            whisper_free(self.ctx)
+        self.ctx = whisper_init(model)
+        if self.ctx == NULL:
+            raise RuntimeError('failed to load whisper model %r' % (<bytes>model,))
+        self.params = default_params()
+
+    def __dealloc__(self):
+        if self.ctx != NULL:
+            whisper_free(self.ctx)
+            self.ctx = NULL
+
+    cpdef str transcribe(self, bytes filename=TEST_FILE):
+        """Transcribe *filename* and return the segment texts joined by newlines.
+
+        Raises:
+            RuntimeError: if no model is loaded or ``whisper_full`` fails.
+        """
+        # Typed buffer local: keeps the samples alive for the whole call.
+        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames
+        cdef int res
+        cdef int n_segments
+        if self.ctx == NULL:
+            raise RuntimeError('Whisper model is not loaded')
+        frames = load_audio(filename)
+        res = whisper_full(self.ctx, self.params, &frames[0], len(frames))
+        if res != 0:
+            raise RuntimeError('whisper_full failed with status %d' % res)
+        n_segments = whisper_full_n_segments(self.ctx)
+        return b'\n'.join([
+            whisper_full_get_segment_text(self.ctx, i) for i in range(n_segments)
+        ]).decode()