From 9b8b4c3671ba609e47d020d2d065192259a6080f Mon Sep 17 00:00:00 2001
From: Luke Southam <luke@devthe.com>
Date: Sat, 10 Dec 2022 23:29:03 +0000
Subject: [PATCH] initial commit

---
 .vscode/settings.json |   7 +++
 README                |   4 ++
 setup.py              |  28 ++++++++++
 whispercpp.pxd        | 120 ++++++++++++++++++++++++++++++++++++++++++
 whispercpp.pyx        |  76 ++++++++++++++++++++++++++
 5 files changed, 235 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 README
 create mode 100644 setup.py
 create mode 100644 whispercpp.pxd
 create mode 100644 whispercpp.pyx

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..bc6d20b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "workbench.colorCustomizations": {
+        "activityBar.background": "#053239",
+        "titleBar.activeBackground": "#074750",
+        "titleBar.activeForeground": "#F2FCFE"
+    }
+}
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..6b827cf
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+Python bindings for whisper.cpp
+===============================
+
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e84b4fb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup  # setuptools (not distutils) so install_requires is honoured
+from Cython.Build import cythonize
+import numpy
+import os
+import sys
+
+if sys.platform == 'darwin':  # macOS: define GGML_USE_ACCELERATE and link Accelerate
+    os.environ['CFLAGS'] = '-DGGML_USE_ACCELERATE'
+    os.environ['CXXFLAGS'] = '-DGGML_USE_ACCELERATE'
+    os.environ['LDFLAGS'] = '-framework Accelerate'
+else:  # elsewhere: enable AVX/AVX2/FMA/F16C code generation
+    os.environ['CFLAGS'] = '-mavx -mavx2 -mfma -mf16c'
+    os.environ['CXXFLAGS'] = '-mavx -mavx2 -mfma -mf16c'
+
+# Compile whispercpp.pyx against ./whisper.cpp/ and NumPy's headers.
+setup(
+    name='whispercpp',
+    version='1.0',
+    description='Python Bindings for whisper.cpp',
+    author='Luke Southam',
+    author_email='luke@devthe.com',
+    ext_modules = cythonize("whispercpp.pyx"),
+    include_dirs = ['./whisper.cpp/', numpy.get_include()],
+    install_requires=[
+      'numpy',
+      'ffmpeg-python',
+    ],
+)
diff --git a/whispercpp.pxd b/whispercpp.pxd
new file mode 100644
index 0000000..f941ca1
--- /dev/null
+++ b/whispercpp.pxd
@@ -0,0 +1,120 @@
+#!python
+# cython: language_level=3
+# distutils: libraries=['whisper']
+
+
+from libc.stdint cimport (
+    int64_t
+)
+
+cdef:  # NOTE(review): SAMPLE_RATE, TEST_FILE, DEFAULT_MODEL and LANGUAGE are declared again in whispercpp.pyx — confirm which copy is intended
+    int WHISPER_SAMPLE_RATE = 16000  # presumably mirrors the WHISPER_* macros of whisper.h (this and the four below) — TODO confirm
+    int WHISPER_N_FFT = 400
+    int WHISPER_N_MEL = 80
+    int WHISPER_HOP_LENGTH = 160
+    int WHISPER_CHUNK_SIZE = 30
+    int SAMPLE_RATE = 16000  # same value as WHISPER_SAMPLE_RATE above
+    char* TEST_FILE = b'test.wav'
+    char* DEFAULT_MODEL = b'ggml-tiny.bin'
+    char* LANGUAGE = b'fr'
+
+cdef extern from "whisper.h" nogil:  # whisper.cpp C API; nogil applies to every function declared in this block
+    enum whisper_sampling_strategy:
+        WHISPER_SAMPLING_GREEDY = 0,
+        WHISPER_SAMPLING_BEAM_SEARCH,
+    ctypedef bint _Bool
+    ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
+    ctypedef _Bool (*whisper_encoder_begin_callback)(whisper_context*, void*)  # function pointer, like the callback above
+    ctypedef int whisper_token
+    ctypedef struct whisper_token_data:
+        whisper_token id
+        whisper_token tid
+        float p
+        float pt
+        float ptsum
+        int64_t t0
+        int64_t t1
+        float vlen
+    ctypedef struct whisper_context:  # opaque here: only ever used through pointers
+        pass
+    ctypedef struct anon_2:  # type of whisper_full_params.greedy
+        int n_past
+    ctypedef struct anon_3:  # type of whisper_full_params.beam_search
+        int n_past
+        int beam_width
+        int n_best
+    ctypedef struct whisper_full_params:
+        int strategy
+        int n_threads
+        int n_max_text_ctx
+        int offset_ms
+        int duration_ms
+        _Bool translate
+        _Bool no_context
+        _Bool single_segment
+        _Bool print_special
+        _Bool print_progress
+        _Bool print_realtime
+        _Bool print_timestamps
+        _Bool token_timestamps
+        float thold_pt
+        float thold_ptsum
+        int max_len
+        int max_tokens
+        _Bool speed_up
+        int audio_ctx
+        whisper_token* prompt_tokens
+        int prompt_n_tokens
+        char* language
+        anon_2 greedy
+        anon_3 beam_search
+        whisper_new_segment_callback new_segment_callback
+        void* new_segment_callback_user_data
+        whisper_encoder_begin_callback encoder_begin_callback
+        void* encoder_begin_callback_user_data
+    whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
+    cdef whisper_context* whisper_init(char*)
+    cdef void whisper_free(whisper_context*)
+    cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
+    cdef int whisper_set_mel(whisper_context*, float*, int, int)
+    cdef int whisper_encode(whisper_context*, int, int)
+    cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
+    cdef whisper_token_data whisper_sample_best(whisper_context*)
+    cdef whisper_token whisper_sample_timestamp(whisper_context*)
+    cdef int whisper_lang_id(char*)
+    cdef int whisper_n_len(whisper_context*)
+    cdef int whisper_n_vocab(whisper_context*)
+    cdef int whisper_n_text_ctx(whisper_context*)
+    cdef int whisper_is_multilingual(whisper_context*)
+    cdef float* whisper_get_probs(whisper_context*)
+    # NOTE(review): leftover message "Unknown CtypesSpecial name='c_char_p'" — presumably a char*-returning function was skipped here; confirm against whisper.h
+    cdef whisper_token whisper_token_eot(whisper_context*)
+    cdef whisper_token whisper_token_sot(whisper_context*)
+    cdef whisper_token whisper_token_prev(whisper_context*)
+    cdef whisper_token whisper_token_solm(whisper_context*)
+    cdef whisper_token whisper_token_not(whisper_context*)
+    cdef whisper_token whisper_token_beg(whisper_context*)
+    cdef whisper_token whisper_token_translate()
+    cdef whisper_token whisper_token_transcribe()
+    cdef void whisper_print_timings(whisper_context*)
+    cdef void whisper_reset_timings(whisper_context*)
+    # NOTE(review): leftover message "Unsupported base Klass='CtypesEnum'" — presumably an enum-taking declaration was skipped here; confirm against whisper.h
+    cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
+    cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
+    cdef int whisper_full_n_segments(whisper_context*)
+    cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
+    cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
+    # NOTE(review): leftover message "Unknown CtypesSpecial name='c_char_p'" — presumably a char*-returning function was skipped here; confirm against whisper.h
+    cdef int whisper_full_n_tokens(whisper_context*, int)
+    # NOTE(review): leftover message "Unknown CtypesSpecial name='c_char_p'" — presumably a char*-returning function was skipped here; confirm against whisper.h
+    cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
+    cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
+    cdef float whisper_full_get_token_p(whisper_context*, int, int)
+    const char* whisper_print_system_info()
+    const char* whisper_full_get_segment_text(whisper_context*, int)
+
+
+ctypedef struct audio_data:
+    float* frames  # pointer to the first decoded sample
+    int n_frames  # number of samples reachable through `frames`
+
diff --git a/whispercpp.pyx b/whispercpp.pyx
new file mode 100644
index 0000000..67c455a
--- /dev/null
+++ b/whispercpp.pyx
@@ -0,0 +1,76 @@
+#!python
+# cython: language_level=3
+# distutils: language = c++
+# distutils: sources= ./whisper.cpp/whisper.cpp ./whisper.cpp/ggml.c
+
+import ffmpeg
+import numpy as np
+cimport numpy as cnp
+
+cdef int SAMPLE_RATE = 16000  # default `sr` of load_audio (passed to ffmpeg as ar=)
+cdef char* TEST_FILE = b'test.wav'  # the input Whisper.transcribe() always decodes
+cdef char* DEFAULT_MODEL = b'ggml-tiny.bin'  # default `model` of Whisper.__init__
+cdef char* LANGUAGE = b'fr'  # assigned to params.language in default_params()
+
+cdef audio_data load_audio(bytes file, int sr = SAMPLE_RATE):
+    """Decode `file` via ffmpeg to mono float32 PCM at `sr` Hz.
+
+    The returned `frames` pointer borrows a NumPy buffer that is pinned in a
+    module global, so it is valid only until the next load_audio() call.
+    """
+    # Without this keep-alive the array is freed on return and the pointer
+    # handed to the caller dangles.
+    global _audio_keepalive
+    stream = ffmpeg.input(file, threads=0).output(
+        "-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr
+    )
+    out = stream.run(
+        cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
+    )[0]
+    # Rescale signed 16-bit samples by 2**15 into float32.
+    cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
+        np.frombuffer(out, np.int16).flatten().astype(np.float32) / pow(2, 15)
+    )
+    _audio_keepalive = frames
+    cdef audio_data data
+    data.n_frames = len(frames)
+    data.frames = NULL
+    if data.n_frames > 0:  # &frames[0] is out of bounds for empty output
+        data.frames = &frames[0]
+    return data
+
+cdef whisper_full_params default_params():
+    """Return whisper.cpp's greedy-sampling defaults plus this module's overrides."""
+    cdef whisper_sampling_strategy strategy = whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+    cdef whisper_full_params cfg = whisper_full_default_params(strategy)
+    cfg.print_realtime = True
+    cfg.print_progress = True
+    cfg.translate = False
+    cfg.language = <const char *> LANGUAGE
+    return cfg
+
+
+cdef class Whisper:
+    """Owns one whisper.cpp context; transcribe() decodes TEST_FILE and returns its text."""
+    cdef whisper_context * ctx
+    cdef whisper_full_params params
+
+    def __init__(self, char* model=DEFAULT_MODEL):
+        self.ctx = whisper_init(model)
+        if self.ctx == NULL:  # whisper_init reports a failed model load with NULL
+            raise RuntimeError("whisper_init failed for model %r" % (<bytes>model,))
+        self.params = default_params()
+
+    def __dealloc__(self):
+        if self.ctx != NULL:  # nothing to release when no context was created
+            whisper_free(self.ctx)
+
+    cpdef str transcribe(self):
+        cdef audio_data data = load_audio(TEST_FILE)
+        cdef int res = whisper_full(self.ctx, self.params, data.frames, data.n_frames)
+        if res != 0:
+            raise RuntimeError(f"whisper_full failed with status {res}")
+        cdef int n_segments = whisper_full_n_segments(self.ctx)
+        texts = [whisper_full_get_segment_text(self.ctx, i) for i in range(n_segments)]
+        return b'\n'.join(texts).decode()  # segment texts, newline-separated
+