forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deepspeech.h
95 lines (83 loc) · 3.3 KB
/
deepspeech.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#ifndef __DEEPSPEECH_H__
#define __DEEPSPEECH_H__
#include <cstddef>
namespace DeepSpeech
{

  // Opaque implementation state. Only forward-declared in this header, so
  // users of Model never depend on its layout.
  class Private;

  /**
   * @brief An interface to a trained DeepSpeech model.
   *
   * A Model stores a raw pointer to its implementation state and has a
   * destructor that frees the associated resources. Copying is therefore
   * disabled: an implicit member-wise copy would leave two objects
   * referring to the same state.
   */
  class Model {
    private:
      Private* mPriv;

      // Non-copyable. Declared private and intentionally left undefined so
      // that any attempted copy fails at compile time (outside the class) or
      // at link time (inside it). This form also works before C++11.
      Model(const Model&);
      Model& operator=(const Model&);

    public:
      /**
       * @brief An object providing an interface to a trained DeepSpeech model.
       *
       * @param aModelPath The path to the frozen model graph.
       * @param aNCep The number of cepstrum the model was trained with.
       * @param aNContext The context window the model was trained with.
       */
      Model(const char* aModelPath, int aNCep, int aNContext);

      /**
       * @brief Frees associated resources and destroys model object.
       */
      ~Model();

      /**
       * @brief Given audio, return a vector suitable for input to the
       *        DeepSpeech model.
       *
       * Extracts MFCC features from a given audio signal and adds the
       * appropriate amount of context to run inference on the DeepSpeech model.
       * This is equivalent to calling audioToInputVector() with the model's
       * cepstrum and context window.
       *
       * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
       *                sample rate.
       * @param aBufferSize The sample-length of the audio signal.
       * @param aSampleRate The sample-rate of the audio signal.
       * @param[out] aMfcc An array containing features, of shape
       *                   (@p aNFrames, ncep * ncontext). The user is
       *                   responsible for freeing the array.
       * @param[out] aNFrames (optional) The number of frames in @p aMfcc.
       * @param[out] aFrameLen (optional) The length of each frame
       *                       (ncep * ncontext) in @p aMfcc.
       *
       * NOTE(review): this header does not say which deallocation function
       * matches the array returned in @p aMfcc -- confirm against the
       * implementation.
       */
      void getInputVector(const short* aBuffer,
                          unsigned int aBufferSize,
                          int aSampleRate,
                          float** aMfcc,
                          int* aNFrames = NULL,
                          int* aFrameLen = NULL);

      /**
       * @brief Run inference on the given audio.
       *
       * Runs inference on the given input vector with the model.
       * See getInputVector().
       *
       * @param aMfcc MFCC features with the appropriate amount of context per
       *              frame.
       * @param aNFrames The number of frames in @p aMfcc.
       * @param aFrameLen (optional) The length of each frame in @p aMfcc. If
       *                  specified, this will be used to verify the array is
       *                  large enough.
       *
       * @return The resulting string after running inference. The user is
       *         responsible for freeing this string.
       */
      char* infer(float* aMfcc,
                  int aNFrames,
                  int aFrameLen = 0);

      /**
       * @brief Use the DeepSpeech model to perform Speech-To-Text.
       *
       * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
       *                sample rate.
       * @param aBufferSize The number of samples in the audio signal.
       * @param aSampleRate The sample-rate of the audio signal.
       *
       * @return The STT result. The user is responsible for freeing the string.
       */
      char* stt(const short* aBuffer,
                unsigned int aBufferSize,
                int aSampleRate);
  };

}
#endif /* __DEEPSPEECH_H__ */