@@ -28,23 +28,62 @@ import org.tensorflow.types.TInt32
28
28
import org.tensorflow.types.TString
29
29
30
30
/* *
31
- * An API for building {@code audio} operations as {@link org.tensorflow.op.Op Op} s
31
+ * An API for building `audio` operations as [Op][org.tensorflow.op.Op]s
32
32
*
33
- * @see {@link org.tensorflow.op.Ops}
33
+ * @see org.tensorflow.op.Ops
34
34
*/
35
35
public class AudioOps (
36
36
/* *
37
- * Get the parent {@link KotlinOps} object.
37
+ * Get the parent [KotlinOps] object.
38
38
*/
39
39
public val ops : KotlinOps
40
40
) {
41
41
public val java: org.tensorflow.op.AudioOps = ops.java.audio
42
42
43
43
/* *
44
- * Returns the current {@link Scope scope} of this API
44
+ * Returns the current [scope][Scope] of this API
45
45
*/
46
46
public val scope: Scope = ops.scope
47
47
48
+ /* *
49
+ * Produces a visualization of audio data over time.
50
+ *
51
+ * Spectrograms are a standard way of representing audio information as a series of
52
+ * slices of frequency information, one slice for each window of time. By joining
53
+ * these together into a sequence, they form a distinctive fingerprint of the sound
54
+ * over time.
55
+ *
56
+ * This op expects to receive audio data as an input, stored as floats in the range
57
+ * -1 to 1, together with a window width in samples, and a stride specifying how
58
+ * far to move the window between slices. From this it generates a three
59
+ * dimensional output. The first dimension is for the channels in the input, so a
60
+ * stereo audio input would have two here for example. The second dimension is time,
61
+ * with successive frequency slices. The third dimension has an amplitude value for
62
+ * each frequency during that time slice.
63
+ *
64
+ * This means the layout when converted and saved as an image is rotated 90 degrees
65
+ * clockwise from a typical spectrogram. Time is descending down the Y axis, and
66
+ * the frequency decreases from left to right.
67
+ *
68
+ * Each value in the result represents the square root of the sum of the real and
69
+ * imaginary parts of an FFT on the current window of samples. In this way, the
70
+ * lowest dimension represents the power of each frequency in the current window,
71
+ * and adjacent windows are concatenated in the next dimension.
72
+ *
73
+ * To get a more intuitive and visual look at what this operation does, you can run
74
+ * tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
75
+ * resulting spectrogram as a PNG image.
76
+ *
77
+ * @param input Float representation of audio data.
78
+ * @param windowSize How wide the input window is in samples. For the highest efficiency
79
+ * this should be a power of two, but other values are accepted.
80
+ * @param stride How widely apart the center of adjacent sample windows should be.
81
+ * @param options carries optional attributes values
82
+ * @return a new instance of AudioSpectrogram
83
+ * @see org.tensorflow.op.AudioOps.audioSpectrogram
84
+ * @param magnitudeSquared Whether to return the squared magnitude or just the
85
+ * magnitude. Using squared magnitude can avoid extra calculations.
86
+ */
48
87
public fun audioSpectrogram (
49
88
input : Operand <TFloat32 >,
50
89
windowSize : Long ,
@@ -59,6 +98,31 @@ public class AudioOps(
59
98
).toTypedArray()
60
99
)
61
100
101
+ /* *
102
+ * Decode a 16-bit PCM WAV file to a float tensor.
103
+ *
104
+ * The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
105
+ *
106
+ * When desired_channels is set, if the input contains fewer channels than this
107
+ * then the last channel will be duplicated to give the requested number, else if
108
+ * the input has more channels than requested then the additional channels will be
109
+ * ignored.
110
+ *
111
+ * If desired_samples is set, then the audio will be cropped or padded with zeroes
112
+ * to the requested length.
113
+ *
114
+ * The first output contains a Tensor with the content of the audio samples. The
115
+ * lowest dimension will be the number of channels, and the second will be the
116
+ * number of samples. For example, a ten-sample-long stereo WAV file should give an
117
+ * output shape of [10, 2].
118
+ *
119
+ * @param contents The WAV-encoded audio, usually from a file.
120
+ * @param options carries optional attributes values
121
+ * @return a new instance of DecodeWav
122
+ * @see org.tensorflow.op.AudioOps.decodeWav
123
+ * @param desiredChannels Number of sample channels wanted.
124
+ * @param desiredSamples Length of audio requested.
125
+ */
62
126
public fun decodeWav (
63
127
contents : Operand <TString >,
64
128
desiredChannels : Long? = null,
@@ -71,12 +135,52 @@ public class AudioOps(
71
135
).toTypedArray()
72
136
)
73
137
138
+ /* *
139
+ * Encode audio data using the WAV file format.
140
+ *
141
+ * This operation will generate a string suitable to be saved out to create a .wav
142
+ * audio file. It will be encoded in the 16-bit PCM format. It takes in float
143
+ * values in the range -1.0f to 1.0f, and any outside that value will be clamped to
144
+ * that range.
145
+ *
146
+ * `audio` is a 2-D float Tensor of shape `[length, channels]`.
147
+ * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
148
+ *
149
+ * @param audio 2-D with shape `[length, channels]`.
150
+ * @param sampleRate Scalar containing the sample frequency.
151
+ * @return a new instance of EncodeWav
152
+ * @see org.tensorflow.op.AudioOps.encodeWav
153
+ */
74
154
public fun encodeWav (audio : Operand <TFloat32 >, sampleRate : Operand <TInt32 >): EncodeWav =
75
155
java.encodeWav(
76
156
audio,
77
157
sampleRate
78
158
)
79
159
160
+ /* *
161
+ * Transforms a spectrogram into a form that's useful for speech recognition.
162
+ *
163
+ * Mel Frequency Cepstral Coefficients are a way of representing audio data that's
164
+ * been effective as an input feature for machine learning. They are created by
165
+ * taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
166
+ * higher frequencies that are less significant to the human ear. They have a long
167
+ * history in the speech recognition world, and
168
+ * https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
169
+ * is a good resource to learn more.
170
+ *
171
+ * @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
172
+ * set to true.
173
+ * @param sampleRate How many samples per second the source audio used.
174
+ * @param options carries optional attributes values
175
+ * @return a new instance of Mfcc
176
+ * @see org.tensorflow.op.AudioOps.mfcc
177
+ * @param upperFrequencyLimit The highest frequency to use when calculating the
178
+ * cepstrum.
179
+ * @param lowerFrequencyLimit The lowest frequency to use when calculating the
180
+ * cepstrum.
181
+ * @param filterbankChannelCount Resolution of the Mel bank used internally.
182
+ * @param dctCoefficientCount How many output channels to produce per time slice.
183
+ */
80
184
public fun mfcc (
81
185
spectrogram : Operand <TFloat32 >,
82
186
sampleRate : Operand <TInt32 >,
0 commit comments