@@ -26,6 +26,9 @@ import org.tensorflow.op.audio.Mfcc
 import org.tensorflow.types.TFloat32
 import org.tensorflow.types.TInt32
 import org.tensorflow.types.TString
+import kotlin.Boolean
+import kotlin.Float
+import kotlin.Long
 
 /**
  * An API for building `audio` operations as [Op][org.tensorflow.op.Op]s
@@ -47,33 +50,33 @@ public class AudioOps(
 
   /**
    * Produces a visualization of audio data over time.
-   *
+   *
    * Spectrograms are a standard way of representing audio information as a series of
    * slices of frequency information, one slice for each window of time. By joining
    * these together into a sequence, they form a distinctive fingerprint of the sound
    * over time.
-   *
+   *
    * This op expects to receive audio data as an input, stored as floats in the range
    * -1 to 1, together with a window width in samples, and a stride specifying how
    * far to move the window between slices. From this it generates a three
    * dimensional output. The first dimension is for the channels in the input, so a
    * stereo audio input would have two here for example. The second dimension is time,
    * with successive frequency slices. The third dimension has an amplitude value for
    * each frequency during that time slice.
-   *
+   *
    * This means the layout when converted and saved as an image is rotated 90 degrees
    * clockwise from a typical spectrogram. Time is descending down the Y axis, and
    * the frequency decreases from left to right.
-   *
+   *
    * Each value in the result represents the square root of the sum of the real and
    * imaginary parts of an FFT on the current window of samples. In this way, the
    * lowest dimension represents the power of each frequency in the current window,
    * and adjacent windows are concatenated in the next dimension.
-   *
+   *
    * To get a more intuitive and visual look at what this operation does, you can run
    * tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
    * resulting spectrogram as a PNG image.
-   *
+   *
    * @param input Float representation of audio data.
    * @param windowSize How wide the input window is in samples. For the highest efficiency
    * this should be a power of two, but other values are accepted.
@@ -89,33 +92,33 @@ public class AudioOps(
     windowSize: Long,
     stride: Long,
     magnitudeSquared: Boolean? = null
-  ): AudioSpectrogram = java.audioSpectrogram(
+  ): AudioSpectrogram = java.audioSpectrogram(
     input,
     windowSize,
     stride,
     *listOfNotNull(
-      magnitudeSquared?.let { org.tensorflow.op.audio.AudioSpectrogram.magnitudeSquared(it) }
+      magnitudeSquared?.let { org.tensorflow.op.audio.AudioSpectrogram.magnitudeSquared(it) }
     ).toTypedArray()
-  )
+  )
 
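Since the diff never shows a call site, a brief usage sketch may help. It assumes a `KotlinOps` handle named `tf` and a float tensor `waveform` of shape `[length, channels]` with values in [-1, 1]; both names are illustrative, not part of this change:

```kotlin
// Minimal sketch: compute a spectrogram from raw audio.
// `tf` (a KotlinOps handle) and `waveform` are assumed to be in scope.
val spectrogram = tf.audio.audioSpectrogram(
    waveform,                // Operand<TFloat32>, values in [-1, 1]
    windowSize = 1024L,      // window width in samples; powers of two are fastest
    stride = 512L,           // hop of half a window, i.e. 50% overlap
    magnitudeSquared = true  // emit power rather than magnitude, as Mfcc expects
)
// Output shape is [channels, frames, fftBins]: rotated 90 degrees from a
// conventional spectrogram image, exactly as the KDoc above warns.
```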
  /**
   * Decode a 16-bit PCM WAV file to a float tensor.
-   *
+   *
   * The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-   *
+   *
   * When desired_channels is set, if the input contains fewer channels than this
   * then the last channel will be duplicated to give the requested number, else if
   * the input has more channels than requested then the additional channels will be
   * ignored.
-   *
+   *
   * If desired_samples is set, then the audio will be cropped or padded with zeroes
   * to the requested length.
-   *
+   *
   * The first output contains a Tensor with the content of the audio samples. The
   * lowest dimension will be the number of channels, and the second will be the
   * number of samples. For example, a ten-sample-long stereo WAV file should give an
   * output shape of [10, 2].
-   *
+   *
   * @param contents The WAV-encoded audio, usually from a file.
   * @param options carries optional attributes values
   * @return a new instance of DecodeWav
@@ -127,47 +130,47 @@ public class AudioOps(
     contents: Operand<TString>,
     desiredChannels: Long? = null,
     desiredSamples: Long? = null
-  ): DecodeWav = java.decodeWav(
+  ): DecodeWav = java.decodeWav(
     contents,
     *listOfNotNull(
-      desiredChannels?.let { org.tensorflow.op.audio.DecodeWav.desiredChannels(it) },
-      desiredSamples?.let { org.tensorflow.op.audio.DecodeWav.desiredSamples(it) }
+      desiredChannels?.let { org.tensorflow.op.audio.DecodeWav.desiredChannels(it) },
+      desiredSamples?.let { org.tensorflow.op.audio.DecodeWav.desiredSamples(it) }
     ).toTypedArray()
-  )
+  )
 
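To show both outputs in use, here is a hedged sketch of decoding a file. It assumes `tf.io.readFile` (the generated wrapper for TensorFlow's `ReadFile` op) is available, and the path is illustrative:

```kotlin
// Sketch: read raw WAV bytes and decode them to floats.
val wavBytes = tf.io.readFile(tf.constant("speech.wav"))
val decoded = tf.audio.decodeWav(
    wavBytes,
    desiredChannels = 1L,    // extra channels are ignored, missing ones duplicated
    desiredSamples = 16000L  // crop or zero-pad to one second at 16 kHz
)
val audio = decoded.audio()            // Operand<TFloat32>, shape [16000, 1]
val sampleRate = decoded.sampleRate()  // scalar Operand<TInt32> from the WAV header
```

Note that despite the KDoc's wording, its `[10, 2]` example puts samples along the first dimension and channels along the second.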
  /**
   * Encode audio data using the WAV file format.
-   *
+   *
   * This operation will generate a string suitable to be saved out to create a .wav
   * audio file. It will be encoded in the 16-bit PCM format. It takes in float
   * values in the range -1.0f to 1.0f, and any outside that value will be clamped to
   * that range.
-   *
+   *
   * `audio` is a 2-D float Tensor of shape `[length, channels]`.
   * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-   *
+   *
   * @param audio 2-D with shape `[length, channels]`.
   * @param sampleRate Scalar containing the sample frequency.
   * @return a new instance of EncodeWav
   * @see org.tensorflow.op.AudioOps.encodeWav
   */
  public fun encodeWav(audio: Operand<TFloat32>, sampleRate: Operand<TInt32>): EncodeWav =
-    java.encodeWav(
-      audio,
-      sampleRate
+    java.encodeWav(
+      audio,
+      sampleRate
     )
 
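Continuing the sketch above, the encode op closes the round trip; `tf.io.writeFile` (TensorFlow's `WriteFile` op) and the `contents()` output accessor are assumptions based on the generated Java API:

```kotlin
// Sketch: re-encode the decoded audio as 16-bit PCM WAV bytes.
// Any sample outside [-1.0f, 1.0f] is clamped during encoding.
val encoded = tf.audio.encodeWav(audio, sampleRate)
tf.io.writeFile(tf.constant("speech-copy.wav"), encoded.contents())
```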
  /**
   * Transforms a spectrogram into a form that's useful for speech recognition.
-   *
+   *
   * Mel Frequency Cepstral Coefficients are a way of representing audio data that's
   * been effective as an input feature for machine learning. They are created by
   * taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
   * higher frequencies that are less significant to the human ear. They have a long
   * history in the speech recognition world, and
   * https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
   * is a good resource to learn more.
-   *
+   *
   * @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
   * set to true.
   * @param sampleRate How many samples per second the source audio used.
@@ -188,14 +191,14 @@ public class AudioOps(
     lowerFrequencyLimit: Float? = null,
     filterbankChannelCount: Long? = null,
     dctCoefficientCount: Long? = null
-  ): Mfcc = java.mfcc(
+  ): Mfcc = java.mfcc(
     spectrogram,
     sampleRate,
     *listOfNotNull(
-      upperFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc.upperFrequencyLimit(it) },
-      lowerFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc.lowerFrequencyLimit(it) },
-      filterbankChannelCount?.let { org.tensorflow.op.audio.Mfcc.filterbankChannelCount(it) },
-      dctCoefficientCount?.let { org.tensorflow.op.audio.Mfcc.dctCoefficientCount(it) }
+      upperFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc.upperFrequencyLimit(it) },
+      lowerFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc.lowerFrequencyLimit(it) },
+      filterbankChannelCount?.let { org.tensorflow.op.audio.Mfcc.filterbankChannelCount(it) },
+      dctCoefficientCount?.let { org.tensorflow.op.audio.Mfcc.dctCoefficientCount(it) }
     ).toTypedArray()
-  )
+  )
 }
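Putting the pieces together, a final hedged sketch of the typical speech front end this class supports; the window and filterbank settings below are common choices, not API requirements:

```kotlin
// Sketch: waveform -> power spectrogram -> MFCC features.
val powerSpec = tf.audio.audioSpectrogram(
    audio,
    windowSize = 640L,       // 40 ms window at 16 kHz
    stride = 320L,           // 20 ms hop
    magnitudeSquared = true  // Mfcc expects squared magnitudes
)
val features = tf.audio.mfcc(
    powerSpec,
    sampleRate,
    upperFrequencyLimit = 4000f,   // ignore bands above the speech range
    lowerFrequencyLimit = 20f,
    filterbankChannelCount = 40L,  // mel filterbank resolution
    dctCoefficientCount = 13L      // keep the standard 13 cepstral coefficients
)
```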