Skip to content

Commit cffce11

Browse files
committed
Javadoc generation
Signed-off-by: Ryan Nett <[email protected]>
1 parent 00f8123 commit cffce11

File tree

25 files changed

+20296
-197
lines changed

25 files changed

+20296
-197
lines changed

tensorflow-core-kotlin/tensorflow-core-kotlin-api/src/gen/annotations/org/tensorflow/op/kotlin/AudioOps.kt

Lines changed: 108 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,62 @@ import org.tensorflow.types.TInt32
2828
import org.tensorflow.types.TString
2929

3030
/**
31-
* An API for building {@code audio} operations as {@link org.tensorflow.op.Op Op}s
31+
* An API for building `audio` operations as [Op][org.tensorflow.op.Op]s
3232
*
33-
* @see {@link org.tensorflow.op.Ops}
33+
* @see org.tensorflow.op.Ops
3434
*/
3535
public class AudioOps(
3636
/**
37-
* Get the parent {@link KotlinOps} object.
37+
* Get the parent [KotlinOps] object.
3838
*/
3939
public val ops: KotlinOps
4040
) {
4141
public val java: org.tensorflow.op.AudioOps = ops.java.audio
4242

4343
/**
44-
* Returns the current {@link Scope scope} of this API
44+
* Returns the current [scope][Scope] of this API
4545
*/
4646
public val scope: Scope = ops.scope
4747

48+
/**
49+
* Produces a visualization of audio data over time.
50+
*
51+
* Spectrograms are a standard way of representing audio information as a series of
52+
* slices of frequency information, one slice for each window of time. By joining
53+
* these together into a sequence, they form a distinctive fingerprint of the sound
54+
* over time.
55+
*
56+
* This op expects to receive audio data as an input, stored as floats in the range
57+
* -1 to 1, together with a window width in samples, and a stride specifying how
58+
* far to move the window between slices. From this it generates a three
59+
* dimensional output. The first dimension is for the channels in the input, so a
60+
* stereo audio input would have two here for example. The second dimension is time,
61+
* with successive frequency slices. The third dimension has an amplitude value for
62+
* each frequency during that time slice.
63+
*
64+
* This means the layout when converted and saved as an image is rotated 90 degrees
65+
* clockwise from a typical spectrogram. Time is descending down the Y axis, and
66+
* the frequency decreases from left to right.
67+
*
68+
* Each value in the result represents the square root of the sum of the real and
69+
* imaginary parts of an FFT on the current window of samples. In this way, the
70+
* lowest dimension represents the power of each frequency in the current window,
71+
* and adjacent windows are concatenated in the next dimension.
72+
*
73+
* To get a more intuitive and visual look at what this operation does, you can run
74+
* tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
75+
* resulting spectrogram as a PNG image.
76+
*
77+
* @param input Float representation of audio data.
78+
* @param windowSize How wide the input window is in samples. For the highest efficiency
79+
* this should be a power of two, but other values are accepted.
80+
* @param stride How widely apart the center of adjacent sample windows should be.
81+
 * @param options carries optional attribute values
82+
* @return a new instance of AudioSpectrogram
83+
* @see org.tensorflow.op.AudioOps.audioSpectrogram
84+
* @param magnitudeSquared Whether to return the squared magnitude or just the
85+
* magnitude. Using squared magnitude can avoid extra calculations.
86+
*/
4887
public fun audioSpectrogram(
4988
input: Operand<TFloat32>,
5089
windowSize: Long,
@@ -59,6 +98,31 @@ public class AudioOps(
5998
).toTypedArray()
6099
)
61100

101+
/**
102+
* Decode a 16-bit PCM WAV file to a float tensor.
103+
*
104+
* The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
105+
*
106+
* When desired_channels is set, if the input contains fewer channels than this
107+
* then the last channel will be duplicated to give the requested number, else if
108+
* the input has more channels than requested then the additional channels will be
109+
* ignored.
110+
*
111+
* If desired_samples is set, then the audio will be cropped or padded with zeroes
112+
* to the requested length.
113+
*
114+
* The first output contains a Tensor with the content of the audio samples. The
115+
* lowest dimension will be the number of channels, and the second will be the
116+
* number of samples. For example, a ten-sample-long stereo WAV file should give an
117+
* output shape of &#91;10, 2].
118+
*
119+
* @param contents The WAV-encoded audio, usually from a file.
120+
 * @param options carries optional attribute values
121+
* @return a new instance of DecodeWav
122+
* @see org.tensorflow.op.AudioOps.decodeWav
123+
* @param desiredChannels Number of sample channels wanted.
124+
* @param desiredSamples Length of audio requested.
125+
*/
62126
public fun decodeWav(
63127
contents: Operand<TString>,
64128
desiredChannels: Long? = null,
@@ -71,12 +135,52 @@ public class AudioOps(
71135
).toTypedArray()
72136
)
73137

138+
/**
139+
* Encode audio data using the WAV file format.
140+
*
141+
* This operation will generate a string suitable to be saved out to create a .wav
142+
* audio file. It will be encoded in the 16-bit PCM format. It takes in float
143+
* values in the range -1.0f to 1.0f, and any outside that value will be clamped to
144+
* that range.
145+
*
146+
* `audio` is a 2-D float Tensor of shape `&#91;length, channels]`.
147+
* `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
148+
*
149+
* @param audio 2-D with shape `&#91;length, channels]`.
150+
* @param sampleRate Scalar containing the sample frequency.
151+
* @return a new instance of EncodeWav
152+
* @see org.tensorflow.op.AudioOps.encodeWav
153+
*/
74154
public fun encodeWav(audio: Operand<TFloat32>, sampleRate: Operand<TInt32>): EncodeWav =
75155
java.encodeWav(
76156
audio,
77157
sampleRate
78158
)
79159

160+
/**
161+
* Transforms a spectrogram into a form that's useful for speech recognition.
162+
*
163+
* Mel Frequency Cepstral Coefficients are a way of representing audio data that's
164+
* been effective as an input feature for machine learning. They are created by
165+
* taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
166+
* higher frequencies that are less significant to the human ear. They have a long
167+
* history in the speech recognition world, and
168+
* https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
169+
* is a good resource to learn more.
170+
*
171+
* @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
172+
* set to true.
173+
* @param sampleRate How many samples per second the source audio used.
174+
 * @param options carries optional attribute values
175+
* @return a new instance of Mfcc
176+
* @see org.tensorflow.op.AudioOps.mfcc
177+
* @param upperFrequencyLimit The highest frequency to use when calculating the
178+
 * cepstrum.
179+
* @param lowerFrequencyLimit The lowest frequency to use when calculating the
180+
 * cepstrum.
181+
* @param filterbankChannelCount Resolution of the Mel bank used internally.
182+
* @param dctCoefficientCount How many output channels to produce per time slice.
183+
*/
80184
public fun mfcc(
81185
spectrogram: Operand<TFloat32>,
82186
sampleRate: Operand<TInt32>,

0 commit comments

Comments
 (0)