# neuro-lab8/preprocessing.py

from tensorflow import keras
import tensorflow as tf

# Character vocabulary for the transcriptions (lowercase letters, apostrophe,
# punctuation, and space); anything else maps to the empty OOV token.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(),
                                        oov_token="", invert=True)

# STFT parameters (in samples) for the spectrogram features.
frame_length = 256
frame_step = 160
fft_length = 384

# Directory containing the LJSpeech wav files.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'
def encode_single_sample(wav, label):
    """Turn one wav file name and its transcription into (spectrogram, label ids)."""
    # Read and decode the wav file, then drop the channel dimension.
    file = tf.io.read_file(wavs + wav + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    # Short-time Fourier transform -> magnitude spectrogram, with a square root
    # to compress the dynamic range.
    spectrogram = tf.signal.stft(audio,
                                 frame_length=frame_length,
                                 frame_step=frame_step,
                                 fft_length=fft_length)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalise each frame to zero mean and unit variance across frequency bins.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Lowercase the transcription, split it into characters, and map to integer ids.
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label
def to_dataset(df, batch_size=32):
    """Build a batched tf.data pipeline from a metadata DataFrame."""
    ds = tf.data.Dataset.from_tensor_slices((list(df["file_name"]),
                                             list(df["normalized_transcription"])))
    # Encode samples in parallel, pad each batch to a common length, and prefetch.
    ds = ds.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) \
           .padded_batch(batch_size) \
           .prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds
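

# A minimal usage sketch, not part of the original lab script: it assumes the
# standard LJSpeech metadata.csv layout (pipe-separated, no header, columns
# file_name | transcription | normalized_transcription), that metadata.csv sits
# next to the wavs/ directory above, and that pandas is available.
if __name__ == "__main__":
    import pandas as pd

    metadata = pd.read_csv("/mnt/tmpfs1/LJSpeech-1.1/metadata.csv",
                           sep="|", header=None, quoting=3,
                           names=["file_name", "transcription",
                                  "normalized_transcription"])
    # Shuffle the rows and build one batched pipeline, then inspect one batch.
    train_ds = to_dataset(metadata.sample(frac=1.0), batch_size=32)
    for spectrograms, labels in train_ds.take(1):
        print(spectrograms.shape, labels.shape)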