2025-12-08 17:10:47 +02:00
|
|
|
from tensorflow import keras
|
|
|
|
|
import tensorflow as tf
|
|
|
|
|
|
|
|
|
|
# Vocabulary of output symbols: one entry per character of the string below.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Forward lookup (character -> integer id); the OOV token is the empty string.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup (integer id -> character), built from the vocabulary reported
# by char_to_num so both layers share the same table.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

# Parameters forwarded to tf.signal.stft when computing spectrograms.
frame_length = 256
frame_step = 160
fft_length = 384

# Directory prefix that encode_single_sample prepends to a bare file name.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'
|
|
|
|
|
|
|
|
|
|
def encode_single_sample(wav, label):
    """Encode one sample whose audio lives in the default ``wavs`` directory.

    Kept for backward compatibility: builds the full path as
    ``wavs + wav + ".wav"`` and delegates to
    ``encode_single_sample_selectable_dir``.

    Args:
        wav: File name without directory or ``.wav`` extension.
        label: Transcription text for the sample.

    Returns:
        The ``(spectrogram, label)`` pair produced by
        ``encode_single_sample_selectable_dir``.
    """
    # for backward compatibility
    # The result must be returned: without it this wrapper yields None and
    # callers (e.g. Dataset.map) receive no spectrogram/label pair.
    return encode_single_sample_selectable_dir(wavs + wav + ".wav", label)
|
|
|
|
|
|
|
|
|
|
def encode_single_sample_selectable_dir(wav, label):
    """Turn a WAV path and its transcription into model-ready tensors.

    Audio side: the file at ``wav`` is read and decoded, the last axis is
    squeezed away, and the square root of the STFT magnitude is normalised
    along axis 1 using that axis' mean and standard deviation.

    Text side: ``label`` is lower-cased, split into UTF-8 characters and
    mapped through ``char_to_num``.

    Args:
        wav: Full path of the WAV file to read.
        label: Transcription text for that file.

    Returns:
        A ``(spectrogram, label)`` tuple: the normalised spectrogram and the
        output of ``char_to_num`` for the label's characters.
    """
    # --- audio -> normalised spectrogram ---
    raw_bytes = tf.io.read_file(wav)
    waveform, _sample_rate = tf.audio.decode_wav(raw_bytes)
    # Drops the trailing axis (assumes single-channel audio -- TODO confirm).
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)

    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    magnitude = tf.math.pow(tf.abs(stft), 0.5)

    axis_mean = tf.math.reduce_mean(magnitude, 1, keepdims=True)
    axis_std = tf.math.reduce_std(magnitude, 1, keepdims=True)
    # The small epsilon keeps the division defined when the deviation is zero.
    normalised = (magnitude - axis_mean) / (axis_std + 1e-10)

    # --- text -> ids from char_to_num ---
    lowered = tf.strings.lower(label)
    chars = tf.strings.unicode_split(lowered, input_encoding="UTF-8")
    encoded_label = char_to_num(chars)

    return normalised, encoded_label
|
|
|
|
|
|
|
|
|
|
def to_dataset(df, batch_size = 32):
    """Build a batched, prefetched ``tf.data.Dataset`` from a metadata table.

    Args:
        df: Table providing the ``"file_name"`` and
            ``"normalized_transcription"`` columns.
        batch_size: Number of samples per padded batch.

    Returns:
        A dataset whose elements are padded batches of the values produced by
        ``encode_single_sample``.
    """
    file_names = list(df["file_name"])
    transcriptions = list(df["normalized_transcription"])

    samples = tf.data.Dataset.from_tensor_slices((file_names, transcriptions))
    encoded = samples.map(encode_single_sample,
                          num_parallel_calls=tf.data.AUTOTUNE)
    # padded_batch pads the elements of each batch to a common shape.
    batched = encoded.padded_batch(batch_size)

    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)
|