neuro-lab8/preprocessing.py

50 lines
1.9 KiB
Python
Raw Normal View History

2025-12-08 17:10:47 +02:00
from tensorflow import keras
import tensorflow as tf
# Transcription alphabet: lowercase latin letters, apostrophe, question mark,
# exclamation mark and space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Forward lookup: single characters -> integer ids (out-of-vocabulary token is
# the empty string).
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup: integer ids -> characters, built from the forward layer's
# vocabulary so both directions stay in sync.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

# Parameters handed to tf.signal.stft when computing spectrograms.
frame_length = 256
frame_step = 160
fft_length = 384

# Default directory that encode_single_sample prepends to bare file names.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'
def encode_single_sample(wav, label):
    """Encode one sample whose audio lives in the default ``wavs`` directory.

    Kept for backward compatibility: builds the full path from the bare file
    name and delegates to ``encode_single_sample_selectable_dir``.

    Args:
        wav: File name without directory or ``.wav`` extension.
        label: Transcription text for the sample.

    Returns:
        Whatever ``encode_single_sample_selectable_dir`` returns for the
        resolved path, i.e. a ``(spectrogram, label)`` pair.
    """
    # Plain ``+`` is used on purpose: it works both for Python strings and for
    # string tensors (as passed in by ``Dataset.map``).
    # The result must be returned; otherwise the mapped function yields None.
    return encode_single_sample_selectable_dir(wavs + wav + ".wav", label)
def encode_single_sample_selectable_dir(wav, label):
    """Turn a WAV file path and its transcription into model-ready tensors.

    Args:
        wav: Full path to the WAV file to read.
        label: Transcription text for the sample.

    Returns:
        A ``(spectrogram, label)`` pair: the normalized square-root magnitude
        spectrogram of the audio, and the transcription mapped through
        ``char_to_num``.
    """
    # --- Audio: read, decode, drop the trailing axis, cast to float32 ---
    raw_bytes = tf.io.read_file(wav)
    waveform, _sample_rate = tf.audio.decode_wav(raw_bytes)
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)

    # --- Spectrogram: STFT magnitude, compressed with a square root ---
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    magnitude = tf.math.pow(tf.abs(stft), 0.5)

    # --- Normalization along axis 1; the epsilon guards against a zero
    # standard deviation ---
    mean = tf.math.reduce_mean(magnitude, 1, keepdims=True)
    std = tf.math.reduce_std(magnitude, 1, keepdims=True)
    normalized = (magnitude - mean) / (std + 1e-10)

    # --- Label: lowercase, split into characters, map to integer ids ---
    lowered = tf.strings.lower(label)
    chars = tf.strings.unicode_split(lowered, input_encoding="UTF-8")
    return normalized, char_to_num(chars)
def to_dataset(df, batch_size=32):
    """Build a batched, prefetched ``tf.data.Dataset`` from a dataframe.

    Args:
        df: Table with ``file_name`` and ``normalized_transcription`` columns.
        batch_size: Number of samples per padded batch.

    Returns:
        A dataset that maps every row through ``encode_single_sample``,
        groups the results into padded batches and prefetches them.
    """
    file_names = list(df["file_name"])
    transcriptions = list(df["normalized_transcription"])

    samples = tf.data.Dataset.from_tensor_slices((file_names, transcriptions))
    encoded = samples.map(
        encode_single_sample,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    # padded_batch pads each component up to the longest element in its batch.
    batched = encoded.padded_batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)