# neuro-lab8/preprocessing.py

from tensorflow import keras
import tensorflow as tf

# Character vocabulary for the transcriptions (lowercase letters, apostrophe,
# punctuation, and space); anything else maps to the empty OOV token.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(),
                                        oov_token="", invert=True)

# STFT parameters (in samples) for the spectrogram features.
frame_length = 256
frame_step = 160
fft_length = 384

# Directory containing the LJSpeech wav files.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'
def encode_single_sample(wav, label):
    """Turn one wav file name and its transcription into (spectrogram, label ids)."""
    # Read and decode the wav file, then drop the channel dimension.
    file = tf.io.read_file(wavs + wav + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    # Short-time Fourier transform -> magnitude spectrogram, with a square root
    # to compress the dynamic range.
    spectrogram = tf.signal.stft(audio,
                                 frame_length=frame_length,
                                 frame_step=frame_step,
                                 fft_length=fft_length)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalise each frame to zero mean and unit variance across frequency bins.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Lowercase the transcription, split it into characters, and map to integer ids.
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label
def to_dataset(df, batch_size=32):
    """Build a batched tf.data pipeline from a metadata DataFrame."""
    ds = tf.data.Dataset.from_tensor_slices((list(df["file_name"]),
                                             list(df["normalized_transcription"])))
    # Encode samples in parallel, pad each batch to a common length, and prefetch.
    ds = ds.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) \
           .padded_batch(batch_size) \
           .prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds
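

# A minimal usage sketch, not part of the original lab script: it assumes the
# standard LJSpeech metadata.csv layout (pipe-separated, no header, columns
# file_name | transcription | normalized_transcription), that metadata.csv sits
# next to the wavs/ directory above, and that pandas is available.
if __name__ == "__main__":
    import pandas as pd

    metadata = pd.read_csv("/mnt/tmpfs1/LJSpeech-1.1/metadata.csv",
                           sep="|", header=None, quoting=3,
                           names=["file_name", "transcription",
                                  "normalized_transcription"])
    # Shuffle the rows and build one batched pipeline, then inspect one batch.
    train_ds = to_dataset(metadata.sample(frac=1.0), batch_size=32)
    for spectrograms, labels in train_ds.take(1):
        print(spectrograms.shape, labels.shape)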