"""Audio preprocessing for LJSpeech-style speech recognition.

Turns WAV files into per-frame-normalized magnitude spectrograms and
transcriptions into integer character sequences, then bundles both into a
batched ``tf.data.Dataset``.
"""
from tensorflow import keras
import tensorflow as tf

# Output character vocabulary; the empty string is used as the OOV token
# so unknown characters map to index 0.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# STFT parameters, in samples: analysis window length, hop between
# successive windows, and FFT size.
frame_length = 256
frame_step = 160
fft_length = 384

# Default directory containing the LJSpeech WAV files.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'


def encode_single_sample(wav, label):
    """Encode one sample given a bare file stem under the default ``wavs`` dir.

    Kept for backward compatibility with callers that pass only the file
    stem (no directory, no ``.wav`` extension).

    Args:
        wav: File stem of the WAV file inside ``wavs`` (no extension).
        label: Transcription string for the utterance.

    Returns:
        Tuple ``(spectrogram, label)`` as produced by
        ``encode_single_sample_selectable_dir``.
    """
    # BUG FIX: the original version dropped the return value, so this
    # function returned None and broke ds.map() in to_dataset().
    return encode_single_sample_selectable_dir(wavs + wav + ".wav", label)


def encode_single_sample_selectable_dir(wav, label):
    """Encode one (audio file, transcription) pair for training.

    Args:
        wav: Full path to a WAV file. Assumed mono-decodable; the trailing
            channel axis is squeezed away — TODO confirm inputs are mono.
        label: Transcription string.

    Returns:
        Tuple of:
          * spectrogram: float32 tensor of per-frame-normalized STFT
            magnitudes raised to the power 0.5 (square-root compression),
            shape (frames, fft_length // 2 + 1).
          * label: int tensor of character indices from ``char_to_num``.
    """
    file = tf.io.read_file(wav)
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # Magnitude with square-root compression of the linear spectrum.
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalize each frame to zero mean / unit variance; the epsilon
    # guards against division by zero on silent frames.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Lowercase, split into unicode characters, map to integer indices.
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label


def to_dataset(df, batch_size=32):
    """Build a batched, prefetching dataset from a transcription table.

    Args:
        df: DataFrame-like object with columns ``file_name`` (WAV file
            stems relative to ``wavs``) and ``normalized_transcription``.
        batch_size: Number of samples per padded batch.

    Returns:
        A ``tf.data.Dataset`` of padded (spectrogram, label) batches.
    """
    ds = tf.data.Dataset.from_tensor_slices(
        (list(df["file_name"]), list(df["normalized_transcription"]))
    )
    ds = (
        ds.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
        # Spectrograms and labels vary in length, so pad within each batch.
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )
    return ds