neuro-lab8/preprocessing.py

from tensorflow import keras
import tensorflow as tf

# STFT parameters consumed by encode_single_sample_selectable_dir.
frame_length = 256
frame_step = 160
fft_length = 384

# Default directory that encode_single_sample prepends to clip names.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'

# Symbols a transcript may contain: lowercase letters, apostrophe,
# question mark, exclamation mark and space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Forward lookup: character -> integer index. The out-of-vocabulary token
# is the empty string.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup: integer index -> character, built from the forward
# layer's vocabulary so that both directions agree.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

def encode_single_sample(wav, label):
    """Encode one sample whose clip lives in the default ``wavs`` directory.

    Kept for backward compatibility: callers pass only the clip's base name
    and this wrapper builds the full path (``wavs + wav + ".wav"``) before
    delegating to ``encode_single_sample_selectable_dir``.

    Args:
        wav: Base name of the clip, without directory or ``.wav`` suffix.
        label: Transcription text for the clip.

    Returns:
        The ``(spectrogram, label)`` pair produced by
        ``encode_single_sample_selectable_dir`` for the resolved path.
    """
    # The result has to be propagated: ``to_dataset`` maps this function over
    # a dataset and needs the encoded pair as the mapped element. Without the
    # ``return`` this wrapper would always yield ``None``.
    return encode_single_sample_selectable_dir(wavs + wav + ".wav", label)

def encode_single_sample_selectable_dir(wav, label):
    """Turn a WAV file path and its transcript into a pair of tensors.

    Args:
        wav: Full path of the WAV file to read.
        label: Transcription text for the clip.

    Returns:
        A ``(spectrogram, label)`` tuple: the standardised square-root
        magnitude STFT of the audio, and the lower-cased transcript mapped
        character by character through ``char_to_num``.
    """
    # --- Audio ---
    raw_bytes = tf.io.read_file(wav)
    waveform, _sample_rate = tf.audio.decode_wav(raw_bytes)
    # Drop the trailing axis (it must have size 1) and ensure float32 samples.
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)

    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # Keep only the magnitude, compressed with a square root.
    magnitude = tf.math.pow(tf.abs(stft), 0.5)

    # Standardise along axis 1; the epsilon guards against division by zero.
    mean = tf.math.reduce_mean(magnitude, axis=1, keepdims=True)
    std = tf.math.reduce_std(magnitude, axis=1, keepdims=True)
    spectrogram = (magnitude - mean) / (std + 1e-10)

    # --- Text ---
    chars = tf.strings.unicode_split(
        tf.strings.lower(label), input_encoding="UTF-8"
    )
    return spectrogram, char_to_num(chars)

def to_dataset(df, batch_size=32):
    """Build a batched ``tf.data`` pipeline from a table of samples.

    Args:
        df: Object indexable by column name that provides ``"file_name"``
            and ``"normalized_transcription"`` (presumably a pandas
            DataFrame; confirm against callers).
        batch_size: Number of samples per padded batch.

    Returns:
        A ``tf.data.Dataset`` of padded batches of whatever
        ``encode_single_sample`` yields, with prefetching enabled.
    """
    file_names = list(df["file_name"])
    transcripts = list(df["normalized_transcription"])

    samples = tf.data.Dataset.from_tensor_slices((file_names, transcripts))
    encoded = samples.map(
        encode_single_sample,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    # Pad each batch so samples of different lengths can be stacked.
    batched = encoded.padded_batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)
initial commit (pre ctc) 2025-12-08 17:10:47 +02:00			`from tensorflow import keras`
			`import tensorflow as tf`

			`characters = list("abcdefghijklmnopqrstuvwxyz'?! ")`

			`char_to_num = keras.layers.StringLookup(vocabulary = characters,`
			`oov_token = "")`

			`num_to_char = keras.layers.StringLookup(vocabulary = char_to_num.get_vocabulary(),`
			`oov_token = "",`
			`invert = True)`

			`frame_length = 256`
			`frame_step = 160`
			`fft_length = 384`

			`wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'`

			`def encode_single_sample(wav, label):`
incremental update 2025-12-11 12:57:18 +02:00			`# for backward compatibility`
			`encode_single_sample_selectable_dir(wavs + wav + ".wav", label)`

			`def encode_single_sample_selectable_dir(wav, label):`
			`file = tf.io.read_file(wav)`
initial commit (pre ctc) 2025-12-08 17:10:47 +02:00			`audio, _ = tf.audio.decode_wav(file)`
			`audio = tf.squeeze(audio, axis = -1)`
			`audio = tf.cast(audio, tf.float32)`
			`spectrogram = tf.signal.stft(audio,`
			`frame_length = frame_length,`
			`frame_step = frame_step,`
			`fft_length = fft_length)`
			`spectrogram = tf.abs(spectrogram)`
			`spectrogram = tf.math.pow(spectrogram, 0.5)`
			`means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)`
			`stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)`
			`spectrogram = (spectrogram - means) / (stddevs + 1e-10)`
			`label = tf.strings.lower(label)`
			`label = tf.strings.unicode_split(label, input_encoding="UTF-8")`
			`label = char_to_num(label)`
			`return spectrogram, label`

			`def to_dataset(df, batch_size = 32):`
			`ds = tf.data.Dataset.from_tensor_slices((list(df["file_name"]),`
			`list(df["normalized_transcription"])))`
incremental backup 2025-12-09 08:09:49 +02:00			`ds = ds.map(encode_single_sample, num_parallel_calls = tf.data.AUTOTUNE) \`
initial commit (pre ctc) 2025-12-08 17:10:47 +02:00			`.padded_batch(batch_size) \`
			`.prefetch(buffer_size=tf.data.AUTOTUNE)`

			`return ds`