initial commit (pre ctc)

This commit is contained in:
ІО-23 Шмуляр Олег 2025-12-08 17:10:47 +02:00
commit d998a04d47
5 changed files with 200 additions and 0 deletions

43
cc.py Normal file
View File

@ -0,0 +1,43 @@
import tensorflow as tf
from tensorflow import keras
from jiwer import wer
import numpy as np
from preprocessing import *
def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of network outputs into text.

    Args:
        pred: Batch of per-timestep class scores; ``pred.shape[0]`` is used
            as the batch size and ``pred.shape[1]`` as the number of
            timesteps.

    Returns:
        A list with one decoded Python ``str`` per batch element.
    """
    batch_size = pred.shape[0]
    time_steps = pred.shape[1]
    # Every sample is decoded over the full time axis of the batch.
    sequence_lengths = np.ones(batch_size) * time_steps

    decode_result = keras.backend.ctc_decode(
        pred,
        input_length=sequence_lengths,
        greedy=True,
    )
    best_paths = decode_result[0][0]

    # Map class indices back to characters and glue them into one string.
    return [
        tf.strings.reduce_join(num_to_char(path)).numpy().decode("utf-8")
        for path in best_paths
    ]
class ce(keras.callbacks.Callback):
    """Callback that prints the word error rate on a dataset after each epoch.

    After every epoch the wrapped model is run over ``dataset``, the outputs
    are decoded with ``decode_batch_predictions`` and compared against the
    decoded labels using ``jiwer.wer``. The score and up to ten randomly
    chosen target/prediction pairs are printed.

    Args:
        dataset: Iterable yielding ``(X, y)`` batches, where ``y`` holds the
            label index sequences understood by ``num_to_char``.
        model: Model whose ``predict`` method is used for evaluation.
    """

    def __init__(self, dataset, model):
        super().__init__()
        self.dataset = dataset
        # NOTE(review): stored under its own (name-mangled) attribute rather
        # than ``model``, presumably to avoid clashing with the ``model``
        # attribute that Keras manages on callbacks -- confirm.
        self.____model = model

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``self.dataset`` and print WER plus sample transcripts.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict supplied by Keras (unused).
        """
        predictions = []
        targets = []
        for X, y in self.dataset:
            batch_predictions = self.____model.predict(X, verbose=0)
            predictions.extend(decode_batch_predictions(batch_predictions))
            targets.extend(
                tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
                for label in y
            )

        # An empty dataset would make the random sampling below raise;
        # report it and skip instead.
        if not predictions:
            print("Word Error Rate: n/a (evaluation dataset is empty)")
            return

        wer_score = wer(targets, predictions)
        print(f"Word Error Rate: {wer_score:.4f}")

        # Show at most ten distinct examples; never more than are available.
        sample_count = min(10, len(predictions))
        sample_indices = np.random.choice(
            len(predictions), size=sample_count, replace=False
        )
        for i in sample_indices:
            print(f"Target : {targets[i]}")
            print(f"Prediction: {predictions[i]}")

13
loss.py Normal file
View File

@ -0,0 +1,13 @@
import tensorflow as tf
from tensorflow import keras
def CTCLoss(y_true, y_pred):
    """Compute the CTC loss for a batch via ``keras.backend.ctc_batch_cost``.

    Note that every sample is given the same lengths: the size of axis 1 of
    ``y_pred`` as input length and the size of axis 1 of ``y_true`` as label
    length, i.e. the full (padded) extent of the batch tensors.

    Args:
        y_true: Batch of label index sequences.
        y_pred: Batch of per-timestep class scores produced by the model.

    Returns:
        The tensor returned by ``keras.backend.ctc_batch_cost``.
    """
    batch_size = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Column vector of ones used to broadcast the scalar lengths per sample.
    per_sample = tf.ones(shape=(batch_size, 1), dtype="int64")
    input_length = time_steps * per_sample
    label_length = label_steps * per_sample

    return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

58
model.py Normal file
View File

@ -0,0 +1,58 @@
from tensorflow.keras import layers as kl
from tensorflow.keras import models as km
from tensorflow.keras import losses as ks
from tensorflow.keras import optimizers as ko
from tensorflow.keras import callbacks as kc
from tensorflow import keras
from loss import CTCLoss
def model(input_dim, output_dim, rnn_layers=3, rnn_units=72):
    """Build and compile the convolutional + bidirectional-GRU network.

    The input of shape ``(None, input_dim)`` goes through two
    Conv2D/BatchNormalization/ReLU stages, ``rnn_layers`` bidirectional GRU
    layers (with dropout between them, but not after the last one), a ReLU
    dense layer with dropout, and a softmax layer of ``output_dim + 1``
    units. The model is compiled with the Lion optimizer and ``CTCLoss``.

    Args:
        input_dim: Size of the last axis of the input features.
        output_dim: Number of label classes; the output layer has one more
            unit than this (presumably the CTC blank -- confirm).
        rnn_layers: Number of stacked bidirectional GRU layers.
        rnn_units: Units per GRU direction.

    Returns:
        The compiled ``keras.Model``.
    """

    def conv_block(tensor, kernel_size, strides):
        # One Conv2D -> BatchNormalization -> ReLU stage with 32 filters.
        tensor = kl.Conv2D(
            32,
            kernel_size=kernel_size,
            strides=strides,
            padding='same',
            use_bias=False,
        )(tensor)
        tensor = kl.BatchNormalization()(tensor)
        return kl.ReLU()(tensor)

    inputs = kl.Input((None, input_dim))

    # Add a trailing channel axis so the features can feed Conv2D.
    x = kl.Reshape((-1, input_dim, 1))(inputs)
    x = conv_block(x, [11, 41], [2, 2])
    x = conv_block(x, [11, 21], [1, 2])

    # Merge the last two axes so each timestep is one flat feature vector.
    x = kl.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    for layer_no in range(1, rnn_layers + 1):
        gru = kl.GRU(
            rnn_units,
            activation='tanh',
            recurrent_activation='sigmoid',
            use_bias=True,
            return_sequences=True,
            reset_after=True,
        )
        x = kl.Bidirectional(gru, merge_mode='concat')(x)
        # Dropout only between recurrent layers, not after the final one.
        if layer_no < rnn_layers:
            x = kl.Dropout(rate=0.5)(x)

    x = kl.Dense(rnn_units * 2, activation='relu')(x)
    x = kl.Dropout(0.5)(x)
    outputs = kl.Dense(output_dim + 1, activation='softmax')(x)

    network = keras.Model(inputs, outputs)
    network.compile(optimizer=ko.Lion(0.0004), loss=CTCLoss)
    return network

45
preprocessing.py Normal file
View File

@ -0,0 +1,45 @@
from tensorflow import keras
import tensorflow as tf
# Alphabet accepted in transcriptions: lowercase letters, apostrophe,
# question mark, exclamation mark and space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Character -> integer index lookup; the out-of-vocabulary token is "".
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup (integer index -> character) over the same vocabulary.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

# Parameters handed to tf.signal.stft when computing spectrograms.
frame_length, frame_step, fft_length = 256, 160, 384

# Directory prefix prepended to every wav file name.
wavs = '/mnt/tmpfs1/LJSpeech-1.1/wavs/'
def encode_single_sample(wav, label):
    """Load one utterance and turn it into (spectrogram, label indices).

    Args:
        wav: File name without directory or ``.wav`` extension; it is read
            from the ``wavs`` directory.
        label: Transcription text for the utterance.

    Returns:
        A ``(spectrogram, label)`` pair: the normalised square-root magnitude
        STFT of the audio, and the lowercased transcription split into
        characters and mapped through ``char_to_num``.
    """
    # --- audio -> waveform ---
    raw_bytes = tf.io.read_file(wavs + wav + ".wav")
    waveform, _sample_rate = tf.audio.decode_wav(raw_bytes)
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)

    # --- waveform -> square-root magnitude spectrogram ---
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    features = tf.math.pow(tf.abs(stft), 0.5)

    # Standardise using mean/std taken over axis 1; the epsilon guards
    # against division by zero.
    means = tf.math.reduce_mean(features, 1, keepdims=True)
    stddevs = tf.math.reduce_std(features, 1, keepdims=True)
    features = (features - means) / (stddevs + 1e-10)

    # --- text -> index sequence ---
    chars = tf.strings.unicode_split(
        tf.strings.lower(label), input_encoding="UTF-8"
    )
    return features, char_to_num(chars)
def to_dataset(df, batch_size=32):
    """Build a batched ``tf.data`` pipeline from a metadata frame.

    Args:
        df: Table with ``file_name`` and ``normalized_transcription``
            columns.
        batch_size: Number of samples per padded batch.

    Returns:
        A dataset of ``encode_single_sample`` outputs, padded-batched and
        prefetched.
    """
    file_names = list(df["file_name"])
    transcripts = list(df["normalized_transcription"])

    samples = tf.data.Dataset.from_tensor_slices((file_names, transcripts))
    encoded = samples.map(
        encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE
    )
    # padded_batch pads each batch's elements to a common shape.
    batched = encoded.padded_batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)

41
train.py Normal file
View File

@ -0,0 +1,41 @@
from model import *
from preprocessing import *
import pandas as pd
from cc import ce
# Training script: load metadata, split it, restore weights and continue
# training with checkpointing and per-epoch WER reporting.
bs = 32

# The metadata file is pipe-separated with no header row; quoting=3 is
# csv.QUOTE_NONE, so quote characters in transcriptions are kept literally.
data = pd.read_csv(
    "/mnt/tmpfs1/LJSpeech-1.1/metadata.csv",
    sep='|',
    header=None,
    quoting=3,
    names=['file_name', 'i', 'normalized_transcription'],
)

# Only the first half of the rows is used: 90% of it for training and the
# remaining 10% for validation.
half = len(data) // 2
s = int(half * 0.90)
train_data = data[:s]
valid_data = data[s:half]

train_ds = to_dataset(train_data, batch_size=bs)
valid_ds = to_dataset(valid_data, batch_size=bs)

m = model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
)
# Resume from the weights saved by the previous run.
m.load_weights('model20-latest.keras')

# Settings shared by both checkpoint callbacks.
ckpt_common = dict(monitor='val_loss', verbose=1)
# Written after every epoch.
ckpt1 = kc.ModelCheckpoint(
    'model21-latest.keras', save_best_only=False, **ckpt_common
)
# Written only when val_loss improves.
ckpt2 = kc.ModelCheckpoint(
    'model21-best.keras', save_best_only=True, **ckpt_common
)

ce1 = ce(valid_ds, m)

m.fit(
    train_ds,
    epochs=8,
    validation_data=valid_ds,
    callbacks=[ckpt1, ckpt2, ce1],
)