# neuro-lab7/main.py

from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()   # enables Series.progress_apply with a progress bar
print("I")

train = pd.read_csv("yelp_review_polarity_csv/train.csv",
                    header = None,
                    names = ['class', 'review'])
print("data loaded")

# CSV labels are 1 (negative) / 2 (positive); shift them to 0/1 for the sigmoid output.
labels = train['class'] - 1
reviews = train['review']

# Uncomment on the first run to download the NLTK resources used below.
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Lowercase, keep word characters only, drop stop words, lemmatize verbs."""
    text = text.lower()
    # \w+ already discards punctuation and newlines, so no further
    # character stripping is needed
    text = " ".join(re.findall(r'\w+', text))
    return " ".join(lemmatizer.lemmatize(token, pos = 'v')
                    for token in word_tokenize(text)
                    if token not in stop_words)

reviews = reviews.progress_apply(preprocess)
#print(reviews)
print("preprocessing done")

# Keep only the 6,000 most frequent tokens; everything else is dropped
# when the texts are encoded.
tokenizer = Tokenizer(num_words = 6000)
tokenizer.fit_on_texts(reviews)
print("tokenizer fitted")
#print(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(reviews)
#print(sequences)
print("texts encoded")

# Pad/truncate every encoded review to a fixed length of 100 tokens.
padded = pad_sequences(sequences, maxlen = 100)
print("sequences padded")

model = models.Sequential([
    layers.Input(shape = (None, ), dtype = 'int32'),  # variable-length token ids
    layers.Embedding(6000, 96),                       # 96-d vector per token id
    layers.Dropout(0.2),
    layers.Conv1D(128, 5, activation = 'relu'),       # local 5-gram features
    layers.LSTM(128, return_sequences = True),
    layers.LSTM(64),
    layers.Dense(64),                                 # note: linear, no activation specified
    layers.Dropout(0.5),
    layers.Dense(1, activation = 'sigmoid')           # binary polarity output
])

model.compile(optimizer = optimizers.Lion(learning_rate = 0.0005),
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
#model.summary()

# Save only the weights that achieve the best validation accuracy so far.
ckpt = callbacks.ModelCheckpoint('model1.keras',
                                 monitor = 'val_accuracy',
                                 save_best_only = True,
                                 verbose = 1)

history = model.fit(padded,
                    labels,
                    epochs = 3,
                    batch_size = 256,
                    validation_split = 0.1,
                    callbacks = [ckpt])
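
# A minimal evaluation sketch. It assumes the matching test split sits at
# yelp_review_polarity_csv/test.csv in the same two-column format as
# train.csv; adjust the path if your copy of the dataset lives elsewhere.
test = pd.read_csv("yelp_review_polarity_csv/test.csv",
                   header = None,
                   names = ['class', 'review'])
test_labels = test['class'] - 1
test_padded = pad_sequences(
    tokenizer.texts_to_sequences(test['review'].progress_apply(preprocess)),
    maxlen = 100)

# Reload the best checkpoint saved above rather than the last-epoch weights.
best = models.load_model('model1.keras')
loss, acc = best.evaluate(test_padded, test_labels, batch_size = 256)
print(f"test loss {loss:.4f}, test accuracy {acc:.4f}")

# Single-review inference: outputs above 0.5 are predicted positive.
sample = pad_sequences(
    tokenizer.texts_to_sequences([preprocess("The food was great!")]),
    maxlen = 100)
print("p(positive) =", float(best.predict(sample)[0, 0]))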