This commit is contained in:
ІО-23 Шмуляр Олег 2025-12-06 15:56:00 +02:00
parent 9c3e92b4f8
commit 93229d32d1
4 changed files with 67 additions and 44 deletions

54
main.py
View File

@ -7,59 +7,24 @@ from tensorflow.keras import callbacks as kc
from tensorflow.keras.preprocessing.text import Tokenizer as kT from tensorflow.keras.preprocessing.text import Tokenizer as kT
from tensorflow.keras.utils import pad_sequences as kps from tensorflow.keras.utils import pad_sequences as kps
import re
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()
print("I") print("I")
t = pd.read_csv("yelp_review_polarity_csv/train.csv", t = pd.read_csv("prepped_train.csv",
header = None, header = None,
names = ['c', 'r']) names = ['i', 'c', 'r'])
print("R") print("R")
y = t['c'] - 1 y = t['c']
r = t['r'] r = t['r'].astype(str)
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")
def fr(r):
    """Normalize one raw review string: lowercase, strip punctuation,
    drop English stopwords, lemmatize (verb POS), return space-joined tokens.

    NOTE(review): the stopword set and lemmatizer are rebuilt on every call;
    hoist them to module level if this is applied per-row over a large corpus.
    """
    r = r.lower()
    # \w+ keeps only word characters, so punctuation is already gone here
    # (the original follow-up .replace() loop for ',', '.', ';', ... could
    # never match anything and has been removed as dead code).
    r = " ".join(re.findall(r'\w+', r))
    sw = set(stopwords.words("english"))
    l = WordNetLemmatizer()
    return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
r = r.progress_apply(fr)
#print(r)
print("A")
tk = kT(num_words = 6000) tk = kT(num_words = 6000)
tk.fit_on_texts(r) tk.fit_on_texts(r)
print("F") print("F")
#print(tk.word_index)
s = tk.texts_to_sequences(r) s = tk.texts_to_sequences(r)
#print(s)
print("T") print("T")
@ -67,6 +32,7 @@ ts = kps(s, maxlen = 100)
print("P") print("P")
'''
m = km.Sequential([ m = km.Sequential([
kl.Input(shape = (None, ), dtype = 'int32'), kl.Input(shape = (None, ), dtype = 'int32'),
kl.Embedding(6000, 96), kl.Embedding(6000, 96),
@ -82,17 +48,19 @@ m = km.Sequential([
m.compile(optimizer = ko.Lion(learning_rate = 0.0005), m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
loss = 'binary_crossentropy', loss = 'binary_crossentropy',
metrics = ['accuracy']) metrics = ['accuracy'])
'''
#m.summary() from model import m
ckpt = kc.ModelCheckpoint('model1.keras', ckpt = kc.ModelCheckpoint('model2.keras',
monitor = 'val_accuracy', monitor = 'val_accuracy',
save_best_only = True, save_best_only = True,
verbose = 1) verbose = 1)
m.load_weights("model1.keras")
history = m.fit(ts, history = m.fit(ts,
y, y,
epochs = 3, epochs = 15,
batch_size = 256, batch_size = 1024,
validation_split = 0.1, validation_split = 0.1,
callbacks = [ckpt]) callbacks = [ckpt])

21
model.py Normal file
View File

@ -0,0 +1,21 @@
from tensorflow.keras import layers as kl
from tensorflow.keras import models as km
from tensorflow.keras import losses as ks
from tensorflow.keras import optimizers as ko
from tensorflow.keras import callbacks as kc
# Binary text classifier: embedding -> 1-D conv -> stacked LSTMs -> sigmoid head.
m = km.Sequential()
m.add(kl.Input(shape = (None, ), dtype = 'int32'))  # variable-length token-id sequences
m.add(kl.Embedding(6000, 96))                       # vocabulary capped at 6000 ids
m.add(kl.Dropout(0.2))
m.add(kl.Conv1D(128, 5, activation = 'relu'))       # local n-gram features
m.add(kl.LSTM(128, return_sequences = True))
m.add(kl.LSTM(64))
m.add(kl.Dense(64))
m.add(kl.Dropout(0.5))
m.add(kl.Dense(1, activation = 'sigmoid'))          # single probability output

# Lion optimizer + binary cross-entropy for the 0/1 polarity labels.
m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
          loss = 'binary_crossentropy',
          metrics = ['accuracy'])

11
plot.py Normal file
View File

@ -0,0 +1,11 @@
from matplotlib import pyplot as plt
# Plot training vs. validation accuracy per epoch.
# Accuracy values below are copied from a 20-epoch training run's logs.
X = list(range(1, 21))  # epochs 1..20 (idiomatic copy of the range)
tY = [0.8380, 0.9170, 0.9267, 0.9333, 0.9390, 0.9373, 0.9457, 0.9541, 0.9609, 0.9661, 0.9703, 0.9732, 0.9757, 0.9775, 0.9786, 0.9799, 0.9810, 0.9820, 0.9822, 0.9829]
vY = [0.9242, 0.9346, 0.9392, 0.9414, 0.9436, 0.9452, 0.9453, 0.9457, 0.9441, 0.9432, 0.9423, 0.9429, 0.9415, 0.9420, 0.9410, 0.9417, 0.9405, 0.9404, 0.9404, 0.9408]
plt.plot(X, tY)
plt.plot(X, vY)
plt.xlabel("epoch")     # axis labels were missing
plt.ylabel("accuracy")
plt.legend(["train_acc", "val_acc"])
plt.show()

View File

@ -3,7 +3,7 @@ import nltk
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
#from spellchecker import SpellChecker as sc from spellchecker import SpellChecker as sc
nltk.download("stopwords") nltk.download("stopwords")
nltk.download("punkt_tab") nltk.download("punkt_tab")
@ -31,3 +31,26 @@ def fr(r):
# r[k] = w # r[k] = w
return " ".join([l.lemmatize(i, pos = 'v') for i in r]) return " ".join([l.lemmatize(i, pos = 'v') for i in r])
def frs(r):
    """Normalize a raw review string with spell correction.

    Lowercases, strips punctuation, drops English stopwords, replaces each
    remaining token with its best spelling correction (when one exists),
    and lemmatizes with verb POS. Returns the cleaned tokens joined by
    single spaces.

    NOTE(review): the stopword set, lemmatizer and SpellChecker are rebuilt
    on every call; SpellChecker() in particular loads a frequency dictionary,
    so hoist these to module level before applying this per-row to a corpus.
    """
    r = r.lower()
    # \w+ keeps only word characters, so punctuation is already removed here
    # (the original follow-up .replace() loop for ',', '.', ';', ... was
    # dead code and has been dropped; tuple() around findall was also unneeded).
    r = " ".join(re.findall(r'\w+', r))
    sw = set(stopwords.words("english"))
    l = WordNetLemmatizer()
    c = sc()
    tokens = [t.strip() for t in word_tokenize(r) if t.strip() not in sw]
    # Spellcheck: keep the original token when no correction is found (None).
    for k, t in enumerate(tokens):
        w = c.correction(t)
        if w:
            tokens[k] = w
    return " ".join([l.lemmatize(t, pos = 'v') for t in tokens])