This commit is contained in:
ІО-23 Шмуляр Олег 2025-12-06 15:56:00 +02:00
parent 9c3e92b4f8
commit 93229d32d1
4 changed files with 67 additions and 44 deletions

54
main.py
View File

@ -7,59 +7,24 @@ from tensorflow.keras import callbacks as kc
from tensorflow.keras.preprocessing.text import Tokenizer as kT
from tensorflow.keras.utils import pad_sequences as kps
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()
print("I")
t = pd.read_csv("yelp_review_polarity_csv/train.csv",
t = pd.read_csv("prepped_train.csv",
header = None,
names = ['c', 'r'])
names = ['i', 'c', 'r'])
print("R")
y = t['c'] - 1
r = t['r']
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")
def fr(r):
r = r.lower()
r = " ".join(tuple(re.findall(r'\w+', r)))
for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
r = r.replace(i, "")
sw = set(stopwords.words("english"))
l = WordNetLemmatizer()
return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
r = r.progress_apply(fr)
#print(r)
print("A")
y = t['c']
r = t['r'].astype(str)
tk = kT(num_words = 6000)
tk.fit_on_texts(r)
print("F")
#print(tk.word_index)
s = tk.texts_to_sequences(r)
#print(s)
print("T")
@ -67,6 +32,7 @@ ts = kps(s, maxlen = 100)
print("P")
'''
m = km.Sequential([
kl.Input(shape = (None, ), dtype = 'int32'),
kl.Embedding(6000, 96),
@ -82,17 +48,19 @@ m = km.Sequential([
m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
loss = 'binary_crossentropy',
metrics = ['accuracy'])
'''
#m.summary()
from model import m
ckpt = kc.ModelCheckpoint('model1.keras',
ckpt = kc.ModelCheckpoint('model2.keras',
monitor = 'val_accuracy',
save_best_only = True,
verbose = 1)
m.load_weights("model1.keras")
history = m.fit(ts,
y,
epochs = 3,
batch_size = 256,
epochs = 15,
batch_size = 1024,
validation_split = 0.1,
callbacks = [ckpt])

21
model.py Normal file
View File

@ -0,0 +1,21 @@
from tensorflow.keras import layers as kl
from tensorflow.keras import models as km
from tensorflow.keras import losses as ks
from tensorflow.keras import optimizers as ko
from tensorflow.keras import callbacks as kc
# Binary text classifier over integer token-id sequences, exported as `m`
# (main.py does `from model import m`).
m = km.Sequential(
    [
        # Variable-length sequences of int32 token ids.
        kl.Input(shape=(None,), dtype='int32'),
        # 6000-entry vocabulary (main.py's tokenizer uses num_words = 6000),
        # each id mapped to a 96-dimensional vector.
        kl.Embedding(6000, 96),
        kl.Dropout(0.2),
        # 128 filters over windows of 5 consecutive positions.
        kl.Conv1D(128, 5, activation='relu'),
        # First LSTM keeps the full sequence so the second LSTM can consume it.
        kl.LSTM(128, return_sequences=True),
        kl.LSTM(64),
        # NOTE(review): no activation is passed here, so this layer is linear
        # before the dropout and the sigmoid head — confirm this is intended.
        kl.Dense(64),
        kl.Dropout(0.5),
        # Single sigmoid unit: output in (0, 1), paired with binary cross-entropy.
        kl.Dense(1, activation='sigmoid'),
    ]
)
m.compile(
    loss='binary_crossentropy',
    optimizer=ko.Lion(learning_rate=0.0005),
    metrics=['accuracy'],
)

11
plot.py Normal file
View File

@ -0,0 +1,11 @@
from matplotlib import pyplot as plt
# Hard-coded accuracy curves, one value per x position 1..20
# (presumably one per training epoch, copied from training logs — confirm).
tY = [0.8380, 0.9170, 0.9267, 0.9333, 0.9390, 0.9373, 0.9457, 0.9541, 0.9609, 0.9661, 0.9703, 0.9732, 0.9757, 0.9775, 0.9786, 0.9799, 0.9810, 0.9820, 0.9822, 0.9829]
vY = [0.9242, 0.9346, 0.9392, 0.9414, 0.9436, 0.9452, 0.9453, 0.9457, 0.9441, 0.9432, 0.9423, 0.9429, 0.9415, 0.9420, 0.9410, 0.9417, 0.9405, 0.9404, 0.9404, 0.9408]
X = list(range(1, 21))

# Draw both curves on the same axes; the order here must match the legend order.
for series in (tY, vY):
    plt.plot(X, series)
plt.legend(["train_acc", "val_acc"])
plt.show()

View File

@ -3,7 +3,7 @@ import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#from spellchecker import SpellChecker as sc
from spellchecker import SpellChecker as sc
nltk.download("stopwords")
nltk.download("punkt_tab")
@ -31,3 +31,26 @@ def fr(r):
# r[k] = w
return " ".join([l.lemmatize(i, pos = 'v') for i in r])
# Lazily-built (stop words, lemmatizer, spell checker) tuple shared by every
# frs() call; None until the first call.
_FRS_RESOURCES = None


def _frs_resources():
    """Return the stop-word set, lemmatizer and spell checker used by ``frs``.

    The three objects are created on first use and then reused, so applying
    ``frs`` to many rows does not rebuild them (in particular the spell
    checker) once per row.
    """
    global _FRS_RESOURCES
    if _FRS_RESOURCES is None:
        _FRS_RESOURCES = (
            frozenset(stopwords.words("english")),
            WordNetLemmatizer(),
            sc(),
        )
    return _FRS_RESOURCES


def frs(r):
    """Normalize one review text, including spell correction.

    Steps: lowercase, keep only ``\\w+`` runs, tokenize, drop English stop
    words, replace each token with the spell checker's correction, then
    lemmatize every token as a verb.

    Args:
        r: The raw text; must support ``.lower()`` (i.e. be a ``str``).

    Returns:
        The processed tokens joined by single spaces.
    """
    sw, l, c = _frs_resources()

    # Joining the \w+ matches with spaces already removes newlines and
    # punctuation, so no separate character-stripping pass is needed.
    r = " ".join(re.findall(r'\w+', r.lower()))

    tokens = [t for t in (i.strip() for i in word_tokenize(r)) if t not in sw]

    # Keep the original token whenever the spell checker yields a falsy
    # result (no usable correction).
    corrected = [c.correction(t) or t for t in tokens]

    return " ".join(l.lemmatize(t, pos='v') for t in corrected)