diff --git a/main.py b/main.py
index 646834d..a47aabb 100644
--- a/main.py
+++ b/main.py
@@ -7,59 +7,24 @@
 from tensorflow.keras import callbacks as kc
 from tensorflow.keras.preprocessing.text import Tokenizer as kT
 from tensorflow.keras.utils import pad_sequences as kps
-import re
 import numpy as np
 import pandas as pd
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-
-from tqdm import tqdm
-tqdm.pandas()
-
-
 
 print("I")
 
-t = pd.read_csv("yelp_review_polarity_csv/train.csv",
+t = pd.read_csv("prepped_train.csv",
                 header = None,
-                names = ['c', 'r'])
+                names = ['i', 'c', 'r'])
 
 print("R")
-y = t['c'] - 1
-r = t['r']
-
-#nltk.download("stopwords")
-#nltk.download("punkt_tab")
-#nltk.download("wordnet")
-
-def fr(r):
-    r = r.lower()
-
-    r = " ".join(tuple(re.findall(r'\w+', r)))
-
-    for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
-        r = r.replace(i, "")
-
-    sw = set(stopwords.words("english"))
-
-    l = WordNetLemmatizer()
-    return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
-
-r = r.progress_apply(fr)
-#print(r)
-
-print("A")
+y = t['c']
+r = t['r'].astype(str)
 
 tk = kT(num_words = 6000)
-
 tk.fit_on_texts(r)
 
 print("F")
-#print(tk.word_index)
 
 s = tk.texts_to_sequences(r)
-#print(s)
 
 print("T")
@@ -67,6 +32,7 @@
 ts = kps(s, maxlen = 100)
 print("P")
 
+'''
 m = km.Sequential([
     kl.Input(shape = (None, ), dtype = 'int32'),
     kl.Embedding(6000, 96),
@@ -82,17 +48,19 @@ m = km.Sequential([
 m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
           loss = 'binary_crossentropy',
           metrics = ['accuracy'])
+'''
 
-#m.summary()
+from model import m
 
-ckpt = kc.ModelCheckpoint('model1.keras',
+ckpt = kc.ModelCheckpoint('model2.keras',
                           monitor = 'val_accuracy',
                           save_best_only = True,
                           verbose = 1)
 
+m.load_weights("model1.keras")
 history = m.fit(ts, y,
-                epochs = 3,
-                batch_size = 256,
+                epochs = 15,
+                batch_size = 1024,
                 validation_split = 0.1,
                 callbacks = [ckpt])
 
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..1a96139
--- /dev/null
+++ b/model.py
@@ -0,0 +1,21 @@
+from tensorflow.keras import layers as kl
+from tensorflow.keras import models as km
+from tensorflow.keras import losses as ks
+from tensorflow.keras import optimizers as ko
+from tensorflow.keras import callbacks as kc
+
+m = km.Sequential([
+    kl.Input(shape = (None, ), dtype = 'int32'),
+    kl.Embedding(6000, 96),
+    kl.Dropout(0.2),
+    kl.Conv1D(128, 5, activation = 'relu'),
+    kl.LSTM(128, return_sequences = True),
+    kl.LSTM(64),
+    kl.Dense(64),
+    kl.Dropout(0.5),
+    kl.Dense(1, activation = 'sigmoid')
+])
+
+m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
+          loss = 'binary_crossentropy',
+          metrics = ['accuracy'])
diff --git a/plot.py b/plot.py
new file mode 100644
index 0000000..69f56da
--- /dev/null
+++ b/plot.py
@@ -0,0 +1,11 @@
+from matplotlib import pyplot as plt
+
+X = [i for i in range(1, 21)]
+tY = [0.8380, 0.9170, 0.9267, 0.9333, 0.9390, 0.9373, 0.9457, 0.9541, 0.9609, 0.9661, 0.9703, 0.9732, 0.9757, 0.9775, 0.9786, 0.9799, 0.9810, 0.9820, 0.9822, 0.9829]
+vY = [0.9242, 0.9346, 0.9392, 0.9414, 0.9436, 0.9452, 0.9453, 0.9457, 0.9441, 0.9432, 0.9423, 0.9429, 0.9415, 0.9420, 0.9410, 0.9417, 0.9405, 0.9404, 0.9404, 0.9408]
+
+plt.plot(X, tY)
+plt.plot(X, vY)
+plt.legend(["train_acc", "val_acc"])
+
+plt.show()
diff --git a/preprocessor.py b/preprocessor.py
index 02ced78..e90251b 100644
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -3,7 +3,7 @@
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
-#from spellchecker import SpellChecker as sc
+from spellchecker import SpellChecker as sc
 nltk.download("stopwords")
 nltk.download("punkt_tab")
@@ -31,3 +31,26 @@ def fr(r):
 #            r[k] = w
 
     return " ".join([l.lemmatize(i, pos = 'v') for i in r])
+
+def frs(r):
+    r = r.lower()
+
+    r = " ".join(tuple(re.findall(r'\w+', r)))
+
+    for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
+        r = r.replace(i, "")
+
+    sw = set(stopwords.words("english"))
+
+    l = WordNetLemmatizer()
+    c = sc()
+
+    r = [i.strip() for i in word_tokenize(r) if i.strip() not in sw]
+
+    # spellcheck
+    for k, i in enumerate(r):
+        w = c.correction(i)
+        if w:
+            r[k] = w
+
+    return " ".join([l.lemmatize(i, pos = 'v') for i in r])
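
Note: the script that writes prepped_train.csv is not part of this diff. A minimal sketch of what it presumably does, pieced together from the lines removed from main.py (the old yelp_review_polarity_csv/train.csv path, the dropped "- 1" label shift, and the new frs helper in preprocessor.py); the prep.py filename and the exact to_csv call are assumptions:

    # prep.py (hypothetical): writes the prepped_train.csv that main.py now reads
    import pandas as pd
    from tqdm import tqdm

    from preprocessor import frs

    tqdm.pandas()

    t = pd.read_csv("yelp_review_polarity_csv/train.csv",
                    header = None,
                    names = ['c', 'r'])

    t['c'] = t['c'] - 1                              # shift yelp's 1/2 labels to 0/1,
                                                     # as old main.py did inline
    t['r'] = t['r'].astype(str).progress_apply(frs)  # clean + spellcheck + lemmatize

    # header = False leaves three unnamed columns; the default integer index
    # becomes the 'i' column that main.py reads back.
    t.to_csv("prepped_train.csv", header = False)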
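After training, model2.keras holds the weights from the epoch with the best val_accuracy. A minimal inference sketch, assuming tk is the same Tokenizer fitted in main.py (it would need to be saved, e.g. pickled, alongside the model, because texts_to_sequences must reuse the training-time word index):

    # infer.py (hypothetical): score new reviews with the trained model
    from tensorflow.keras import models as km
    from tensorflow.keras.utils import pad_sequences as kps

    m = km.load_model("model2.keras")

    def predict(texts, tk):
        s = tk.texts_to_sequences(texts)  # ints via the fitted word index
        ts = kps(s, maxlen = 100)         # same maxlen = 100 as training
        return m.predict(ts)[:, 0]        # sigmoid output, i.e. P(class 1)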