diff --git a/main.py b/main.py
new file mode 100644
index 0000000..646834d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,98 @@
+from tensorflow.keras import layers as kl
+from tensorflow.keras import models as km
+from tensorflow.keras import losses as ks
+from tensorflow.keras import optimizers as ko
+from tensorflow.keras import callbacks as kc
+
+from tensorflow.keras.preprocessing.text import Tokenizer as kT
+from tensorflow.keras.utils import pad_sequences as kps
+
+import re
+import numpy as np
+import pandas as pd
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+from tqdm import tqdm
+tqdm.pandas()
+
+
+print("I")
+t = pd.read_csv("yelp_review_polarity_csv/train.csv",
+                header = None,
+                names = ['c', 'r'])
+print("R")
+
+y = t['c'] - 1
+r = t['r']
+
+#nltk.download("stopwords")
+#nltk.download("punkt_tab")
+#nltk.download("wordnet")
+
+def fr(r):
+    r = r.lower()
+
+    r = " ".join(tuple(re.findall(r'\w+', r)))
+
+    for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
+        r = r.replace(i, "")
+
+    sw = set(stopwords.words("english"))
+
+    l = WordNetLemmatizer()
+    return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
+
+r = r.progress_apply(fr)
+#print(r)
+
+print("A")
+
+tk = kT(num_words = 6000)
+
+tk.fit_on_texts(r)
+
+print("F")
+#print(tk.word_index)
+
+s = tk.texts_to_sequences(r)
+#print(s)
+
+print("T")
+
+ts = kps(s, maxlen = 100)
+
+print("P")
+
+m = km.Sequential([
+    kl.Input(shape = (None, ), dtype = 'int32'),
+    kl.Embedding(6000, 96),
+    kl.Dropout(0.2),
+    kl.Conv1D(128, 5, activation = 'relu'),
+    kl.LSTM(128, return_sequences = True),
+    kl.LSTM(64),
+    kl.Dense(64),
+    kl.Dropout(0.5),
+    kl.Dense(1, activation = 'sigmoid')
+])
+
+m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
+          loss = 'binary_crossentropy',
+          metrics = ['accuracy'])
+
+#m.summary()
+
+ckpt = kc.ModelCheckpoint('model1.keras',
+                          monitor = 'val_accuracy',
+                          save_best_only = True,
+                          verbose = 1)
+
+history = m.fit(ts,
+                y,
+                epochs = 3,
+                batch_size = 256,
+                validation_split = 0.1,
+                callbacks = [ckpt])
diff --git a/prep.py b/prep.py
index f9511ba..d72e5fc 100644
--- a/prep.py
+++ b/prep.py
@@ -19,7 +19,7 @@ t = pd.read_csv("yelp_review_polarity_csv/train.csv",
                 header = None,
                 names = ['c', 'r'])
 print("R")
-y = t['c'] - 1
+y = (t['c'] - 1)
 r = t['r']
 
 r = r.progress_apply(fr)
diff --git a/preprocessor.py b/preprocessor.py
index ef50b8b..02ced78 100644
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -3,7 +3,7 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
-from spellchecker import SpellChecker as sc
+#from spellchecker import SpellChecker as sc
 
 nltk.download("stopwords")
 nltk.download("punkt_tab")
@@ -20,4 +20,14 @@ def fr(r):
     sw = set(stopwords.words("english"))
 
     l = WordNetLemmatizer()
-    return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
+    #c = sc()
+
+    r = [i.strip() for i in word_tokenize(r) if i.strip() not in sw]
+
+    # spellcheck
+    #for k, i in enumerate(r):
+    #    w = c.correction(i)
+    #    if w:
+    #        r[k] = w
+
+    return " ".join([l.lemmatize(i, pos = 'v') for i in r])
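
Review note on fr() in main.py: the punctuation-stripping loop is dead code. By the time it runs, re.findall(r'\w+') has already reduced the review to word characters joined by single spaces, so none of the characters in the list can still be present, and the loop could be dropped in a follow-up. A quick standalone check (the sample string is made up):

    import re

    s = "Great food, honestly; we'll return!"
    cleaned = " ".join(re.findall(r'\w+', s.lower()))
    print(cleaned)   # great food honestly we ll return
    # No punctuation survives, so the replace loop has nothing to do:
    print(any(ch in cleaned for ch in ",.-;:'\"\n\r"))   # False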
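
Follow-up worth considering (not part of this patch): main.py checkpoints the best weights to model1.keras but never persists the fitted Tokenizer, so the saved model cannot score new text on its own. Below is a minimal sketch of one way to close that gap, written to run in the same session as main.py so that tk and fr() are in scope; the tokenizer.json filename and the sample review are assumptions, not anything this patch defines:

    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    from tensorflow.keras.utils import pad_sequences

    # After training: persist the fitted tokenizer next to the checkpoint.
    with open("tokenizer.json", "w") as f:
        f.write(tk.to_json())

    # At inference time: reload both artifacts and mirror the training
    # pipeline (same fr() cleanup, same 100-token padding).
    with open("tokenizer.json") as f:
        tk2 = tokenizer_from_json(f.read())
    m2 = load_model("model1.keras")

    review = "The food was great but the service was painfully slow."
    seq = tk2.texts_to_sequences([fr(review)])
    prob = float(m2.predict(pad_sequences(seq, maxlen = 100))[0][0])
    print("positive" if prob >= 0.5 else "negative", prob)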