from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

tqdm.pandas()

# Run these once to fetch the NLTK resources used below:
# nltk.download("stopwords")
# nltk.download("punkt_tab")
# nltk.download("wordnet")

VOCAB_SIZE = 6000   # keep only the 6000 most frequent tokens
MAX_LEN = 100       # pad/truncate every review to 100 tokens
EMBED_DIM = 96

print("Loading data...")
# Yelp Review Polarity CSV has no header; column 1 is the class (1 or 2), column 2 the review text.
train_df = pd.read_csv("yelp_review_polarity_csv/train.csv", header=None, names=["label", "review"])
labels = train_df["label"] - 1  # map {1, 2} -> {0, 1} for binary cross-entropy
reviews = train_df["review"]

# Build these once, not on every call: the stopword set and lemmatizer are costly to construct.
STOPWORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()

def clean_review(text):
    """Lowercase, strip punctuation, drop stopwords, and lemmatize (verb POS)."""
    text = text.lower()
    # \w+ already discards punctuation, so no separate character-stripping pass is needed
    text = " ".join(re.findall(r"\w+", text))
    return " ".join(
        LEMMATIZER.lemmatize(token, pos="v")
        for token in word_tokenize(text)
        if token not in STOPWORDS
    )

print("Cleaning reviews...")
reviews = reviews.progress_apply(clean_review)

print("Fitting tokenizer...")
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(reviews)

print("Converting text to sequences...")
sequences = tokenizer.texts_to_sequences(reviews)

print("Padding sequences...")
padded = pad_sequences(sequences, maxlen=MAX_LEN)

# Conv1D extracts local n-gram features; two stacked LSTMs then model longer-range word order.
model = models.Sequential([
    layers.Input(shape=(None,), dtype="int32"),  # variable-length sequences of token ids
    layers.Embedding(VOCAB_SIZE, EMBED_DIM),
    layers.Dropout(0.2),
    layers.Conv1D(128, 5, activation="relu"),
    layers.LSTM(128, return_sequences=True),  # emit the full sequence for the next LSTM
    layers.LSTM(64),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),  # binary polarity score
])

model.compile(
    optimizer=optimizers.Lion(learning_rate=5e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# model.summary()

# Keep only the weights with the best validation accuracy seen so far.
checkpoint = callbacks.ModelCheckpoint(
    "model1.keras", monitor="val_accuracy", save_best_only=True, verbose=1
)
history = model.fit(
    padded,
    labels,
    epochs=3,
    batch_size=256,
    validation_split=0.1,
    callbacks=[checkpoint],
)
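
# --- Inference sketch (not part of the original script) ---
# A minimal example of reusing the checkpoint saved above on unseen text. It assumes
# model1.keras exists after training and that the same `tokenizer`, `clean_review`,
# and MAX_LEN from this script are still in scope; in a separate process you would
# also need to persist and reload the fitted tokenizer (e.g. with pickle).
# `predict_polarity` is a hypothetical helper name, not something defined above.
def predict_polarity(texts):
    """Return sigmoid scores in [0, 1]; > 0.5 suggests positive polarity."""
    best = models.load_model("model1.keras")
    cleaned = [clean_review(t) for t in texts]
    seqs = tokenizer.texts_to_sequences(cleaned)
    batch = pad_sequences(seqs, maxlen=MAX_LEN)
    return best.predict(batch).ravel()

# Example usage:
# print(predict_polarity(["The food was amazing and the staff were friendly!",
#                         "Terrible service, I will never come back."]))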