# Keras building blocks for the text-classification model
from tensorflow.keras import layers as kl
from tensorflow.keras import models as km
from tensorflow.keras import losses as ks
from tensorflow.keras import optimizers as ko
from tensorflow.keras import callbacks as kc

# Tokenization and padding utilities
from tensorflow.keras.preprocessing.text import Tokenizer as kT
from tensorflow.keras.utils import pad_sequences as kps

import re
import numpy as np
import pandas as pd

# NLTK pieces used for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm
tqdm.pandas()  # enables Series.progress_apply with a progress bar

print("I")
|
|
t = pd.read_csv("yelp_review_polarity_csv/train.csv",
|
|
header = None,
|
|
names = ['c', 'r'])
|
|
print("R")
|
|
|
|
y = t['c'] - 1
|
|
r = t['r']
|
|
|
|
#nltk.download("stopwords")
|
|
#nltk.download("punkt_tab")
|
|
#nltk.download("wordnet")
|
|
|
|
# Preprocess one review: lowercase, keep word characters only,
# drop English stopwords, and lemmatize the remaining tokens as verbs.
sw = set(stopwords.words("english"))
l = WordNetLemmatizer()

def fr(r):
    r = r.lower()

    # re.findall(r'\w+') keeps word characters only, which already strips
    # punctuation (, . - ; : ' ") and newlines
    r = " ".join(re.findall(r'\w+', r))

    return " ".join([l.lemmatize(i.strip(), pos='v') for i in word_tokenize(r) if i.strip() not in sw])

r = r.progress_apply(fr)
#print(r)
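# Illustrative only: on a sample review the cleaner behaves roughly like
#   fr("The food was absolutely amazing!")  ->  "food absolutely amaze"
# (stopwords "the"/"was" removed, punctuation stripped, "amazing" lemmatized as a verb)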
print("A")
|
|
|
|
tk = kT(num_words = 6000)
|
|
|
|
tk.fit_on_texts(r)
|
|
|
|
print("F")
|
|
#print(tk.word_index)
|
|
|
|
s = tk.texts_to_sequences(r)
|
|
#print(s)
|
|
|
|
print("T")
|
|
|
|
ts = kps(s, maxlen = 100)
|
|
|
|
print("P")
|
|
|
|
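# ts is now a 2-D integer array of shape (num_reviews, 100), ready to feed the model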
# Conv1D + stacked LSTM classifier over the embedded token sequences
m = km.Sequential([
    kl.Input(shape=(None,), dtype='int32'),
    kl.Embedding(6000, 96),            # vocabulary size matches the tokenizer's num_words
    kl.Dropout(0.2),
    kl.Conv1D(128, 5, activation='relu'),
    kl.LSTM(128, return_sequences=True),
    kl.LSTM(64),
    kl.Dense(64),
    kl.Dropout(0.5),
    kl.Dense(1, activation='sigmoid')  # probability of the positive class
])

# Lion optimizer requires a recent TF/Keras release
m.compile(optimizer=ko.Lion(learning_rate=0.0005),
          loss='binary_crossentropy',
          metrics=['accuracy'])

#m.summary()

# Save the best model (by validation accuracy) seen during training
ckpt = kc.ModelCheckpoint('model1.keras',
                          monitor='val_accuracy',
                          save_best_only=True,
                          verbose=1)

history = m.fit(ts,
                y,
                epochs=3,
                batch_size=256,
                validation_split=0.1,
                callbacks=[ckpt])
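# --- Optional: evaluate on the held-out split (sketch only) ---
# Assumes the Yelp polarity archive also provides "test.csv" next to train.csv;
# the path and the reuse of fr / tk / kps below are illustrative, not part of the
# original script.
#
# te = pd.read_csv("yelp_review_polarity_csv/test.csv", header=None, names=['c', 'r'])
# y_te = te['c'] - 1
# r_te = te['r'].progress_apply(fr)
# ts_te = kps(tk.texts_to_sequences(r_te), maxlen=100)
# loss_te, acc_te = m.evaluate(ts_te, y_te, batch_size=256)
# print(f"Test accuracy: {acc_te:.4f}")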