commit 93229d32d1
parent 9c3e92b4f8

main.py | 54 ++++++++++--------------------------------------
@@ -7,59 +7,24 @@ from tensorflow.keras import callbacks as kc
 from tensorflow.keras.preprocessing.text import Tokenizer as kT
 from tensorflow.keras.utils import pad_sequences as kps
 
-import re
 import numpy as np
 import pandas as pd
-
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-
-from tqdm import tqdm
-tqdm.pandas()
 
 
 print("I")
-t = pd.read_csv("yelp_review_polarity_csv/train.csv",
+t = pd.read_csv("prepped_train.csv",
                 header = None,
-                names = ['c', 'r'])
+                names = ['i', 'c', 'r'])
 print("R")
 
-y = t['c'] - 1
-r = t['r']
+y = t['c']
+r = t['r'].astype(str)
 
-#nltk.download("stopwords")
-#nltk.download("punkt_tab")
-#nltk.download("wordnet")
-
-def fr(r):
-    r = r.lower()
-
-    r = " ".join(tuple(re.findall(r'\w+', r)))
-
-    for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
-        r = r.replace(i, "")
-
-    sw = set(stopwords.words("english"))
-
-    l = WordNetLemmatizer()
-    return " ".join([l.lemmatize(i.strip(), pos = 'v') for i in word_tokenize(r) if i.strip() not in sw])
-
-r = r.progress_apply(fr)
-#print(r)
-
-print("A")
-
 tk = kT(num_words = 6000)
 
 tk.fit_on_texts(r)
 
 print("F")
-#print(tk.word_index)
 
 s = tk.texts_to_sequences(r)
-#print(s)
 
 print("T")
 
@@ -67,6 +32,7 @@ ts = kps(s, maxlen = 100)
 
 print("P")
 
+'''
 m = km.Sequential([
     kl.Input(shape = (None, ), dtype = 'int32'),
     kl.Embedding(6000, 96),
@@ -82,17 +48,19 @@ m = km.Sequential([
 m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
           loss = 'binary_crossentropy',
           metrics = ['accuracy'])
+'''
 
-#m.summary()
+from model import m
 
-ckpt = kc.ModelCheckpoint('model1.keras',
+ckpt = kc.ModelCheckpoint('model2.keras',
                           monitor = 'val_accuracy',
                           save_best_only = True,
                           verbose = 1)
 
+m.load_weights("model1.keras")
 history = m.fit(ts,
                 y,
-                epochs = 3,
-                batch_size = 256,
+                epochs = 15,
+                batch_size = 1024,
                 validation_split = 0.1,
                 callbacks = [ckpt])
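Net effect in main.py: the inline cleaning pipeline is gone in favour of a pre-cleaned prepped_train.csv (with an extra index column 'i' and labels already shifted to {0, 1}), the inline model definition is commented out with a triple-quoted string and imported from the new model.py instead, and training resumes from the model1.keras weights for 15 epochs at batch size 1024, checkpointing the best validation accuracy to model2.keras. A minimal sketch of how prepped_train.csv could be produced, assuming the repo's cleaner fr() is importable as a module named prep (the prep file's actual name is not captured in this view):

# Hypothetical prep step, not part of this commit: writes the CSV that
# main.py now reads with header = None, names = ['i', 'c', 'r'].
import pandas as pd
from tqdm import tqdm
from prep import fr  # assumed module name for the preprocessing script

tqdm.pandas()

t = pd.read_csv("yelp_review_polarity_csv/train.csv",
                header = None, names = ['c', 'r'])
t['c'] = t['c'] - 1                             # shift labels {1, 2} -> {0, 1}, as old main.py did
t['r'] = t['r'].astype(str).progress_apply(fr)  # same cleaner the old inline code applied
t.to_csv("prepped_train.csv", header = False)   # default index becomes the 'i' column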
model.py (new file) | 21 +++++++++++++++++++++
@@ -0,0 +1,21 @@
+from tensorflow.keras import layers as kl
+from tensorflow.keras import models as km
+from tensorflow.keras import losses as ks
+from tensorflow.keras import optimizers as ko
+from tensorflow.keras import callbacks as kc
+
+m = km.Sequential([
+    kl.Input(shape = (None, ), dtype = 'int32'),
+    kl.Embedding(6000, 96),
+    kl.Dropout(0.2),
+    kl.Conv1D(128, 5, activation = 'relu'),
+    kl.LSTM(128, return_sequences = True),
+    kl.LSTM(64),
+    kl.Dense(64),
+    kl.Dropout(0.5),
+    kl.Dense(1, activation = 'sigmoid')
+])
+
+m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
+          loss = 'binary_crossentropy',
+          metrics = ['accuracy'])
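A quick smoke test for the extracted model module; the random batch below is illustrative only (any int sequence with indices below 6000 and length 100 fits, matching Embedding(6000, 96) and the maxlen = 100 padding in main.py):

import numpy as np
from model import m

m.summary()
x = np.random.randint(0, 6000, size = (2, 100))  # two fake padded sequences
print(m.predict(x).shape)                        # -> (2, 1) sigmoid probabilities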
plot.py (new file) | 11 +++++++++++
@@ -0,0 +1,11 @@
+from matplotlib import pyplot as plt
+
+X = [i for i in range(1, 21)]
+tY = [0.8380, 0.9170, 0.9267, 0.9333, 0.9390, 0.9373, 0.9457, 0.9541, 0.9609, 0.9661, 0.9703, 0.9732, 0.9757, 0.9775, 0.9786, 0.9799, 0.9810, 0.9820, 0.9822, 0.9829]
+vY = [0.9242, 0.9346, 0.9392, 0.9414, 0.9436, 0.9452, 0.9453, 0.9457, 0.9441, 0.9432, 0.9423, 0.9429, 0.9415, 0.9420, 0.9410, 0.9417, 0.9405, 0.9404, 0.9404, 0.9408]
+
+plt.plot(X, tY)
+plt.plot(X, vY)
+plt.legend(["train_acc", "val_acc"])
+
+plt.show()
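The hard-coded curves show validation accuracy peaking at 0.9457 around epoch 8 while training accuracy keeps climbing toward 0.98, the usual overfitting signature. A one-line addition (a sketch, not in the commit) would report the best epoch:

best = max(range(len(vY)), key = lambda k: vY[k])
print("best epoch:", X[best], "val_acc:", vY[best])  # -> best epoch: 8 val_acc: 0.9457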
(preprocessing script; filename not captured in this view)
@@ -3,7 +3,7 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
-#from spellchecker import SpellChecker as sc
+from spellchecker import SpellChecker as sc
 
 nltk.download("stopwords")
 nltk.download("punkt_tab")
@@ -31,3 +31,26 @@ def fr(r):
 #        r[k] = w
 
     return " ".join([l.lemmatize(i, pos = 'v') for i in r])
+
+def frs(r):
+    r = r.lower()
+
+    r = " ".join(tuple(re.findall(r'\w+', r)))
+
+    for i in ['\n', '\r', ',', '.', '-', ';', ':', '\'', '"']:
+        r = r.replace(i, "")
+
+    sw = set(stopwords.words("english"))
+
+    l = WordNetLemmatizer()
+    c = sc()
+
+    r = [i.strip() for i in word_tokenize(r) if i.strip() not in sw]
+
+    # spellcheck
+    for k, i in enumerate(r):
+        w = c.correction(i)
+        if w:
+            r[k] = w
+
+    return " ".join([l.lemmatize(i, pos = 'v') for i in r])
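The newly enabled spellcheck in frs() leans on pyspellchecker's correction() possibly returning no candidate, which the `if w:` guard absorbs. A minimal check of that behaviour, assuming the pyspellchecker package is installed:

from spellchecker import SpellChecker as sc

c = sc()
print(c.correction("speling"))  # -> 'spelling'
print(c.correction("qzjxv"))    # may be None in recent versions, hence the `if w:` guard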