3
This commit is contained in:
parent
9c3e92b4f8
commit
93229d32d1
54
main.py
54
main.py
@ -7,59 +7,24 @@ from tensorflow.keras import callbacks as kc
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer as kT
|
||||
from tensorflow.keras.utils import pad_sequences as kps
|
||||
|
||||
import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import nltk
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
|
||||
from tqdm import tqdm
|
||||
tqdm.pandas()
|
||||
|
||||
|
||||
print("I")
|
||||
t = pd.read_csv("yelp_review_polarity_csv/train.csv",
|
||||
t = pd.read_csv("prepped_train.csv",
|
||||
header = None,
|
||||
names = ['c', 'r'])
|
||||
names = ['i', 'c', 'r'])
|
||||
print("R")
|
||||
|
||||
y = t['c'] - 1
|
||||
r = t['r']
|
||||
|
||||
#nltk.download("stopwords")
|
||||
#nltk.download("punkt_tab")
|
||||
#nltk.download("wordnet")
|
||||
|
||||
_FR_RESOURCES = None


def _fr_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it on first use."""
    global _FR_RESOURCES
    if _FR_RESOURCES is None:
        _FR_RESOURCES = (set(stopwords.words("english")), WordNetLemmatizer())
    return _FR_RESOURCES


def fr(r):
    """Normalise one review text before it is handed to the tokenizer.

    The text is lower-cased, reduced to its ``\\w+`` runs joined by single
    spaces, tokenised with ``word_tokenize``, stripped of English stop words,
    and each remaining token is lemmatised as a verb.

    Args:
        r: The raw review text (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # The stop-word set and the lemmatizer do not depend on the input, so they
    # are created once and reused instead of being rebuilt for every row.
    stop_words, lemmatizer = _fr_resources()

    # Keeping only the \w+ runs already discards punctuation and line breaks,
    # so no separate character-stripping pass is needed afterwards.
    text = " ".join(re.findall(r'\w+', r.lower()))

    return " ".join(
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(text)
        if token not in stop_words
    )
|
||||
|
||||
r = r.progress_apply(fr)
|
||||
#print(r)
|
||||
|
||||
print("A")
|
||||
y = t['c']
|
||||
r = t['r'].astype(str)
|
||||
|
||||
tk = kT(num_words = 6000)
|
||||
|
||||
tk.fit_on_texts(r)
|
||||
|
||||
print("F")
|
||||
#print(tk.word_index)
|
||||
|
||||
s = tk.texts_to_sequences(r)
|
||||
#print(s)
|
||||
|
||||
print("T")
|
||||
|
||||
@ -67,6 +32,7 @@ ts = kps(s, maxlen = 100)
|
||||
|
||||
print("P")
|
||||
|
||||
'''
|
||||
m = km.Sequential([
|
||||
kl.Input(shape = (None, ), dtype = 'int32'),
|
||||
kl.Embedding(6000, 96),
|
||||
@ -82,17 +48,19 @@ m = km.Sequential([
|
||||
m.compile(optimizer = ko.Lion(learning_rate = 0.0005),
|
||||
loss = 'binary_crossentropy',
|
||||
metrics = ['accuracy'])
|
||||
'''
|
||||
|
||||
#m.summary()
|
||||
from model import m
|
||||
|
||||
ckpt = kc.ModelCheckpoint('model1.keras',
|
||||
ckpt = kc.ModelCheckpoint('model2.keras',
|
||||
monitor = 'val_accuracy',
|
||||
save_best_only = True,
|
||||
verbose = 1)
|
||||
|
||||
m.load_weights("model1.keras")
|
||||
history = m.fit(ts,
|
||||
y,
|
||||
epochs = 3,
|
||||
batch_size = 256,
|
||||
epochs = 15,
|
||||
batch_size = 1024,
|
||||
validation_split = 0.1,
|
||||
callbacks = [ckpt])
|
||||
|
||||
21
model.py
Normal file
21
model.py
Normal file
@ -0,0 +1,21 @@
|
||||
from tensorflow.keras import layers as kl
|
||||
from tensorflow.keras import models as km
|
||||
from tensorflow.keras import losses as ks
|
||||
from tensorflow.keras import optimizers as ko
|
||||
from tensorflow.keras import callbacks as kc
|
||||
|
||||
# Binary text classifier, assembled layer by layer:
# embedding -> dropout -> 1-D convolution -> two stacked LSTMs -> dense head.
m = km.Sequential()

# Variable-length sequences of int32 token ids.
m.add(kl.Input(shape = (None, ), dtype = 'int32'))
m.add(kl.Embedding(6000, 96))
m.add(kl.Dropout(0.2))

# Convolution over the sequence axis, then recurrent layers; the first LSTM
# returns the full sequence so that the second one can consume it.
m.add(kl.Conv1D(128, 5, activation = 'relu'))
m.add(kl.LSTM(128, return_sequences = True))
m.add(kl.LSTM(64))

# NOTE(review): this Dense layer has no activation (i.e. it is linear) --
# confirm that is intended before changing it, since saved weights depend on it.
m.add(kl.Dense(64))
m.add(kl.Dropout(0.5))

# Single sigmoid unit to match the binary cross-entropy loss below.
m.add(kl.Dense(1, activation = 'sigmoid'))

m.compile(
    optimizer = ko.Lion(learning_rate = 0.0005),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)
|
||||
11
plot.py
Normal file
11
plot.py
Normal file
@ -0,0 +1,11 @@
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# Recorded accuracy values, one entry per point on the x axis
# (presumably one per training epoch -- confirm against the training log).
tY = [0.8380, 0.9170, 0.9267, 0.9333, 0.9390, 0.9373, 0.9457, 0.9541, 0.9609, 0.9661, 0.9703, 0.9732, 0.9757, 0.9775, 0.9786, 0.9799, 0.9810, 0.9820, 0.9822, 0.9829]
vY = [0.9242, 0.9346, 0.9392, 0.9414, 0.9436, 0.9452, 0.9453, 0.9457, 0.9441, 0.9432, 0.9423, 0.9429, 0.9415, 0.9420, 0.9410, 0.9417, 0.9405, 0.9404, 0.9404, 0.9408]

# 1-based x positions derived from the data, so the axis cannot drift out of
# step with the lists when more values are appended.
X = list(range(1, len(tY) + 1))

# Labels are attached to each curve directly rather than matched by position
# in a separate legend list.
plt.plot(X, tY, label = "train_acc")
plt.plot(X, vY, label = "val_acc")
plt.legend()

plt.show()
|
||||
@ -3,7 +3,7 @@ import nltk
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
#from spellchecker import SpellChecker as sc
|
||||
from spellchecker import SpellChecker as sc
|
||||
|
||||
nltk.download("stopwords")
|
||||
nltk.download("punkt_tab")
|
||||
@ -31,3 +31,26 @@ def fr(r):
|
||||
# r[k] = w
|
||||
|
||||
return " ".join([l.lemmatize(i, pos = 'v') for i in r])
|
||||
|
||||
_FRS_RESOURCES = None


def _frs_resources():
    """Return the shared (stop-word set, lemmatizer, spell checker), built on first use."""
    global _FRS_RESOURCES
    if _FRS_RESOURCES is None:
        _FRS_RESOURCES = (
            set(stopwords.words("english")),
            WordNetLemmatizer(),
            sc(),
        )
    return _FRS_RESOURCES


def frs(r):
    """Normalise one review text, spell-correcting each token along the way.

    The text is lower-cased, reduced to its ``\\w+`` runs joined by single
    spaces, tokenised with ``word_tokenize`` and stripped of English stop
    words. Each remaining token is then passed through the spell checker and
    finally lemmatised as a verb.

    Args:
        r: The raw review text (a ``str``).

    Returns:
        The corrected lemmas joined by single spaces.
    """
    # None of these helpers depend on the input, so they are created once and
    # reused instead of being rebuilt on every call.
    stop_words, lemmatizer, checker = _frs_resources()

    # Keeping only the \w+ runs already discards punctuation and line breaks,
    # so no separate character-stripping pass is needed afterwards.
    text = " ".join(re.findall(r'\w+', r.lower()))

    tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # spellcheck: fall back to the original token whenever the checker
    # returns a falsy result.
    corrected = [checker.correction(token) or token for token in tokens]

    return " ".join(lemmatizer.lemmatize(token, pos='v') for token in corrected)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user