1
This commit is contained in:
commit
bddef39f9c
29
prep.py
Normal file
29
prep.py
Normal file
@ -0,0 +1,29 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import nltk
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
|
||||
from tqdm import tqdm
|
||||
# Enable .progress_apply() so the row-wise preprocessing shows a progress bar.
tqdm.pandas()

from preprocessor import fr

print("I")

# Raw Yelp polarity CSV has no header; column 0 is the class, column 1 the review.
t = pd.read_csv("yelp_review_polarity_csv/train.csv", header=None, names=['c', 'r'])

print("R")

# Labels in the raw file are 1/2; shift them down to 0/1.
y = t['c'] - 1

# Clean every review text with the shared preprocessor (with progress bar).
r = t['r'].progress_apply(fr)

# Two Series as rows, transposed back into (label, review) columns.
o = pd.DataFrame([y, r]).T
o.to_csv("prepped_train.csv")
|
||||
23
preprocessor.py
Normal file
23
preprocessor.py
Normal file
@ -0,0 +1,23 @@
|
||||
import re
|
||||
import nltk
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from spellchecker import SpellChecker as sc
|
||||
|
||||
# Fetch the NLTK data packages the cleaning pipeline depends on
# (stopword list, the punkt tokenizer models, and the WordNet corpus).
for _pkg in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(_pkg)
|
||||
|
||||
# Built once at import time: fr() is applied per-row over an entire dataset,
# so rebuilding the stopword set and lemmatizer on every call is pure waste.
_STOPWORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def fr(r):
    """Normalize one review string for modeling.

    Lowercases the text, keeps only word-character runs (which drops all
    punctuation), removes English stopwords, and lemmatizes the remaining
    tokens as verbs.

    Args:
        r: raw review text.

    Returns:
        The cleaned review as a single space-joined string.
    """
    r = r.lower()

    # \w+ keeps only [a-z0-9_] runs; every punctuation/control character is
    # already gone after this, so no further per-character stripping is needed.
    r = " ".join(re.findall(r'\w+', r))

    # word_tokenize emits whitespace-free tokens, so no strip() is required.
    return " ".join(
        _LEMMATIZER.lemmatize(tok, pos='v')
        for tok in word_tokenize(r)
        if tok not in _STOPWORDS
    )
|
||||
Loading…
x
Reference in New Issue
Block a user