# Reconstructed from commit bddef39f9cebb8ba667705d20d78d4827bfbb8e1
# (author: hasslesstech, Sat Dec 6 13:36:49 2025 +0200).  That commit added
# two files: preprocessor.py (the `fr` text cleaner) and prep.py (the script
# that applies `fr` to a review CSV).  Both are laid out here as one module,
# so the script's `from preprocessor import fr` is no longer needed.
"""Normalise review text and write a cleaned copy of the training CSV.

``fr`` lower-cases a review, keeps only word characters, drops English stop
words and lemmatises the remaining tokens as verbs.  ``main`` applies it to
every row of the input CSV and writes the result next to the shifted label.
"""
import re

import nltk
import numpy as np  # not referenced below; kept because prep.py imported it
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker as sc  # not referenced below; kept as in preprocessor.py
from tqdm import tqdm

# Fetch the NLTK resources used by stopwords, word_tokenize and the
# lemmatizer.  This runs at import time, exactly as in the original module.
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

# Built once instead of on every fr() call: the cleaner is applied per row,
# and re-reading the stop-word list for each review is pure overhead.
_WORD_RE = re.compile(r"\w+")
_STOPWORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def fr(r: str) -> str:
    """Return a cleaned, lemmatised version of the review text *r*.

    Steps: lower-case, keep only runs of word characters (joined by single
    spaces), tokenise with ``word_tokenize``, drop English stop words and
    lemmatise every remaining token with ``pos='v'``.

    Args:
        r: Raw review text.  A non-string value (for example the float NaN
            pandas produces for an empty CSV cell) is treated as an empty
            review instead of raising ``AttributeError``.

    Returns:
        The surviving lemmas joined by single spaces; ``""`` when nothing
        survives or *r* is not a string.
    """
    if not isinstance(r, str):
        return ""

    # Extracting \w+ runs already removes newlines, quotes and punctuation,
    # so no further character stripping is required afterwards.
    text = " ".join(_WORD_RE.findall(r.lower()))

    # word_tokenize is kept (rather than str.split) because it can split
    # further, e.g. contractions written without an apostrophe.
    return " ".join(
        _LEMMATIZER.lemmatize(token, pos="v")
        for token in word_tokenize(text)
        if token not in _STOPWORDS
    )


def main(
    src: str = "yelp_review_polarity_csv/train.csv",
    dst: str = "prepped_train.csv",
) -> None:
    """Clean every review in *src* and write labels plus cleaned text to *dst*.

    Args:
        src: Header-less CSV whose two columns are read as ``c`` (label) and
            ``r`` (review text).
        dst: Output CSV path; written with the default index column followed
            by ``c`` and ``r``.
    """
    tqdm.pandas()  # registers Series.progress_apply

    print("I")

    t = pd.read_csv(src, header=None, names=["c", "r"])

    print("R")

    # Labels are shifted down by one (presumably 1/2 -> 0/1 for the Yelp
    # polarity data; confirm against the dataset).
    labels = t["c"] - 1
    cleaned = t["r"].progress_apply(fr)

    # A column dict keeps the label column numeric and avoids building a
    # 2 x N object frame only to transpose it; the CSV layout is unchanged.
    pd.DataFrame({"c": labels, "r": cleaned}).to_csv(dst)


if __name__ == "__main__":
    main()