This commit is contained in:
ІО-23 Шмуляр Олег 2025-12-06 13:36:49 +02:00
commit bddef39f9c
2 changed files with 52 additions and 0 deletions

29
prep.py Normal file
View File

@@ -0,0 +1,29 @@
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()
from preprocessor import fr
# Script body: load the raw Yelp polarity training CSV, preprocess every
# review text with fr(), and write the result to "prepped_train.csv".

print("I")  # progress marker: imports finished, about to read the CSV

# The raw file has no header row; name the two columns 'c' and 'r'.
# NOTE(review): presumably 'c' is the class label and 'r' the review text.
train = pd.read_csv(
    "yelp_review_polarity_csv/train.csv",
    header=None,
    names=['c', 'r'],
)

print("R")  # progress marker: CSV loaded, preprocessing starts

# Shift the label column down by one.
# NOTE(review): presumably the raw labels are 1/2, giving 0/1 here -- confirm.
labels = train['c'] - 1

# Run the text preprocessor over every review, with a tqdm progress bar
# (progress_apply is registered by the tqdm.pandas() call above).
reviews = train['r'].progress_apply(fr)

# Build the two-column frame directly.  Stacking the Series as rows and
# transposing (pd.DataFrame([labels, reviews]).T) creates a 2 x N object
# frame first, which is slow and degrades the integer labels to object dtype.
# Column names and the row index are the same as before, so the CSV layout
# (index column, then 'c', then 'r') is unchanged.
prepped = pd.DataFrame({'c': labels, 'r': reviews})
prepped.to_csv("prepped_train.csv")

23
preprocessor.py Normal file
View File

@@ -0,0 +1,23 @@
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker as sc
# Fetch the NLTK data packages at import time, in a fixed order.
# NOTE(review): these look like the resources fr() relies on (stop-word
# list, tokenizer models, WordNet for lemmatization) -- confirm.
for _resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(_resource)
# Resources shared by every fr() call.  They are built lazily on first use so
# that applying fr() to a whole DataFrame column does not rebuild the
# stop-word set and the lemmatizer once per row.
_FR_RESOURCES = {}


def _fr_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _FR_RESOURCES:
        _FR_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _FR_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _FR_RESOURCES["stop_words"], _FR_RESOURCES["lemmatizer"]


def fr(r):
    """Normalize one review text into a space-separated string of lemmas.

    Processing steps:
      1. lowercase the text;
      2. keep only runs of regex word characters, joined by single spaces
         (this already removes all punctuation and line breaks);
      3. tokenize with ``word_tokenize``;
      4. drop English stop words;
      5. lemmatize every remaining token as a verb (``pos='v'``).

    Args:
        r: The review text.  ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty review.

    Returns:
        The cleaned text as a single string; ``""`` when nothing remains or
        the input is missing.
    """
    # Missing values have no .lower(); treat them as empty text instead of
    # aborting a whole-column apply.  ``r != r`` is true only for NaN.
    if r is None or (isinstance(r, float) and r != r):
        return ""

    stop_words, lemmatizer = _fr_resources()

    # After this step the text consists solely of word characters and single
    # spaces, so no further punctuation stripping is required.
    cleaned = " ".join(re.findall(r'\w+', r.lower()))

    return " ".join(
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(cleaned)
        if token not in stop_words
    )