neuro-lab7/preprocessor.py

24 lines
612 B
Python
Raw Normal View History

2025-12-06 13:36:49 +02:00
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker as sc
# Fetch the NLTK data packages used by this module (stop-word list,
# tokenizer models, WordNet) at import time, in this order.
for _resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(_resource)
def fr(r):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lower-cased, reduced to its runs of word characters,
    tokenized with NLTK's ``word_tokenize``, filtered against NLTK's English
    stop-word list, and every surviving token is lemmatized as a verb
    (``pos='v'``).

    Args:
        r: The input text; must provide ``lower()`` like a ``str``.

    Returns:
        The resulting lemmas joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # Keeping only word-character runs already discards punctuation, quotes
    # and line breaks, so no further character stripping is needed.
    text = " ".join(re.findall(r'\w+', r.lower()))

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Strip each token once, then reuse it for both the stop-word test and
    # the lemmatization.
    tokens = (token.strip() for token in word_tokenize(text))
    return " ".join(
        lemmatizer.lemmatize(token, pos='v')
        for token in tokens
        if token not in stop_words
    )