# neuro-lab7/preprocessor.py

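"""Text preprocessing helpers for neuro-lab7.

fr(r)  -- lowercase, strip punctuation, drop English stopwords, lemmatize verbs.
frs(r) -- the same pipeline plus pyspellchecker-based spelling correction.
"""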
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
# One-time NLTK resource downloads; uncomment on the first run.
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")
def fr(r):
    """Clean text without spellchecking: lowercase, strip punctuation,
    drop English stopwords, and lemmatize each token as a verb."""
    r = r.lower()
    # Keep only word characters; this also strips all punctuation and newlines.
    r = " ".join(re.findall(r"\w+", r))
    sw = set(stopwords.words("english"))
    l = WordNetLemmatizer()
    r = [i.strip() for i in word_tokenize(r) if i.strip() not in sw]
    # Spellchecking is intentionally skipped here; see frs() below.
    return " ".join(l.lemmatize(i, pos="v") for i in r)


def frs(r):
    """Like fr(), but also spell-correct each token with pyspellchecker
    before lemmatizing."""
    r = r.lower()
    # Keep only word characters; this also strips all punctuation and newlines.
    r = " ".join(re.findall(r"\w+", r))
    sw = set(stopwords.words("english"))
    l = WordNetLemmatizer()
    c = SpellChecker()
    r = [i.strip() for i in word_tokenize(r) if i.strip() not in sw]
    # Replace each token with its most likely correction, when one exists.
    for k, i in enumerate(r):
        w = c.correction(i)
        if w:
            r[k] = w
    return " ".join(l.lemmatize(i, pos="v") for i in r)
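

# A minimal usage sketch, assuming the NLTK resources above are installed.
# The sample sentence (with its deliberate typo) is illustrative only.
if __name__ == "__main__":
    sample = "The rats were runnning quickly through the mazes."
    print(fr(sample))   # stopwords removed, verbs lemmatized; the typo is kept
    print(frs(sample))  # pyspellchecker should also fix "runnning" -> "running"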