# 2025-12-06 13:36:49 +02:00
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 2025-12-06 15:56:00 +02:00
from spellchecker import SpellChecker as sc

# 2025-12-06 16:36:55 +02:00
# One-time NLTK data downloads — uncomment on first run if the corpora
# (stopwords, punkt tokenizer models, wordnet) are not installed locally.
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")
def fr(r):
    """Normalize raw text for downstream NLP use.

    Pipeline: lowercase -> keep word characters only -> drop English
    stopwords -> lemmatize each remaining token as a verb.

    Parameters
    ----------
    r : str
        Raw input text.

    Returns
    -------
    str
        Space-joined, cleaned, lemmatized tokens.
    """
    r = r.lower()

    # \w+ keeps only alphanumeric/underscore runs; punctuation and
    # newline characters are already gone after this join, so no
    # separate character-stripping pass is needed.
    r = " ".join(re.findall(r'\w+', r))

    sw = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Tokens contain no surrounding whitespace at this point, so a plain
    # membership test against the stopword set is sufficient.
    tokens = [t for t in word_tokenize(r) if t not in sw]

    # pos='v' collapses verb inflections ("running" -> "run").
    return " ".join(lemmatizer.lemmatize(t, pos='v') for t in tokens)
def frs(r):
    """Normalize raw text with spelling correction.

    Same pipeline as ``fr`` — lowercase -> keep word characters only ->
    drop English stopwords — but each surviving token is spell-corrected
    before being lemmatized as a verb.

    Parameters
    ----------
    r : str
        Raw input text.

    Returns
    -------
    str
        Space-joined, spell-corrected, lemmatized tokens.
    """
    r = r.lower()

    # \w+ keeps only alphanumeric/underscore runs; punctuation and
    # newline characters are already gone after this join, so no
    # separate character-stripping pass is needed.
    r = " ".join(re.findall(r'\w+', r))

    sw = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    checker = sc()

    tokens = [t for t in word_tokenize(r) if t not in sw]

    # SpellChecker.correction() returns None when no candidate is found;
    # keep the original token in that case.
    for k, t in enumerate(tokens):
        w = checker.correction(t)
        if w:
            tokens[k] = w

    # pos='v' collapses verb inflections ("running" -> "run").
    return " ".join(lemmatizer.lemmatize(t, pos='v') for t in tokens)