24 lines
612 B
Python
24 lines
612 B
Python
|
|
import re
|
||
|
|
import nltk
|
||
|
|
from nltk.corpus import stopwords
|
||
|
|
from nltk.tokenize import word_tokenize
|
||
|
|
from nltk.stem import WordNetLemmatizer
|
||
|
|
from spellchecker import SpellChecker as sc
|
||
|
|
|
||
|
|
# Download the NLTK resources this module uses (stop words, punkt_tab
# tokenizer data, WordNet) when the module is imported.
for _resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(_resource)
|
||
|
|
|
||
|
|
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def fr(r):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lower-cased, reduced to its runs of word characters (which
    discards punctuation and all whitespace), tokenized with NLTK, filtered
    against the English stop-word list, and every remaining token is
    lemmatized as a verb.

    Args:
        r: The input text (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives.
    """
    # Keeping only word-character runs already removes newlines, quotes and
    # punctuation, so a separate character-stripping pass would be a no-op.
    words_only = " ".join(re.findall(r"\w+", r.lower()))

    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()

    # word_tokenize is retained rather than a plain split(): its tokenization
    # may differ from whitespace splitting, and callers rely on its output.
    lemmas = [
        lemmatizer.lemmatize(token, pos="v")
        for token in word_tokenize(words_only)
        if token not in stop_words
    ]
    return " ".join(lemmas)
|