import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker as sc

# One-time NLTK resource downloads — uncomment on first run.
#nltk.download("stopwords")
#nltk.download("punkt_tab")
#nltk.download("wordnet")


def _preprocess(text, spellcheck):
    """Normalize *text* and return the cleaned string.

    Pipeline: lowercase -> keep only ``\\w+`` runs (this alone removes all
    punctuation, so no per-character replace pass is needed) -> tokenize ->
    drop English stopwords -> optionally spell-correct each token ->
    lemmatize as verbs -> rejoin with single spaces.

    :param text: raw input string
    :param spellcheck: when True, run pyspellchecker's ``correction`` on
        every surviving token (slow; one dictionary lookup per token)
    :return: space-joined cleaned tokens
    """
    text = text.lower()
    # NOTE: the original also looped replacing '\n', ',', '.', etc. after
    # this join — dead code, since \w+ matches cannot contain those chars.
    text = " ".join(re.findall(r'\w+', text))
    stop_words = set(stopwords.words("english"))
    tokens = [tok.strip() for tok in word_tokenize(text)
              if tok.strip() not in stop_words]
    if spellcheck:
        checker = sc()
        # correction() returns None for words it cannot fix; keep the
        # original token in that case (matches the original guard).
        tokens = [checker.correction(tok) or tok for tok in tokens]
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(tok, pos='v') for tok in tokens)


def fr(r):
    """Clean *r* WITHOUT spell-checking (fast path)."""
    return _preprocess(r, spellcheck=False)


def frs(r):
    """Clean *r* WITH spell-checking (slow path)."""
    return _preprocess(r, spellcheck=True)