"""Troll spelling corrector.

Detects the colloquial/surzhyk Ukrainian words for "link" ("силка",
"лінка" and all their declensions) in a message and replies with a
mock-pedantic correction to the standard word "посилання", quoting a
short extract of the message with the word replaced.
"""

import re

# Characters of surrounding context shown on each side of the corrected
# word in the reply.
EXTRACT_PADDING = 11
# When True, the context window is widened outwards so it never cuts a
# word in half at either edge.
EXTRACT_INCLUDE_WHOLE_WORDS = True

# Correct forms of "посилання", one per grammatical group; the index of
# each entry pairs it with the group of misspellings in `replacements`.
corrections = [
    "посилання",     # 0
    "посиланню",     # 1
    "посиланням",    # 2
    "на посиланні",  # 3
    "посилань",      # 4
    "посиланнями",   # 5
    "посиланнях",    # 6
]

# Misspelled / slang forms, grouped so that every word in
# replacements[i] is corrected by corrections[i].
replacements = [
    ["силка", "силки", "силку", "силко",
     "лінк", "лінка", "лінки", "лінку", "лінке", "лінко"],
    ["силці",
     "лінку", "лінці"],
    ["силкам", "силкою",
     "лінкам", "лінком", "лінкою"],
    ["на силці",
     "на лінку", "на лінці"],
    ["силок",
     "лінок"],
    ["силками",
     "лінками"],
    ["силках",
     "лінках"],
]

# Lowercase Ukrainian alphabet, used to build word boundaries that work
# for Cyrillic text (ASCII-centric \b alone is not sufficient here).
ua_alphabet = "абвгґдеєжзиіїйклмнопрстуфхцчшщьюя"


def _compile_word(word):
    """Compile *word* with Ukrainian-aware word boundaries.

    A match position must not be preceded or followed by a Ukrainian
    letter.  This single pair of negative lookarounds accepts exactly
    the same positions as the previous alternation of
    (?<=[^…])|(?<=\\b)|(?<=^)|(?<= ) lookbehinds: string start/end and
    any adjacent non-Ukrainian character.
    """
    return re.compile(fr"(?<![{ua_alphabet}]){word}(?![{ua_alphabet}])")


# Flat, precompiled rule table built once at import time:
# (misspelled word, its compiled regex, the correct replacement form).
# Replaces the old in-place mutation of `replacements`, which destroyed
# the raw word lists; iteration order (group-major, word-minor) is kept.
_rules = [
    (word, _compile_word(word), corrections[group_id])
    for group_id, group in enumerate(replacements)
    for word in group
]


def process(message, path):
    """Return a correction reply for *message*, or an empty reply.

    Args:
        message: incoming message object; only `message.text` is read.
        path: unused here, part of the module's handler interface.

    Returns:
        ("\"<extract>\" -> \"<fixed extract>\" 🌚", None) for the first
        rule (in table order, not text order) that matches, otherwise
        ("", None).
    """
    text = message.text
    # Match case-insensitively; lowercasing preserves offsets, so match
    # positions remain valid indices into the original text.
    lowered = text.lower()

    for match_word, matcher, correct_word in _rules:
        found = matcher.search(lowered)
        if not found:
            continue

        length = len(text)
        start, end = found.start(), found.end()

        mistake = text[start:end]
        before = text[max(start - EXTRACT_PADDING, 0):start]
        after = text[end:min(end + EXTRACT_PADDING, length)]

        if EXTRACT_INCLUDE_WHOLE_WORDS:
            # Slide the window edges outwards while the character just
            # beyond the window is alphanumeric (i.e. the edge would
            # split a word).
            while (0 <= start - EXTRACT_PADDING - 1 < length
                   and text[start - EXTRACT_PADDING - 1].isalnum()):
                start -= 1
            before = text[max(start - EXTRACT_PADDING, 0):found.start()]

            while (0 <= end + EXTRACT_PADDING < length
                   and text[end + EXTRACT_PADDING].isalnum()):
                end += 1
            after = text[found.end():min(end + EXTRACT_PADDING, length)]

        # Show ellipses only where text was actually truncated.
        prefix = "..." if start > EXTRACT_PADDING else ""
        suffix = "..." if length - end > EXTRACT_PADDING else ""

        # Mirror the casing of the matched word in the correction.
        if mistake == match_word.capitalize():
            correct_word = correct_word.capitalize()
        elif mistake == match_word.upper():
            correct_word = correct_word.upper()

        original_extract = prefix + before + mistake + after + suffix
        fixed_extract = prefix + before + correct_word + after + suffix
        return f'"{original_extract}" -> "{fixed_extract}" 🌚', None

    return "", None