troll-spelling-corrector: improve the detection and correction algorithms in the most horrendous way

This commit is contained in:
dymik739 2024-04-06 23:27:13 +03:00
parent 14ab2d3038
commit b02cee4fdb
1 changed file with 98 additions and 7 deletions

View File

@ -1,5 +1,6 @@
import re import re
''' Grammar reference
all_possible_match_strings = [ all_possible_match_strings = [
"силка", "силки", "силка", "силки",
"силки", "силок", "силки", "силок",
@ -11,23 +12,113 @@ all_possible_match_strings = [
"лінк", "лінка", "лінки", "лінк", "лінка", "лінки",
"лінку", "лінки", "лінок", "лінку", "лінки", "лінок",
"лінці", "лінці", "лінкам", "лінку", "лінці", "лінкам",
"лінк", "лінку", "лінки", "лінк", "лінку", "лінки",
"лінком", "лінкою", "лінками", "лінком", "лінкою", "лінками",
"лінку", "лінці", "лінках", "лінку", "лінці", "лінках",
"лінке", "лінко", "лінки" "лінке", "лінко", "лінки"
] ]
'''
# Characters of context shown on each side of a detected mistake in the
# quoted extract produced by process().
EXTRACT_PADDING = 11
# When True, the context window is widened so it never cuts a word in half.
EXTRACT_INCLUDE_WHOLE_WORDS = True

# Correct forms of "посилання" (link), indexed by "correction ID": the ID is
# the position of the corresponding misspelling group in `replacements` below.
corrections = [  # IDs
    "посилання",    # 0
    "посиланню",    # 1
    "посиланням",   # 2
    "на посиланні", # 3
    "посилань",     # 4
    "посиланнями",  # 5
    "посиланнях"    # 6
]

# Misspellings grouped by grammatical case/number; group index i maps to
# corrections[i].
replacements = [
    ["силка", "силки", "силку", "силко",
     "лінк", "лінка", "лінки", "лінку", "лінке", "лінко"],
    ["силці",
     "лінку", "лінці"],
    ["силкам", "силкою",
     "лінкам", "лінком", "лінкою"],
    ["на силці",
     "на лінку", "на лінці"],
    ["силок",
     "лінок"],
    ["силками",
     "лінками"],
    ["силках",
     "лінках"]
]

ua_alphabet = "абвгґдеєжзиіїйклмнопрстуфхцчшщьюя"

# Compile every misspelling into a [word, compiled_regex] pair, in place.
# The pattern anchors the word at Ukrainian-letter boundaries: a lookbehind/
# lookahead on a non-alphabet character, a \b boundary, string edges, or a
# space — presumably because plain \b alone was not trusted for Cyrillic
# text (NOTE(review): confirm; an earlier \b-only attempt was replaced).
for i, group in enumerate(replacements):
    for j, match_word in enumerate(group):
        replacements[i][j] = [
            match_word,
            re.compile(
                fr"((?<=[^{ua_alphabet}])|(?<=\b)|(?<=^)|(?<= ))"
                fr"{match_word}"
                fr"((?=[^{ua_alphabet}])|(?=\b)|(?=$)|(?= ))"
            ),
        ]
def process(message, path):
    """Find a misspelled word for "link" in ``message.text`` and suggest a fix.

    Scans the lowercased message with the pre-compiled patterns in the
    module-level ``replacements`` table.  On the first hit, builds a short
    extract of the original text around the mistake (with ``...`` ellipses
    when the extract does not reach the message edges) and the same extract
    with the correct form from ``corrections`` substituted, mirroring the
    capitalisation of the original mistake.

    Returns a ``(reply_text, attachment)`` tuple: ``("", None)`` when nothing
    matched, otherwise ``('"<original>" -> "<fixed>" 🌚', None)``.

    ``path`` is accepted for interface compatibility and is not used here.
    """
    text = message.text
    text_len = len(text)
    lowercase_message = text.lower()

    for correct_word_id, group in enumerate(replacements):
        for match_word, matcher in group:
            result = matcher.search(lowercase_message)
            if not result:
                continue

            mistake_start = result.start()
            mistake_end = result.end()

            # Fixed-width context around the mistake (original casing).
            original_text_before = text[max(mistake_start - EXTRACT_PADDING, 0):mistake_start]
            original_text_after = text[mistake_end:min(mistake_end + EXTRACT_PADDING, text_len)]
            original_text_mistake = text[mistake_start:mistake_end]

            if EXTRACT_INCLUDE_WHOLE_WORDS:
                # Widen the left edge of the window until it no longer
                # splits a word.
                while 0 <= mistake_start - EXTRACT_PADDING - 1 < text_len and \
                        text[mistake_start - EXTRACT_PADDING - 1].isalnum():
                    mistake_start -= 1
                original_text_before = text[max(mistake_start - EXTRACT_PADDING, 0):result.start()]
                # Widen the right edge likewise.
                while 0 <= mistake_end + EXTRACT_PADDING < text_len and \
                        text[mistake_end + EXTRACT_PADDING].isalnum():
                    mistake_end += 1
                original_text_after = text[result.end():min(mistake_end + EXTRACT_PADDING, text_len)]

            # Ellipses when the extract does not reach the message edges.
            original_text_before_continue = "..." if len(text[:mistake_start]) > EXTRACT_PADDING else ""
            original_text_after_continue = "..." if len(text[mistake_end:]) > EXTRACT_PADDING else ""

            original_extract = (original_text_before_continue + original_text_before
                                + original_text_mistake + original_text_after
                                + original_text_after_continue)

            # Mirror Capitalised / ALL-CAPS mistakes in the suggested word.
            correct_word = corrections[correct_word_id]
            if original_text_mistake == match_word.capitalize():
                correct_word = corrections[correct_word_id].capitalize()
            elif original_text_mistake == match_word.upper():
                correct_word = corrections[correct_word_id].upper()

            fixed_extract = (original_text_before_continue + original_text_before
                             + correct_word + original_text_after
                             + original_text_after_continue)

            return f'"{original_extract}" -> "{fixed_extract}" 🌚', None

    return "", None