import re
''' Grammar reference

all_possible_match_strings = [
    "силка", "силки",
    "силки", "силок",
    "силці", "силкам",
    "силку", "силки",
    "силкою", "силками",
    "силці", "силках",
    "силко", "силки",

    "лінк", "лінка", "лінки",
    "лінку", "лінки", "лінок",
    "лінку", "лінці", "лінкам",
    "лінк", "лінку", "лінки",
    "лінком", "лінкою", "лінками",
    "лінку", "лінці", "лінках",
    "лінке", "лінко", "лінки"
]
'''
# Number of context characters shown on each side of a detected mistake.
EXTRACT_PADDING = 11
# When True, the context window is widened so it never cuts a word in half.
EXTRACT_INCLUDE_WHOLE_WORDS = True

# Correct forms of "посилання" ("link"); the index of each entry is the
# group id shared with `replacements` below.
corrections = [  # IDs
    "посилання",     # 0
    "посиланню",     # 1
    "посиланням",    # 2
    "на посиланні",  # 3
    "посилань",      # 4
    "посиланнями",   # 5
    "посиланнях"     # 6
]

# Ukrainian alphabet, used to build word-boundary assertions around matches.
ua_alphabet = "абвгґдеєжзиіїйклмнопрстуфхцчшщьюя"


def _word_matcher(word):
    """Compile a regex that matches *word* only as a standalone word.

    The word must be preceded by a non-Ukrainian-letter character, a word
    boundary, start of string, or a space — and symmetrically followed by
    one of the same (or end of string).
    """
    return re.compile(
        fr"((?<=[^{ua_alphabet}])|(?<=\b)|(?<=^)|(?<= )){word}"
        fr"((?=[^{ua_alphabet}])|(?=\b)|(?=$)|(?= ))"
    )


# Misspelled calque forms ("силка"/"лінк"), grouped by the grammatical case
# of the correction with the same index in `corrections`.  Each entry is a
# [word, compiled matcher] pair, built once at import time.
replacements = [
    [[word, _word_matcher(word)] for word in group]
    for group in [
        ["силка", "силки", "силку", "силко",
         "лінк", "лінка", "лінки", "лінку", "лінке", "лінко"],
        ["силці",
         "лінку", "лінці"],
        ["силкам", "силкою",
         "лінкам", "лінком", "лінкою"],
        ["на силці",
         "на лінку", "на лінці"],
        ["силок",
         "лінок"],
        ["силками",
         "лінками"],
        ["силках",
         "лінках"],
    ]
]
def process(message, path):
    """Scan *message.text* for misspelled forms of "посилання" ("link")
    and build a correction suggestion for the first one found.

    Parameters:
        message: an object with a ``.text`` string attribute.
        path: unused here; presumably kept for a common handler signature.

    Returns:
        A 2-tuple ``(reply_text, attachment)``.  ``reply_text`` is a
        ``"...mistake..." -> "...fixed..." 🌚`` suggestion, or ``""`` when
        no mistake is found.  The second element is always ``None``.
    """
    text = message.text
    text_len = len(text)
    # NOTE(review): match positions come from the lowercased text but are
    # applied to the original — assumes .lower() preserves string length,
    # which holds for Ukrainian/Latin input.
    lowercase_text = text.lower()

    for correct_word_id, group in enumerate(replacements):
        for match_word, matcher in group:
            result = matcher.search(lowercase_text)
            if not result:
                continue

            mistake_start = result.start()
            mistake_end = result.end()
            original_text_mistake = text[mistake_start:mistake_end]

            # Fixed-size context on each side of the mistake.
            before = text[max(mistake_start - EXTRACT_PADDING, 0):mistake_start]
            after = text[mistake_end:min(mistake_end + EXTRACT_PADDING, text_len)]

            if EXTRACT_INCLUDE_WHOLE_WORDS:
                # Widen the window leftwards until the character just
                # outside it is not alphanumeric (don't cut a word in half).
                while (0 <= mistake_start - EXTRACT_PADDING - 1 < text_len
                       and text[mistake_start - EXTRACT_PADDING - 1].isalnum()):
                    mistake_start -= 1
                before = text[max(mistake_start - EXTRACT_PADDING, 0):result.start()]

                # Same widening for the right edge of the window.
                while (0 <= mistake_end + EXTRACT_PADDING < text_len
                       and text[mistake_end + EXTRACT_PADDING].isalnum()):
                    mistake_end += 1
                after = text[result.end():min(mistake_end + EXTRACT_PADDING, text_len)]

            # Ellipses when the extract does not reach the ends of the text.
            lead = "..." if mistake_start > EXTRACT_PADDING else ""
            trail = "..." if text_len - mistake_end > EXTRACT_PADDING else ""

            original_extract = lead + before + original_text_mistake + after + trail

            # Mirror the capitalisation of the mistake in the correction.
            correct_word = corrections[correct_word_id]
            if original_text_mistake == match_word.capitalize():
                correct_word = correct_word.capitalize()
            elif original_text_mistake == match_word.upper():
                correct_word = correct_word.upper()

            fixed_extract = lead + before + correct_word + after + trail

            return f'"{original_extract}" -> "{fixed_extract}" 🌚', None

    # No pattern matched anywhere in the message.
    return "", None