Source code for pylelemmatize.char_distance

from difflib import SequenceMatcher
import string
import unicodedata

import numpy as np
from unidecode import unidecode




def _positional_agreement(text_a: str, text_b: str) -> float:
    """Return the mean position-weighted character agreement of two strings.

    Every character of ``text_a`` is compared with every character of
    ``text_b``. A match is weighted by the product of two per-position
    coefficients produced by ``np.linspace(1.5, 0.5, num=len(text))`` (so a
    one-character string gets the single coefficient 1.5), and the weighted
    match matrix is averaged over all character pairs.
    """
    row = np.array(list(text_a), dtype=str)[None, :]
    column = np.array(list(text_b), dtype=str)[:, None]
    matches = (row == column).astype(float)
    weights_a = np.linspace(1.5, 0.5, num=len(text_a))[None, :]
    weights_b = np.linspace(1.5, 0.5, num=len(text_b))[:, None]
    return np.mean(matches * weights_a * weights_b)


def char_similarity(a: str, b: str, symmetric: bool = True) -> float:
    """Score how alike two characters are using a stack of heuristics.

    The raw score is a sum of independent bonuses (Unicode category, Unicode
    name overlap, case-insensitive equality, whitespace / ASCII-punctuation
    membership, ``unidecode`` transliteration agreement, a vowel/consonant
    bonus and a small code-point term). The sum is then squashed with
    ``2 * (sigmoid(raw) - 0.5)``, which maps a non-negative raw score into
    ``[0, 1)``.

    Args:
        a: First character.
        b: Second character.
        symmetric: When true, the result is the average of the one-directional
            scores for ``(a, b)`` and ``(b, a)``; some heuristics below are
            not symmetric on their own.

    Returns:
        ``1.0`` when ``a == b`` (checked before anything else), otherwise the
        squashed heuristic score.

    Raises:
        TypeError: From ``unicodedata`` / ``ord`` when the arguments differ
            and are not single characters.
    """
    if a == b:
        return 1.0

    raw = 0.0

    # Unnamed code points yield "" and simply skip the name-based terms.
    name_a = unicodedata.name(a, "")
    name_b = unicodedata.name(b, "")

    category_a = unicodedata.category(a)
    category_b = unicodedata.category(b)
    if category_a[0] == category_b[0]:  # same major class, e.g. "L" for letters
        raw += .1
    if category_a == category_b:  # same full category, e.g. "Ll"
        raw += .2

    # NOTE(review): the block nesting was reconstructed from a flattened
    # listing; confirm that all three name-based terms belong under this guard.
    if name_a and name_b:
        words_a = set(name_a.split())
        words_b = set(name_b.split())
        shared = len(words_a & words_b)
        largest = max(len(words_a), len(words_b))
        raw += 0.3 * (shared / (largest + 1e-11))
        raw += 0.2 * SequenceMatcher(None, name_a, name_b).ratio()
        # Compare only the first half of each name as well.
        # Original author's note: δ("ά" - "έ") > δ("ά" - "a")
        half_a = name_a[:len(name_a) // 2]
        half_b = name_b[:len(name_b) // 2]
        raw += 0.1 * SequenceMatcher(None, half_a, half_b).ratio()

    # Same letter up to case.
    if a.lower() == b.lower() and a.isalpha() and b.isalpha():
        raw += 0.2

    # Both whitespace.
    if a.isspace() and b.isspace():
        raw += 0.2

    # Both ASCII punctuation.
    if a in string.punctuation and b in string.punctuation:
        raw += 0.2

    # Transliteration agreement; skipped entirely when either side
    # transliterates to the empty string.
    plain_a = unidecode(a)
    plain_b = unidecode(b)
    if plain_a and plain_b:
        # Case-sensitive pass.
        if len(plain_a) > 1 or len(plain_b) > 1:
            raw += _positional_agreement(plain_a, plain_b)
        elif plain_a == plain_b:
            raw += 0.7

        # Case-insensitive pass.
        folded_a, folded_b = plain_a.lower(), plain_b.lower()
        if len(folded_a) > 1 or len(folded_b) > 1:
            raw += _positional_agreement(folded_a, folded_b)
        elif folded_a == folded_b:
            raw += 0.5

        # NOTE(review): "y" counts as a vowel only for the first argument, and
        # these are substring tests (e.g. "ae" is "in" "aeiouy"); with
        # symmetric=True the two directions are averaged. Confirm intended.
        if folded_a == folded_b:
            raw += 0.25
        elif folded_a in "aeiouy" and folded_b in "aeiou":
            raw += 0.2
        elif folded_a not in "aeiouy" and folded_b not in "aeiou":
            raw += 0.1

    # Code-point term, deliberately given a very small weight.
    # NOTE(review): sigmoid(|ord(a) - ord(b)|) grows with the code-point
    # distance, which looks inverted for a "proximity" bonus; kept as-is so
    # existing scores do not change.
    ord_proximity = 1 / (1 + np.exp(-np.abs(ord(a) - ord(b))))
    raw += .05 * ord_proximity

    # Squash the non-negative raw score into [0, 1).
    squashed = 2 * ((1 / (1 + np.exp(-raw))) - .5)

    if not symmetric:
        return squashed
    # Average both directions so that the result does not depend on order.
    return squashed * .5 + char_similarity(b, a, symmetric=False) * .5