from difflib import SequenceMatcher
import string
import unicodedata
import numpy as np
from unidecode import unidecode
# [docs] -- Sphinx "view source" link marker left over from HTML extraction; not part of the code.
def _transliteration_match(text_a: str, text_b: str, exact_weight: float) -> float:
    """Score how well two non-empty ASCII transliterations agree.

    When both texts are a single character the result is simply
    ``exact_weight`` if they are equal and ``0.0`` otherwise.

    When at least one text is longer, every character of *text_a* is compared
    with every character of *text_b*.  Each match is weighted by the product
    of two per-position coefficients that fall linearly from 1.5 (first
    character) to 0.5 (last character), so agreeing leading letters count the
    most; a one-character text gets the single coefficient 1.5.  The mean of
    the weighted match matrix is returned.
    """
    if len(text_a) <= 1 and len(text_b) <= 1:
        return exact_weight * (text_a == text_b)

    letters_a = np.array(list(text_a), dtype=str)[None, :]   # row vector
    letters_b = np.array(list(text_b), dtype=str)[:, None]   # column vector
    agreement = (letters_a == letters_b).astype(float)       # all-pairs match matrix
    weights_a = np.linspace(1.5, 0.5, num=len(text_a))[None, :]
    weights_b = np.linspace(1.5, 0.5, num=len(text_b))[:, None]
    return float(np.mean(agreement * weights_a * weights_b))


def char_similarity(a: str, b: str, symmetric: bool = True) -> float:
    """Compute a heuristic similarity score between two characters.

    Identical inputs score exactly 1.0.  Otherwise a raw score is accumulated
    from several independent cues and then squashed into ``[0, 1)``:

    * Unicode general category (same major class, same full category);
    * Unicode character names (shared words, overall string similarity and
      similarity of the first half of each name);
    * case-insensitive equality of alphabetic characters;
    * both being whitespace / both being ASCII punctuation;
    * agreement of the ``unidecode`` ASCII transliterations, compared both
      case-sensitively and case-insensitively, plus a vowel/consonant bonus;
    * a very small bonus for code points that lie close together.

    Args:
        a: First character.
        b: Second character.
        symmetric: When true (the default) the score is averaged with the
            score of the swapped pair so that the result does not depend on
            argument order.

    Returns:
        A similarity in ``[0, 1]``; 1.0 is only returned for equal inputs.

    Raises:
        TypeError: If the inputs differ and either one is not exactly one
            character long (``unicodedata`` lookups and ``ord`` require a
            single character).
    """
    # Basic identity
    if a == b:
        return 1.0

    score = 0.0

    # Unicode general category: first letter is the major class (L, N, P, ...).
    category_a = unicodedata.category(a)
    category_b = unicodedata.category(b)
    score += 0.1 * (category_a[0] == category_b[0])  # Same category class
    score += 0.2 * (category_a == category_b)        # Same category

    # Unicode name similarity; unnamed characters yield "" and skip this cue.
    name_a = unicodedata.name(a, "")
    name_b = unicodedata.name(b, "")
    if name_a and name_b:
        words_a = set(name_a.split())
        words_b = set(name_b.split())
        shared_words = len(words_a & words_b)
        most_words = max(len(words_a), len(words_b))
        # The epsilon is kept from the original formula; most_words is >= 1 here.
        score += 0.3 * (shared_words / (most_words + 1e-11))
        score += 0.2 * SequenceMatcher(None, name_a, name_b).ratio()
        # The first half of a name usually carries script and base letter, so
        # it is compared once more on its own: δ("ά" - "έ") > δ("ά" - "a")
        half_a = name_a[:len(name_a) // 2]
        half_b = name_b[:len(name_b) // 2]
        score += 0.1 * SequenceMatcher(None, half_a, half_b).ratio()

    # Lowercase/uppercase match
    if a.lower() == b.lower() and a.isalpha() and b.isalpha():
        score += 0.2

    # Whitespace match
    if a.isspace() and b.isspace():
        score += 0.2

    # Punctuation match (ASCII punctuation only, per string.punctuation)
    if a in string.punctuation and b in string.punctuation:
        score += 0.2

    # Unidecode match: compare the ASCII transliterations, which may be empty
    # or several characters long.
    ascii_a = unidecode(a)
    ascii_b = unidecode(b)
    if ascii_a and ascii_b:
        score += _transliteration_match(ascii_a, ascii_b, 0.7)
        folded_a, folded_b = ascii_a.lower(), ascii_b.lower()
        score += _transliteration_match(folded_a, folded_b, 0.5)

        # `in` on a str is a substring test, so multi-letter transliterations
        # such as "ae" or "ou" also count as vowels here.
        # NOTE(review): "y" is a vowel for the first argument only.  Combined
        # with the symmetric averaging below this makes "y" count half as a
        # vowel and half as a consonant -- presumably intentional; confirm.
        if folded_a == folded_b:
            score += 0.25
        elif folded_a in "aeiouy" and folded_b in "aeiou":  # Vowel
            score += 0.2
        elif folded_a not in "aeiouy" and folded_b not in "aeiou":  # Consonant
            score += 0.1

    # Ordinal proximity, this should matter very little.  The bonus must
    # shrink as the code points move apart: 2 * sigmoid(-d) goes from ~0.54
    # at d == 1 towards 0.  (The previous sigmoid(+d) grew with distance and
    # therefore rewarded far-apart characters.)  Written with exp(-d) so that
    # large distances underflow to 0 instead of overflowing.
    decay = np.exp(-float(abs(ord(a) - ord(b))))
    ord_proximity = 2.0 * decay / (1.0 + decay)
    score += 0.05 * ord_proximity

    # Normalize: the raw score is >= 0, so 2 * (sigmoid(score) - 0.5) lies in [0, 1).
    score = float(2 * ((1 / (1 + np.exp(-score))) - 0.5))

    if not symmetric:
        return score
    # Ensure symmetry by averaging with the score of the swapped pair.
    return float(score * 0.5 + char_similarity(b, a, symmetric=False) * 0.5)