from .comparator import StringComparator
from ..preprocessing.tokenizer import Tokenizer
from ..distance.levenshtein import Levenshtein
from ..preprocessing.tokenizer import WhitespaceTokenizer
[docs]class MongeElkan(StringComparator):
def __init__(
self,
comparator: StringComparator = Levenshtein(),
tokenizer: Tokenizer = WhitespaceTokenizer(),
symmetrize=False,
):
self.comparator = comparator
self.tokenizer = tokenizer
self.symmetrize = symmetrize
[docs] def monge_elkan(self, s: str, t: str):
s_tokens = self.tokenizer(s)
t_tokens = self.tokenizer(t)
return sum(min(self.comparator(i, j) for j in t_tokens) for i in s_tokens) / len(s_tokens)
[docs] def compare(self, s: str, t: str):
if self.symmetrize:
return min(self.monge_elkan(s, t), self.monge_elkan(t, s))
else:
return self.monge_elkan(s, t)