Source code for stringcompare.preprocessing.tokenizer

from abc import ABC, abstractmethod

class Tokenizer(ABC):
    """String tokenization interface."""

    def __call__(self, sentence):
        return self.tokenize(sentence)

    @abstractmethod
    def tokenize(self, sentence):
        """Split a sentence into a list of tokens."""
        pass

    def batch_tokenize(self, sentences):
        """Tokenize each sentence in an iterable of sentences."""
        return [self.tokenize(s) for s in sentences]


class DelimTokenizer(Tokenizer):
    """Tokenizer that splits on a fixed delimiter string."""

    def __init__(self, delim=" "):
        self.delim = delim

    def tokenize(self, sentence):
        return sentence.split(self.delim)


class WhitespaceTokenizer(DelimTokenizer):
    """DelimTokenizer that splits on the single-space delimiter."""

    def __init__(self):
        super().__init__(delim=" ")


class NGramTokenizer(Tokenizer):
    """Tokenizer that produces overlapping character n-grams of length n."""

    def __init__(self, n):
        self.n = n

    def tokenize(self, sentence):
        return [sentence[i:i + self.n] for i in range(len(sentence) - self.n + 1)]
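
A brief usage sketch (illustrative, not part of the module source), assuming the classes above are importable from the stringcompare.preprocessing.tokenizer module named in the title; the commented outputs follow directly from the implementations shown.

# Illustrative usage only; import path assumed from the module name above.
from stringcompare.preprocessing.tokenizer import (
    WhitespaceTokenizer,
    NGramTokenizer,
)

ws = WhitespaceTokenizer()
ws("compare these strings")            # ['compare', 'these', 'strings']

bigrams = NGramTokenizer(n=2)
bigrams.tokenize("abcd")               # ['ab', 'bc', 'cd']
bigrams.batch_tokenize(["abc", "de"])  # [['ab', 'bc'], ['de']]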