from abc import ABC, abstractmethod
class Tokenizer(ABC):
    """Abstract interface for splitting a string into a list of tokens.

    Subclasses must implement :meth:`tokenize`; calling the instance
    directly is a convenience alias for it.
    """

    def __call__(self, sentence):
        """Tokenize *sentence* — alias for :meth:`tokenize`."""
        return self.tokenize(sentence)

    @abstractmethod
    def tokenize(self, sentence):
        """Split *sentence* (a str) into a list of tokens."""

    def batch_tokenize(self, sentences):
        """Tokenize each sentence in *sentences*.

        Returns a list of token lists, one per input sentence.
        """
        return [self.tokenize(s) for s in sentences]
class DelimTokenizer(Tokenizer):
    """Tokenizer that splits a sentence on a fixed delimiter string.

    Parameters
    ----------
    delim : str
        Delimiter passed to :meth:`str.split` (default: a single space).
    """

    def __init__(self, delim=" "):
        self.delim = delim

    def tokenize(self, sentence):
        """Split *sentence* on ``self.delim``.

        Note: ``str.split(delim)`` keeps empty strings between
        consecutive delimiters, unlike delimiter-less ``str.split()``.
        """
        return sentence.split(self.delim)
class WhitespaceTokenizer(DelimTokenizer):
    """Tokenizer that splits on single space characters.

    Equivalent to ``DelimTokenizer(" ")``.  Because the split is on a
    literal space, runs of consecutive spaces produce empty-string
    tokens (``str.split(" ")`` semantics, not whitespace-collapsing
    ``str.split()``).
    """

    def __init__(self):
        super().__init__(delim=" ")
class NGramTokenizer(Tokenizer):
    """Tokenizer that produces all overlapping character n-grams.

    Parameters
    ----------
    n : int
        Length of each n-gram; must be at least 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (the original code silently produced
        degenerate empty-string grams for ``n <= 0``).
    """

    def __init__(self, n):
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n!r}")
        self.n = n

    def tokenize(self, sentence):
        """Return every length-``n`` substring of *sentence*, in order.

        Sentences shorter than ``n`` yield an empty list.
        """
        n = self.n  # hoist attribute lookup out of the comprehension
        return [sentence[i:i + n] for i in range(len(sentence) - n + 1)]