
TokenSampler

Token Sampler module. Provides classes for sampling tokens from text.

Classes

BaseTokenSampler

Bases: ABC, TokenSampler

BaseTokenSampler is the base class for token samplers. It uses the given Tokenizer to convert a text into tokens. Classes inheriting from BaseTokenSampler only have to implement sample_tokens.
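
For illustration, a minimal subclass only has to provide sample_tokens. The sketch below is hypothetical (the class name and its length-based filter are not part of simplemma); it keeps tokens longer than three characters:

from typing import Iterable, List

from simplemma.token_sampler import BaseTokenSampler


class LongTokenSampler(BaseTokenSampler):
    """Hypothetical sampler keeping only tokens longer than three characters."""

    def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
        # BaseTokenSampler's tokenizer has already split the text into tokens;
        # this hook only decides which of those tokens to keep.
        return [token for token in tokens if len(token) > 3]


sampler = LongTokenSampler()
print(sampler.sample_text("The quick brown fox jumps over the lazy dog"))
# With the default tokenizer this should print something like:
# ['quick', 'brown', 'jumps', 'over', 'lazy']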

Source code in simplemma/token_sampler.py
class BaseTokenSampler(ABC, TokenSampler):
    """
    BaseTokenSampler is the base class for token samplers.
    It uses the given Tokenizer to convert a text in token.
    Classes inheriting from BaseTokenSampler only have to implement sample_tokens.
    """

    __slots__ = ["_tokenizer"]

    def __init__(
        self,
        tokenizer: Tokenizer = RegexTokenizer(SPLIT_INPUT),
    ) -> None:
        """
        Initialize the BaseTokenSampler.

        Args:
            tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
                Defaults to `RegexTokenizer(SPLIT_INPUT)`.
        """
        self._tokenizer = tokenizer

    def sample_text(self, text: str) -> List[str]:
        """
        Sample tokens from the input text.

        Args:
            text (str): The input text to sample tokens from.

        Returns:
            List[str]: The sampled tokens.

        """
        return self.sample_tokens(self._tokenizer.split_text(text))

    @abstractmethod
    def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
        """
        Sample tokens from the given iterable of tokens.

        Args:
            tokens (Iterable[str]): The iterable of tokens to sample from.

        Returns:
            List[str]: The sampled tokens.

        """
        raise NotImplementedError

Functions

__init__(tokenizer=RegexTokenizer(SPLIT_INPUT))

Initialize the BaseTokenSampler.

Parameters:

    tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens. Defaults to RegexTokenizer(SPLIT_INPUT).
Source code in simplemma/token_sampler.py
def __init__(
    self,
    tokenizer: Tokenizer = RegexTokenizer(SPLIT_INPUT),
) -> None:
    """
    Initialize the BaseTokenSampler.

    Args:
        tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
            Defaults to `RegexTokenizer(SPLIT_INPUT)`.
    """
    self._tokenizer = tokenizer
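
Any Tokenizer implementation can be injected. As a hedged example, a sampler can be built around a custom splitting pattern; the pattern below is purely illustrative, and both the RegexTokenizer import path and its constructor signature are assumptions inferred from the default RegexTokenizer(SPLIT_INPUT) shown above:

import re

from simplemma.token_sampler import MostCommonTokenSampler
from simplemma.tokenizer import RegexTokenizer  # import path assumed

# Hypothetical pattern: alphabetic runs of at least two characters.
word_pattern = re.compile(r"[A-Za-z]{2,}")

custom_sampler = MostCommonTokenSampler(tokenizer=RegexTokenizer(word_pattern))
print(custom_sampler.sample_text("Some text with identifiers like AB12 and CD34"))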

sample_text(text)

Sample tokens from the input text.

Parameters:

    text (str): The input text to sample tokens from. Required.

Returns:

    List[str]: The sampled tokens.

Source code in simplemma/token_sampler.py
def sample_text(self, text: str) -> List[str]:
    """
    Sample tokens from the input text.

    Args:
        text (str): The input text to sample tokens from.

    Returns:
        List[str]: The sampled tokens.

    """
    return self.sample_tokens(self._tokenizer.split_text(text))

sample_tokens(tokens) abstractmethod

Sample tokens from the given iterable of tokens.

Parameters:

    tokens (Iterable[str]): The iterable of tokens to sample from. Required.

Returns:

    List[str]: The sampled tokens.

Source code in simplemma/token_sampler.py
@abstractmethod
def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
    """
    Sample tokens from the given iterable of tokens.

    Args:
        tokens (Iterable[str]): The iterable of tokens to sample from.

    Returns:
        List[str]: The sampled tokens.

    """
    raise NotImplementedError

MostCommonTokenSampler

Bases: BaseTokenSampler

Token sampler that selects the most common tokens.
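
A short usage sketch; the printed tokens are indicative only, since the exact split depends on the default SPLIT_INPUT pattern:

from simplemma.token_sampler import MostCommonTokenSampler

sampler = MostCommonTokenSampler(sample_size=3)
text = "the cat sat on the mat and the cat slept"
print(sampler.sample_text(text))
# Favours the most frequent tokens, e.g. ['the', 'cat', 'sat']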

Source code in simplemma/token_sampler.py
class MostCommonTokenSampler(BaseTokenSampler):
    """Token sampler that selects the most common tokens."""

    __slots__ = ["_capitalized_threshold", "_sample_size"]

    def __init__(
        self,
        tokenizer: Tokenizer = RegexTokenizer(SPLIT_INPUT),
        sample_size: int = 100,
        capitalized_threshold: float = 0.8,
    ) -> None:
        """
        Initialize the MostCommonTokenSampler.

        Args:
            tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
                Defaults to `RegexTokenizer(SPLIT_INPUT)`.
            sample_size (int, optional): The number of tokens to sample. Defaults to `100`.
            capitalized_threshold (float, optional): The threshold for removing capitalized tokens.
                Tokens with a frequency greater than this threshold will be removed. Defaults to `0.8`.
        """
        super().__init__(tokenizer)
        self._sample_size = sample_size
        self._capitalized_threshold = capitalized_threshold

    def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
        """
        Sample tokens from the given iterable of tokens.

        Args:
            tokens (Iterable[str]): The iterable of tokens to sample from.

        Returns:
            List[str]: The sampled tokens.

        """
        counter = Counter(tokens)

        if self._capitalized_threshold > 0:
            deletions = [token for token in counter if token[0].isupper()]
            if len(deletions) < self._capitalized_threshold * len(counter):
                for token in deletions:
                    del counter[token]

        return [item[0] for item in counter.most_common(self._sample_size)]

Functions

__init__(tokenizer=RegexTokenizer(SPLIT_INPUT), sample_size=100, capitalized_threshold=0.8)

Initialize the MostCommonTokenSampler.

Parameters:

    tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens. Defaults to RegexTokenizer(SPLIT_INPUT).
    sample_size (int, optional): The number of tokens to sample. Defaults to 100.
    capitalized_threshold (float, optional): Threshold controlling the removal of capitalized tokens: if capitalized tokens make up less than this fraction of the distinct tokens, they are all removed from the sample. Defaults to 0.8.
Source code in simplemma/token_sampler.py
def __init__(
    self,
    tokenizer: Tokenizer = RegexTokenizer(SPLIT_INPUT),
    sample_size: int = 100,
    capitalized_threshold: float = 0.8,
) -> None:
    """
    Initialize the MostCommonTokenSampler.

    Args:
        tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
            Defaults to `RegexTokenizer(SPLIT_INPUT)`.
        sample_size (int, optional): The number of tokens to sample. Defaults to `100`.
        capitalized_threshold (float, optional): The threshold for removing capitalized tokens.
            Tokens with a frequency greater than this threshold will be removed. Defaults to `0.8`.
    """
    super().__init__(tokenizer)
    self._sample_size = sample_size
    self._capitalized_threshold = capitalized_threshold

sample_tokens(tokens)

Sample tokens from the given iterable of tokens.

Parameters:

    tokens (Iterable[str]): The iterable of tokens to sample from. Required.

Returns:

    List[str]: The sampled tokens.

Source code in simplemma/token_sampler.py
def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
    """
    Sample tokens from the given iterable of tokens.

    Args:
        tokens (Iterable[str]): The iterable of tokens to sample from.

    Returns:
        List[str]: The sampled tokens.

    """
    counter = Counter(tokens)

    if self._capitalized_threshold > 0:
        deletions = [token for token in counter if token[0].isupper()]
        if len(deletions) < self._capitalized_threshold * len(counter):
            for token in deletions:
                del counter[token]

    return [item[0] for item in counter.most_common(self._sample_size)]
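
To make the capitalized-token filter concrete, here is a small illustrative run: capitalized tokens are discarded only when they account for less than capitalized_threshold of the distinct tokens, which is why the first call below drops the proper noun while the second call (filtering disabled) keeps it.

from simplemma.token_sampler import MostCommonTokenSampler

tokens = ["Paris", "is", "a", "city", "and", "a", "capital"]

default_sampler = MostCommonTokenSampler()  # capitalized_threshold=0.8
unfiltered_sampler = MostCommonTokenSampler(capitalized_threshold=0)

print(default_sampler.sample_tokens(tokens))
# 'Paris' is removed: only 1 of 6 distinct tokens is capitalized, below the 0.8 threshold
print(unfiltered_sampler.sample_tokens(tokens))
# 'Paris' is kept: a threshold of 0 disables the filter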

RelaxedMostCommonTokenSampler

Bases: MostCommonTokenSampler

Relaxed version of the most common token sampler. This sampler uses a relaxed splitting regex pattern and allows for a larger sample size.
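
As an indicative comparison (the exact differences depend on the SPLIT_INPUT and RELAXED_SPLIT_INPUT patterns, which are not reproduced here), the relaxed sampler splits more permissively, keeps a larger sample, and does not filter capitalized tokens:

from simplemma.token_sampler import (
    MostCommonTokenSampler,
    RelaxedMostCommonTokenSampler,
)

text = "Email test@example.org or visit https://example.org (2nd draft)!"

strict = MostCommonTokenSampler()
relaxed = RelaxedMostCommonTokenSampler()

# The relaxed splitting pattern typically yields more (and messier) tokens,
# and capitalized tokens are never removed (capitalized_threshold=0).
print(strict.sample_text(text))
print(relaxed.sample_text(text))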

Source code in simplemma/token_sampler.py
class RelaxedMostCommonTokenSampler(MostCommonTokenSampler):
    """
    Relaxed version of the most common token sampler.
    This sampler uses a relaxed splitting regex pattern and allows for a larger sample size.
    """

    def __init__(
        self,
        tokenizer: Tokenizer = RegexTokenizer(RELAXED_SPLIT_INPUT),
        sample_size: int = 1000,
        capitalized_threshold: float = 0,
    ) -> None:
        """
        Initialize the RelaxedMostCommonTokenSampler.
        This is just a `MostCommonTokenSampler` with a more relaxed regex pattern.

        Args:
            tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
                Defaults to `RegexTokenizer(RELAXED_SPLIT_INPUT)`.
            sample_size (int, optional): The number of tokens to sample. Defaults to `1000`.
            capitalized_threshold (float, optional): The threshold for removing capitalized tokens.
                Tokens with a frequency greater than this threshold will be removed.
                Defaults to `0`.

        """

        super().__init__(tokenizer, sample_size, capitalized_threshold)

Functions

__init__(tokenizer=RegexTokenizer(RELAXED_SPLIT_INPUT), sample_size=1000, capitalized_threshold=0)

Initialize the RelaxedMostCommonTokenSampler. This is just a MostCommonTokenSampler with a more relaxed regex pattern.

Parameters:

    tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens. Defaults to RegexTokenizer(RELAXED_SPLIT_INPUT).
    sample_size (int, optional): The number of tokens to sample. Defaults to 1000.
    capitalized_threshold (float, optional): Threshold controlling the removal of capitalized tokens: if capitalized tokens make up less than this fraction of the distinct tokens, they are all removed from the sample. Defaults to 0 (no filtering).
Source code in simplemma/token_sampler.py
def __init__(
    self,
    tokenizer: Tokenizer = RegexTokenizer(RELAXED_SPLIT_INPUT),
    sample_size: int = 1000,
    capitalized_threshold: float = 0,
) -> None:
    """
    Initialize the RelaxedMostCommonTokenSampler.
    This is just a `MostCommonTokenSampler` with a more relaxed regex pattern.

    Args:
        tokenizer (Tokenizer, optional): The tokenizer to use for splitting text into tokens.
            Defaults to `RegexTokenizer(RELAXED_SPLIT_INPUT)`.
        sample_size (int, optional): The number of tokens to sample. Defaults to `1000`.
        capitalized_threshold (float, optional): The threshold for removing capitalized tokens.
            Tokens with a frequency greater than this threshold will be removed.
            Defaults to `0`.

    """

    super().__init__(tokenizer, sample_size, capitalized_threshold)

TokenSampler

Bases: Protocol

Abstract base class for token samplers.

Token samplers are used to sample tokens from text.
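
Because TokenSampler is a Protocol, any object exposing sample_text and sample_tokens can be used where a sampler is expected. The helper function below is a hypothetical illustration of using the protocol as a type hint, not part of simplemma:

from typing import List

from simplemma.token_sampler import (
    MostCommonTokenSampler,
    RelaxedMostCommonTokenSampler,
    TokenSampler,
)


def preview_sample(sampler: TokenSampler, text: str, limit: int = 5) -> List[str]:
    """Hypothetical helper: return the first `limit` sampled tokens."""
    return sampler.sample_text(text)[:limit]


text = "one fish two fish red fish blue fish"
print(preview_sample(MostCommonTokenSampler(), text))
print(preview_sample(RelaxedMostCommonTokenSampler(), text))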

Source code in simplemma/token_sampler.py
class TokenSampler(Protocol):
    """
    Abstract base class for token samplers.

    Token samplers are used to sample tokens from text.

    """

    @abstractmethod
    def sample_text(self, text: str) -> List[str]:
        """
        Sample tokens from the input text.

        Args:
            text (str): The input text to sample tokens from.

        Returns:
            List[str]: The sampled tokens.

        """
        raise NotImplementedError

    @abstractmethod
    def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
        """
        Sample tokens from the given iterable of tokens.

        Args:
            tokens (Iterable[str]): The iterable of tokens to sample from.

        Returns:
            List[str]: The sampled tokens.

        """
        raise NotImplementedError

Functions

sample_text(text) abstractmethod

Sample tokens from the input text.

Parameters:

    text (str): The input text to sample tokens from. Required.

Returns:

    List[str]: The sampled tokens.

Source code in simplemma/token_sampler.py
@abstractmethod
def sample_text(self, text: str) -> List[str]:
    """
    Sample tokens from the input text.

    Args:
        text (str): The input text to sample tokens from.

    Returns:
        List[str]: The sampled tokens.

    """
    raise NotImplementedError

sample_tokens(tokens) abstractmethod

Sample tokens from the given iterable of tokens.

Parameters:

    tokens (Iterable[str]): The iterable of tokens to sample from. Required.

Returns:

    List[str]: The sampled tokens.

Source code in simplemma/token_sampler.py
@abstractmethod
def sample_tokens(self, tokens: Iterable[str]) -> List[str]:
    """
    Sample tokens from the given iterable of tokens.

    Args:
        tokens (Iterable[str]): The iterable of tokens to sample from.

    Returns:
        List[str]: The sampled tokens.

    """
    raise NotImplementedError