Lemmatizer

Lemmatizer module. Provides classes for lemmatizing tokens and full texts.

Classes

Lemmatizer

Lemmatizer class for performing token lemmatization.

Source code in simplemma/lemmatizer.py
class Lemmatizer:
    """Lemmatizer class for performing token lemmatization."""

    __slots__ = [
        "_cached_lemmatize",
        "_fallback_lemmatization_strategy",
        "_lemmatization_strategy",
        "_tokenizer",
    ]

    def __init__(
        self,
        cache_max_size: int = 1048576,
        tokenizer: Tokenizer = RegexTokenizer(),
        lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
        fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(),
    ) -> None:
        """
        Initialize the Lemmatizer.

        Args:
            cache_max_size (int, optional): The maximum size of the cache for the lemmatization results.
                Defaults to `1048576`.
            tokenizer (Tokenizer, optional): The tokenizer to use for tokenization.
                Defaults to `RegexTokenizer()`.
            lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use.
                Defaults to `DefaultStrategy()`.
            fallback_lemmatization_strategy (LemmatizationFallbackStrategy, optional): The fallback lemmatization strategy to use.
                Defaults to `ToLowercaseFallbackStrategy()`.

        """
        self._tokenizer = tokenizer
        self._lemmatization_strategy = lemmatization_strategy
        self._fallback_lemmatization_strategy = fallback_lemmatization_strategy
        self._cached_lemmatize = lru_cache(maxsize=cache_max_size)(self._lemmatize)

    def is_known(
        self,
        token: str,
        lang: Union[str, Tuple[str, ...]],
    ) -> bool:
        """Check if a token is known in the specified language(s).

        Args:
            token: The token to check.
            lang: The language or languages to check in.

        Returns:
            bool: True if the token is known, False otherwise.
        """

        _control_input_type(token)
        lang = validate_lang_input(lang)

        dictionary_lookup = DictionaryLookupStrategy()
        return any(
            dictionary_lookup.get_lemma(token, lang_code) is not None
            for lang_code in lang
        )

    def lemmatize(
        self,
        token: str,
        lang: Union[str, Tuple[str, ...]],
    ) -> str:
        """Get the lemmatized form of a given word in the specified language(s).

        Args:
            token: The token to lemmatize.
            lang: The language or languages for lemmatization.

        Returns:
            str: The lemmatized form of the token.
        """
        return self._cached_lemmatize(token, lang)

    def _lemmatize(
        self,
        token: str,
        lang: Union[str, Tuple[str, ...]],
    ) -> str:
        """Internal method to lemmatize a token in the specified language(s).

        Args:
            token: The token to lemmatize.
            lang: The language or languages for lemmatization.

        Returns:
            str: The lemmatized form of the token.
        """
        _control_input_type(token)
        lang = validate_lang_input(lang)

        for lang_code in lang:
            candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
            if candidate is not None:
                return candidate

        return self._fallback_lemmatization_strategy.get_lemma(token, next(iter(lang)))

    def get_lemmas_in_text(
        self,
        text: str,
        lang: Union[str, Tuple[str, ...]],
    ) -> Iterator[str]:
        """Get an iterator over lemmatized tokens in a text.

        Args:
            text: The text to process.
            lang: The language or languages for lemmatization.

        Yields:
            str: The lemmatized tokens in the text.
        """
        initial = True
        for token in self._tokenizer.split_text(text):
            yield self.lemmatize(token.lower() if initial else token, lang)
            initial = token in PUNCTUATION
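
For illustration, a minimal usage sketch (assuming the class is importable from the package root; the tokens, language codes, and expected outputs are illustrative):

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()
lemmatizer.lemmatize("masks", lang="en")   # e.g. "mask"
lemmatizer.is_known("mask", lang="en")     # True if the token is in the dictionary
list(lemmatizer.get_lemmas_in_text("Masks are worn.", lang="en"))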

Functions

__init__(cache_max_size=1048576, tokenizer=RegexTokenizer(), lemmatization_strategy=DefaultStrategy(), fallback_lemmatization_strategy=ToLowercaseFallbackStrategy())

Initialize the Lemmatizer.

Parameters:

    cache_max_size (int): The maximum size of the cache for the lemmatization results. Defaults to 1048576.
    tokenizer (Tokenizer): The tokenizer to use for tokenization. Defaults to RegexTokenizer().
    lemmatization_strategy (LemmatizationStrategy): The lemmatization strategy to use. Defaults to DefaultStrategy().
    fallback_lemmatization_strategy (LemmatizationFallbackStrategy): The fallback lemmatization strategy to use. Defaults to ToLowercaseFallbackStrategy().
Source code in simplemma/lemmatizer.py
def __init__(
    self,
    cache_max_size: int = 1048576,
    tokenizer: Tokenizer = RegexTokenizer(),
    lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
    fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(),
) -> None:
    """
    Initialize the Lemmatizer.

    Args:
        cache_max_size (int, optional): The maximum size of the cache for the lemmatization results.
            Defaults to `1048576`.
        tokenizer (Tokenizer, optional): The tokenizer to use for tokenization.
            Defaults to `RegexTokenizer()`.
        lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use.
            Defaults to `DefaultStrategy()`.
        fallback_lemmatization_strategy (LemmatizationFallbackStrategy, optional): The fallback lemmatization strategy to use.
            Defaults to `ToLowercaseFallbackStrategy()`.

    """
    self._tokenizer = tokenizer
    self._lemmatization_strategy = lemmatization_strategy
    self._fallback_lemmatization_strategy = fallback_lemmatization_strategy
    self._cached_lemmatize = lru_cache(maxsize=cache_max_size)(self._lemmatize)
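
A sketch of construction with a non-default cache size; only parameters documented above are used, and the import path is an assumption based on the file shown (simplemma/lemmatizer.py):

from simplemma.lemmatizer import Lemmatizer

# A smaller cache bounds memory use at the cost of more recomputation.
lemmatizer = Lemmatizer(cache_max_size=2 ** 16)
lemmatizer.lemmatize("Stühle", lang="de")  # e.g. "Stuhl"
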
get_lemmas_in_text(text, lang)

Get an iterator over lemmatized tokens in a text.

Parameters:

    text (str): The text to process. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages for lemmatization. Required.

Yields:

    str: The lemmatized tokens in the text.

Source code in simplemma/lemmatizer.py
def get_lemmas_in_text(
    self,
    text: str,
    lang: Union[str, Tuple[str, ...]],
) -> Iterator[str]:
    """Get an iterator over lemmatized tokens in a text.

    Args:
        text: The text to process.
        lang: The language or languages for lemmatization.

    Yields:
        str: The lemmatized tokens in the text.
    """
    initial = True
    for token in self._tokenizer.split_text(text):
        yield self.lemmatize(token.lower() if initial else token, lang)
        initial = token in PUNCTUATION
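
A usage sketch: the method yields lazily and, as the code above shows, lowercases any token that follows sentence-ending punctuation (tracked via PUNCTUATION), so sentence-initial capitalization does not block dictionary hits. The sample text and language code are illustrative:

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()
for lemma in lemmatizer.get_lemmas_in_text("Masks are worn. They help.", lang="en"):
    print(lemma)  # one lemma per token, punctuation included
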
is_known(token, lang)

Check if a token is known in the specified language(s).

Parameters:

    token (str): The token to check. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages to check in. Required.

Returns:

    bool: True if the token is known, False otherwise.

Source code in simplemma/lemmatizer.py
def is_known(
    self,
    token: str,
    lang: Union[str, Tuple[str, ...]],
) -> bool:
    """Check if a token is known in the specified language(s).

    Args:
        token: The token to check.
        lang: The language or languages to check in.

    Returns:
        bool: True if the token is known, False otherwise.
    """

    _control_input_type(token)
    lang = validate_lang_input(lang)

    dictionary_lookup = DictionaryLookupStrategy()
    return any(
        dictionary_lookup.get_lemma(token, lang_code) is not None
        for lang_code in lang
    )
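
A sketch of multi-language lookup; as the implementation above shows, passing a tuple of language codes returns True as soon as any of the dictionaries contains the token:

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()
lemmatizer.is_known("table", lang=("de", "en"))  # True if either dictionary has an entry
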
lemmatize(token, lang)

Get the lemmatized form of a given word in the specified language(s).

Parameters:

    token (str): The token to lemmatize. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages for lemmatization. Required.

Returns:

    str: The lemmatized form of the token.

Source code in simplemma/lemmatizer.py
def lemmatize(
    self,
    token: str,
    lang: Union[str, Tuple[str, ...]],
) -> str:
    """Get the lemmatized form of a given word in the specified language(s).

    Args:
        token: The token to lemmatize.
        lang: The language or languages for lemmatization.

    Returns:
        str: The lemmatized form of the token.
    """
    return self._cached_lemmatize(token, lang)
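
A sketch highlighting the caching behavior set up in __init__: repeated calls with the same arguments are served by the lru_cache wrapper around _lemmatize:

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()
lemmatizer.lemmatize("working", lang="en")  # computed, e.g. "work"
lemmatizer.lemmatize("working", lang="en")  # same result, returned from the cache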

Functions

is_known(token, lang, greedy=False)

Check if a token is known in the specified language(s).

Parameters:

    token (str): The token to check. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages to check in. Required.
    greedy (bool): A flag indicating whether to use greedy lemmatization. Defaults to False.

Returns:

    bool: True if the token is known, False otherwise.

Source code in simplemma/lemmatizer.py
def is_known(
    token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> bool:
    """Check if a token is known in the specified language(s).

    Args:
        token: The token to check.
        lang: The language or languages to check in.
        greedy: A flag indicating whether to use greedy lemmatization (default: False).

    Returns:
        bool: True if the token is known, False otherwise.
    """
    lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
    return lemmatizer.is_known(token, lang)
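
A usage sketch for the module-level helper (assuming it is re-exported at the package root, which is conventional for simplemma):

import simplemma

simplemma.is_known("mask", lang="en")               # plain dictionary lookup
simplemma.is_known("mask", lang="en", greedy=True)  # routes through the greedy legacy lemmatizer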

lemma_iterator(text, lang, greedy=False)

Iterate over lemmatized tokens in a text.

Parameters:

    text (str): The text to iterate over. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages for lemmatization. Required.
    greedy (bool): A flag indicating whether to use greedy lemmatization. Defaults to False.

Yields:

    str: The lemmatized tokens in the text.

Source code in simplemma/lemmatizer.py
def lemma_iterator(
    text: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> Iterator[str]:
    """Iterate over lemmatized tokens in a text.

    Args:
        text: The text to iterate over.
        lang: The language or languages for lemmatization.
        greedy: A flag indicating whether to use greedy lemmatization (default: False).

    Yields:
        str: The lemmatized tokens in the text.
    """
    lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
    return lemmatizer.get_lemmas_in_text(text, lang)
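
A sketch of iterating lazily over a text; the language code and sample sentence are illustrative:

import simplemma

for lemma in simplemma.lemma_iterator("Masks are worn.", lang="en"):
    print(lemma)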

lemmatize(token, lang, greedy=False)

Lemmatize a token in the specified language(s).

Parameters:

    token (str): The token to lemmatize. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages for lemmatization. Required.
    greedy (bool): A flag indicating whether to use greedy lemmatization. Defaults to False.

Returns:

    str: The lemmatized form of the token.

Source code in simplemma/lemmatizer.py
def lemmatize(
    token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> str:
    """Lemmatize a token in the specified language(s).

    Args:
        token: The token to lemmatize.
        lang: The language or languages for lemmatization.
        greedy: A flag indicating whether to use greedy lemmatization (default: False).

    Returns:
        str: The lemmatized form of the token.
    """
    lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
    return lemmatizer.lemmatize(token, lang)
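
A sketch of the convenience function; treat the expected outputs as illustrative:

import simplemma

simplemma.lemmatize("masks", lang="en")                     # e.g. "mask"
simplemma.lemmatize("angekündigt", lang="de", greedy=True)  # greedy mode can resolve more forms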

text_lemmatizer(text, lang, greedy=False)

Lemmatize a text in the specified language(s).

Parameters:

    text (str): The text to lemmatize. Required.
    lang (Union[str, Tuple[str, ...]]): The language or languages for lemmatization. Required.
    greedy (bool): A flag indicating whether to use greedy lemmatization. Defaults to False.

Returns:

    List[str]: The list of lemmatized tokens.

Source code in simplemma/lemmatizer.py
def text_lemmatizer(
    text: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> List[str]:
    """Lemmatize a text in the specified language(s).

    Args:
        text: The text to lemmatize.
        lang: The language or languages for lemmatization.
        greedy: A flag indicating whether to use greedy lemmatization (default: False).

    Returns:
        List[str]: The list of lemmatized tokens.
    """

    return list(
        lemma_iterator(
            text,
            lang,
            greedy,
        )
    )
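
A closing usage sketch: as the code above shows, text_lemmatizer simply materializes lemma_iterator into a list, so it suits small to medium texts, while the iterator form is preferable for streaming:

import simplemma

simplemma.text_lemmatizer("Masks are worn.", lang="en")
# -> a list with one lemma per token, punctuation included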