Skip to content

Default Strategy

This module defines the DefaultStrategy class, which is a concrete implementation of the LemmatizationStrategy protocol. It provides lemmatization using a combination of different strategies such as dictionary lookup, hyphen removal, rule-based lemmatization, prefix decomposition, and affix decomposition.

Classes

DefaultStrategy

Bases: LemmatizationStrategy

This class represents a lemmatization strategy that combines different techniques to perform lemmatization. It implements the LemmatizationStrategy protocol.

Source code in simplemma/strategies/default.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class DefaultStrategy(LemmatizationStrategy):
    """
    Lemmatization strategy that chains several lookup techniques.

    The techniques are tried in a fixed order (dictionary lookup, hyphen
    removal, rules, prefix decomposition, affix decomposition) and the first
    one that yields a truthy result wins. It implements the
    `LemmatizationStrategy` protocol.
    """

    # Every attribute assigned in `__init__` is declared here, including
    # `_greedy`; otherwise the assignment only works when a base class
    # leaves a per-instance `__dict__` available.
    __slots__ = [
        "_greedy",
        "_dictionary_lookup",
        "_hyphen_search",
        "_rules_search",
        "_prefix_search",
        "_greedy_dictionary_lookup",
        "_affix_search",
    ]

    def __init__(
        self,
        greedy: bool = False,
        dictionary_factory: DictionaryFactory = DefaultDictionaryFactory(),
    ):
        """
        Initialize the Default Strategy.

        Args:
            greedy (bool): Whether to use a greedy approach for dictionary lookup. Defaults to `False`.
            dictionary_factory (DictionaryFactory): A factory for creating dictionaries.
                Defaults to [`DefaultDictionaryFactory()`][simplemma.strategies.dictionaries.dictionary_factory.DefaultDictionaryFactory].
                The default instance is created once, when the method is defined,
                so every strategy built without an explicit factory shares it.

        """
        self._greedy = greedy
        self._dictionary_lookup = DictionaryLookupStrategy(dictionary_factory)
        self._hyphen_search = HyphenRemovalStrategy(self._dictionary_lookup)
        self._rules_search = RulesStrategy()
        self._prefix_search = PrefixDecompositionStrategy(
            dictionary_lookup=self._dictionary_lookup
        )
        # The affix search always receives a greedy lookup, whatever `greedy` is.
        greedy_dictionary_lookup = GreedyDictionaryLookupStrategy(dictionary_factory)
        self._affix_search = AffixDecompositionStrategy(
            greedy, self._dictionary_lookup, greedy_dictionary_lookup
        )

        # Kept on the instance only in greedy mode; `None` disables the
        # additional round in `get_lemma`.
        self._greedy_dictionary_lookup = greedy_dictionary_lookup if greedy else None

    def get_lemma(self, token: str, lang: str) -> Optional[str]:
        """
        Get the lemma for a given token and language using the combination of different lemmatization techniques.

        Args:
            token (str): The token to lemmatize.
            lang (str): The language of the token.

        Returns:
            Optional[str]: The lemma of the token, or None if no lemma is found.
                Purely numeric tokens are returned unchanged.

        """
        # filters
        if token.isnumeric():
            return token

        # `or` short-circuits: later strategies run only while the earlier
        # ones returned a falsy result.
        candidate = (
            # supervised searches
            self._dictionary_lookup.get_lemma(token, lang)
            or self._hyphen_search.get_lemma(token, lang)
            or self._rules_search.get_lemma(token, lang)
            or self._prefix_search.get_lemma(token, lang)
            # weakly supervised / greedier searches
            or self._affix_search.get_lemma(token, lang)
        )

        # additional round: in greedy mode the candidate itself is looked up
        # again and replaced by whatever the greedy lookup returns
        if candidate is not None and self._greedy_dictionary_lookup is not None:
            candidate = self._greedy_dictionary_lookup.get_lemma(candidate, lang)

        return candidate

Functions

__init__(greedy=False, dictionary_factory=DefaultDictionaryFactory())

Initialize the Default Strategy.

Parameters:

Name Type Description Default
greedy bool

Whether to use a greedy approach for dictionary lookup. Defaults to False.

False
dictionary_factory DictionaryFactory

A factory for creating dictionaries. Defaults to DefaultDictionaryFactory().

DefaultDictionaryFactory()
Source code in simplemma/strategies/default.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    greedy: bool = False,
    dictionary_factory: DictionaryFactory = DefaultDictionaryFactory(),
):
    """
    Build the individual strategies that the default strategy chains together.

    Args:
        greedy (bool): Whether to use a greedy approach for dictionary lookup. Defaults to `False`.
        dictionary_factory (DictionaryFactory): A factory for creating dictionaries.
            Defaults to [`DefaultDictionaryFactory()`][simplemma.strategies.dictionaries.dictionary_factory.DefaultDictionaryFactory].

    """
    self._greedy = greedy

    # Plain dictionary lookup, shared by the decomposition-based searches below.
    plain_lookup = DictionaryLookupStrategy(dictionary_factory)
    self._dictionary_lookup = plain_lookup

    # The greedy lookup is always built because the affix search needs it.
    greedy_lookup = GreedyDictionaryLookupStrategy(dictionary_factory)

    self._hyphen_search = HyphenRemovalStrategy(plain_lookup)
    self._rules_search = RulesStrategy()
    self._prefix_search = PrefixDecompositionStrategy(dictionary_lookup=plain_lookup)
    self._affix_search = AffixDecompositionStrategy(greedy, plain_lookup, greedy_lookup)

    # Only exposed on the instance in greedy mode; otherwise left as None.
    if greedy:
        self._greedy_dictionary_lookup = greedy_lookup
    else:
        self._greedy_dictionary_lookup = None
get_lemma(token, lang)

Get the lemma for a given token and language using the combination of different lemmatization techniques.

Parameters:

Name Type Description Default
token str

The token to lemmatize.

required
lang str

The language of the token.

required

Returns:

Type Description
Optional[str]

Optional[str]: The lemma of the token, or None if no lemma is found.

Source code in simplemma/strategies/default.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def get_lemma(self, token: str, lang: str) -> Optional[str]:
    """
    Lemmatize a token by trying each configured technique in turn.

    Args:
        token (str): The token to lemmatize.
        lang (str): The language of the token.

    Returns:
        Optional[str]: The lemma of the token, or None if no lemma is found.

    """
    # Numeric tokens are passed through untouched.
    if token.isnumeric():
        return token

    # Ordered from supervised searches to the weakly supervised / greedier one.
    searches = (
        self._dictionary_lookup,
        self._hyphen_search,
        self._rules_search,
        self._prefix_search,
        self._affix_search,
    )

    lemma = None
    for search in searches:
        lemma = search.get_lemma(token, lang)
        if lemma:
            # First truthy answer wins; remaining searches are not consulted.
            break

    # Nothing found: there is nothing for the extra round to refine.
    if lemma is None:
        return lemma

    # Extra round, only available when a greedy lookup has been configured.
    refiner = self._greedy_dictionary_lookup
    if refiner is None:
        return lemma
    return refiner.get_lemma(lemma, lang)