This module defines the `DefaultStrategy` class, which is a concrete implementation of the `LemmatizationStrategy` protocol.
It provides lemmatization using a combination of different strategies such as dictionary lookup, hyphen removal, rule-based lemmatization, prefix decomposition, and affix decomposition.
Classes
DefaultStrategy
Bases: LemmatizationStrategy
This class represents a lemmatization strategy that combines different techniques to perform lemmatization.
It implements the `LemmatizationStrategy` protocol.
Source code in simplemma/strategies/default.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class DefaultStrategy(LemmatizationStrategy):
    """
    Lemmatization strategy that chains several techniques.

    It implements the `LemmatizationStrategy` protocol. A token is handed, in
    order, to dictionary lookup, hyphen removal, rule-based lemmatization,
    prefix decomposition and affix decomposition; the first truthy result is
    kept.
    """

    # `_greedy` is declared here because `__init__` assigns it. Without the
    # entry the assignment only succeeds when a base class happens to supply
    # an instance `__dict__`.
    __slots__ = [
        "_greedy",
        "_dictionary_lookup",
        "_hyphen_search",
        "_rules_search",
        "_prefix_search",
        "_greedy_dictionary_lookup",
        "_affix_search",
    ]

    def __init__(
        self,
        greedy: bool = False,
        dictionary_factory: DictionaryFactory = DefaultDictionaryFactory(),
    ):
        """
        Initialize the Default Strategy.

        Args:
            greedy (bool): Whether to use a greedy approach for dictionary lookup. Defaults to `False`.
            dictionary_factory (DictionaryFactory): A factory for creating dictionaries.
                Defaults to [`DefaultDictionaryFactory()`][simplemma.strategies.dictionaries.dictionary_factory.DefaultDictionaryFactory].
                NOTE(review): the default instance is created once, when the
                function is defined, and is therefore shared by every
                `DefaultStrategy` built without an explicit factory.
        """
        self._greedy = greedy
        self._dictionary_lookup = DictionaryLookupStrategy(dictionary_factory)
        self._hyphen_search = HyphenRemovalStrategy(self._dictionary_lookup)
        self._rules_search = RulesStrategy()
        self._prefix_search = PrefixDecompositionStrategy(
            dictionary_lookup=self._dictionary_lookup
        )
        # The greedy lookup is always built because the affix search needs it;
        # it is only kept on the instance when greedy mode is enabled.
        greedy_dictionary_lookup = GreedyDictionaryLookupStrategy(dictionary_factory)
        self._affix_search = AffixDecompositionStrategy(
            greedy, self._dictionary_lookup, greedy_dictionary_lookup
        )
        self._greedy_dictionary_lookup = greedy_dictionary_lookup if greedy else None

    def get_lemma(self, token: str, lang: str) -> Optional[str]:
        """
        Get the lemma for a given token and language using the combination of different lemmatization techniques.

        Args:
            token (str): The token to lemmatize.
            lang (str): The language of the token.

        Returns:
            Optional[str]: The lemma of the token, or None if no lemma is found.
        """
        # filters: purely numeric tokens are returned unchanged
        if token.isnumeric():
            return token

        # `or` short-circuits, so later strategies only run when every earlier
        # one produced a falsy result.
        candidate = (
            # supervised searches
            self._dictionary_lookup.get_lemma(token, lang)
            or self._hyphen_search.get_lemma(token, lang)
            or self._rules_search.get_lemma(token, lang)
            or self._prefix_search.get_lemma(token, lang)
            # weakly supervised / greedier searches
            or self._affix_search.get_lemma(token, lang)
        )

        # additional round: in greedy mode the candidate itself is looked up again
        if candidate is not None and self._greedy_dictionary_lookup is not None:
            candidate = self._greedy_dictionary_lookup.get_lemma(candidate, lang)

        return candidate
|
Functions
__init__(greedy=False, dictionary_factory=DefaultDictionaryFactory())
Initialize the Default Strategy.
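Parameters:
Name |
Type |
Description |
Default |
greedy |
bool
|
Whether to use a greedy approach for dictionary lookup. Defaults to False.
|
False
|
dictionary_factory |
DictionaryFactory
|
A factory for creating dictionaries. Defaults to DefaultDictionaryFactory().
|
DefaultDictionaryFactory()
|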
Source code in simplemma/strategies/default.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    greedy: bool = False,
    dictionary_factory: DictionaryFactory = DefaultDictionaryFactory(),
):
    """
    Build the sub-strategies that make up the Default Strategy.

    Args:
        greedy (bool): Whether to use a greedy approach for dictionary lookup. Defaults to `False`.
        dictionary_factory (DictionaryFactory): A factory for creating dictionaries.
            Defaults to [`DefaultDictionaryFactory()`][simplemma.strategies.dictionaries.dictionary_factory.DefaultDictionaryFactory].
    """
    self._greedy = greedy

    # One plain dictionary lookup is shared by the hyphen, prefix and affix
    # strategies.
    lookup = DictionaryLookupStrategy(dictionary_factory)
    self._dictionary_lookup = lookup
    self._hyphen_search = HyphenRemovalStrategy(lookup)
    self._rules_search = RulesStrategy()
    self._prefix_search = PrefixDecompositionStrategy(dictionary_lookup=lookup)

    # The greedy lookup is always constructed because the affix strategy
    # takes it as an argument.
    greedy_lookup = GreedyDictionaryLookupStrategy(dictionary_factory)
    self._affix_search = AffixDecompositionStrategy(greedy, lookup, greedy_lookup)

    # It is only retained for the extra lookup round when greedy mode is on.
    if greedy:
        self._greedy_dictionary_lookup = greedy_lookup
    else:
        self._greedy_dictionary_lookup = None
|
get_lemma(token, lang)
Get the lemma for a given token and language using the combination of different lemmatization techniques.
Parameters:
Name |
Type |
Description |
Default |
token |
str
|
The token to lemmatize.
|
required
|
lang |
str
|
The language of the token.
|
required
|
Returns:
Type |
Description |
Optional[str]
|
Optional[str]: The lemma of the token, or None if no lemma is found.
|
Source code in simplemma/strategies/default.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def get_lemma(self, token: str, lang: str) -> Optional[str]:
    """
    Lemmatize a token by trying each configured technique in turn.

    Args:
        token (str): The token to lemmatize.
        lang (str): The language of the token.

    Returns:
        Optional[str]: The lemma of the token, or None if no lemma is found.
    """
    # Filter: purely numeric tokens are returned unchanged.
    if token.isnumeric():
        return token

    # Supervised searches come first, the weakly supervised / greedier affix
    # search last. The first truthy answer is kept; if none is truthy, the
    # last strategy's result is kept.
    lemma = None
    for strategy in (
        self._dictionary_lookup,
        self._hyphen_search,
        self._rules_search,
        self._prefix_search,
        self._affix_search,
    ):
        lemma = strategy.get_lemma(token, lang)
        if lemma:
            break

    # Additional round: only when greedy lookup is configured and a
    # candidate was produced.
    greedy_lookup = self._greedy_dictionary_lookup
    if lemma is None or greedy_lookup is None:
        return lemma
    return greedy_lookup.get_lemma(lemma, lang)
|