Language Detector

Language detector module. Provides classes and functions for text language detection using lemmatization and token sampling.

Classes

LanguageDetector

A class that performs language detection using lemmatization and token sampling.

Source code in simplemma/language_detector.py
class LanguageDetector:
    """A class that performs language detection using lemmatization and token sampling."""

    __slots__ = [
        "_lang",
        "_lemmatization_strategy",
        "_orig_token_sampler",
        "_token_sampler",
    ]

    def __init__(
        self,
        lang: Union[str, Tuple[str, ...]],
        token_sampler: TokenSampler = MostCommonTokenSampler(),
        lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
    ) -> None:
        """
        Initialize the LanguageDetector.

        Args:
            lang (Union[str, Tuple[str, ...]]): The target language or languages to detect.
            token_sampler (TokenSampler, optional): The token sampling strategy to use.
                Defaults to `MostCommonTokenSampler()`.
            lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization
            strategy to use. Defaults to `DefaultStrategy()`.
        """

        self._lang = validate_lang_input(lang)
        self._token_sampler = token_sampler
        self._orig_token_sampler = token_sampler
        self._lemmatization_strategy = lemmatization_strategy

    def _restore_token_sampler(self) -> None:
        self._token_sampler = self._orig_token_sampler

    def proportion_in_each_language(
        self,
        text: str,
    ) -> Dict[str, float]:
        """
        Calculate the proportion of each language in the given text.

        Args:
            text (str): The input text to analyze.

        Returns:
            Dict[str, float]: A dictionary containing the detected languages and
                their respective proportions.
        """
        tokens = self._token_sampler.sample_text(text)

        total_tokens = len(tokens)
        if total_tokens == 0:
            return {"unk": 1}

        known_tokens_count = dict.fromkeys(self._lang, 0)
        unknown_tokens_count = 0
        for token in tokens:
            token_found = False
            for lang_code in self._lang:
                candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
                if candidate is not None:
                    known_tokens_count[lang_code] += 1
                    token_found = True
            if not token_found:
                unknown_tokens_count += 1

        results: Dict[str, float] = dict(
            (lang_code, token_count / total_tokens)
            for (lang_code, token_count) in known_tokens_count.items()
        )
        results["unk"] = unknown_tokens_count / total_tokens
        return results

    def proportion_in_target_languages(
        self,
        text: str,
    ) -> float:
        """
        Calculate the proportion of text in the target language.

        Args:
            text (str): The input text to analyze.

        Returns:
            float: The proportion of text in the target language(s).
        """
        tokens = self._token_sampler.sample_text(text)
        if len(tokens) == 0:
            return 0

        in_target = 0
        for token in tokens:
            for lang_code in self._lang:
                candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
                if candidate is not None:
                    in_target += 1
                    break
        return in_target / len(tokens)

    def main_language(
        self,
        text: str,
        additional_token_samplers: List[TokenSampler] = [
            RelaxedMostCommonTokenSampler()
        ],
    ) -> str:
        """
        Determine the main language of the given text.

        Args:
            text (str): The input text to analyze.
            additional_token_samplers (List[TokenSampler], optional): Additional token
                sampling strategies to use. Defaults to `[RelaxedMostCommonTokenSampler()]`.

        Returns:
            str: The main language of the text.
        """
        token_samplers = [self._token_sampler] + additional_token_samplers

        for token_sampler in token_samplers:
            self._token_sampler = token_sampler
            list_results = _as_list(self.proportion_in_each_language(text))
            if len(list_results) > 1 and list_results[0][1] != list_results[1][1]:
                self._restore_token_sampler()
                return list_results[0][0]

        self._restore_token_sampler()
        return "unk"

Functions

__init__(lang, token_sampler=MostCommonTokenSampler(), lemmatization_strategy=DefaultStrategy())

Initialize the LanguageDetector.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lang` | `Union[str, Tuple[str, ...]]` | The target language or languages to detect. | *required* |
| `token_sampler` | `TokenSampler` | The token sampling strategy to use. | `MostCommonTokenSampler()` |
| `lemmatization_strategy` | `LemmatizationStrategy` | The lemmatization strategy to use. | `DefaultStrategy()` |

Source code in simplemma/language_detector.py
def __init__(
    self,
    lang: Union[str, Tuple[str, ...]],
    token_sampler: TokenSampler = MostCommonTokenSampler(),
    lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
) -> None:
    """
    Initialize the LanguageDetector.

    Args:
        lang (Union[str, Tuple[str, ...]]): The target language or languages to detect.
        token_sampler (TokenSampler, optional): The token sampling strategy to use.
            Defaults to `MostCommonTokenSampler()`.
        lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization
            strategy to use. Defaults to `DefaultStrategy()`.
    """

    self._lang = validate_lang_input(lang)
    self._token_sampler = token_sampler
    self._orig_token_sampler = token_sampler
    self._lemmatization_strategy = lemmatization_strategy
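
A sketch of passing non-default components to the constructor; the import paths for the sampler and strategy classes are assumptions based on the package layout and may need adjusting for your installed version:

```python
from simplemma.language_detector import LanguageDetector
from simplemma.strategies import DefaultStrategy
from simplemma.token_sampler import MostCommonTokenSampler

detector = LanguageDetector(
    lang=("en", "es"),
    token_sampler=MostCommonTokenSampler(),    # sample the most common tokens from the text
    lemmatization_strategy=DefaultStrategy(),  # default lemma lookup
)
```
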
main_language(text, additional_token_samplers=[RelaxedMostCommonTokenSampler()])

Determine the main language of the given text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text to analyze. | *required* |
| `additional_token_samplers` | `List[TokenSampler]` | Additional token sampling strategies to use. | `[RelaxedMostCommonTokenSampler()]` |

Returns:

| Type | Description |
| --- | --- |
| `str` | The main language of the text. |

Source code in simplemma/language_detector.py
def main_language(
    self,
    text: str,
    additional_token_samplers: List[TokenSampler] = [
        RelaxedMostCommonTokenSampler()
    ],
) -> str:
    """
    Determine the main language of the given text.

    Args:
        text (str): The input text to analyze.
        additional_token_samplers (List[TokenSampler], optional): Additional token
            sampling strategies to use. Defaults to `[RelaxedMostCommonTokenSampler()]`.

    Returns:
        str: The main language of the text.
    """
    token_samplers = [self._token_sampler] + additional_token_samplers

    for token_sampler in token_samplers:
        self._token_sampler = token_sampler
        list_results = _as_list(self.proportion_in_each_language(text))
        if len(list_results) > 1 and list_results[0][1] != list_results[1][1]:
            self._restore_token_sampler()
            return list_results[0][0]

    self._restore_token_sampler()
    return "unk"
proportion_in_each_language(text)

Calculate the proportion of each language in the given text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text to analyze. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Dict[str, float]` | A dictionary containing the detected languages and their respective proportions. |

Source code in simplemma/language_detector.py
def proportion_in_each_language(
    self,
    text: str,
) -> Dict[str, float]:
    """
    Calculate the proportion of each language in the given text.

    Args:
        text (str): The input text to analyze.

    Returns:
        Dict[str, float]: A dictionary containing the detected languages and
            their respective proportions.
    """
    tokens = self._token_sampler.sample_text(text)

    total_tokens = len(tokens)
    if total_tokens == 0:
        return {"unk": 1}

    known_tokens_count = dict.fromkeys(self._lang, 0)
    unknown_tokens_count = 0
    for token in tokens:
        token_found = False
        for lang_code in self._lang:
            candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
            if candidate is not None:
                known_tokens_count[lang_code] += 1
                token_found = True
        if not token_found:
            unknown_tokens_count += 1

    results: Dict[str, float] = dict(
        (lang_code, token_count / total_tokens)
        for (lang_code, token_count) in known_tokens_count.items()
    )
    results["unk"] = unknown_tokens_count / total_tokens
    return results
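
For illustration, a sketch whose output values are invented; actual proportions depend on the dictionaries and the sampler:

```python
from simplemma.language_detector import LanguageDetector

detector = LanguageDetector(lang=("de", "en"))
proportions = detector.proportion_in_each_language("Hello world, this is an English sentence.")
# e.g. {"de": 0.0, "en": 0.9, "unk": 0.1} -- illustrative values only. Note that the
# proportions need not sum to 1.0: a token found in several dictionaries is counted
# once for each matching language.
```
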
proportion_in_target_languages(text)

Calculate the proportion of text in the target language.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text to analyze. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `float` | The proportion of text in the target language(s). |

Source code in simplemma/language_detector.py
def proportion_in_target_languages(
    self,
    text: str,
) -> float:
    """
    Calculate the proportion of text in the target language.

    Args:
        text (str): The input text to analyze.

    Returns:
        float: The proportion of text in the target language(s).
    """
    tokens = self._token_sampler.sample_text(text)
    if len(tokens) == 0:
        return 0

    in_target = 0
    for token in tokens:
        for lang_code in self._lang:
            candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
            if candidate is not None:
                in_target += 1
                break
    return in_target / len(tokens)
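
A brief sketch; the sample text is illustrative:

```python
from simplemma.language_detector import LanguageDetector

detector = LanguageDetector(lang="en")
share = detector.proportion_in_target_languages("This text is mostly English avec quelques mots français.")
# share is a float in [0.0, 1.0]: the fraction of sampled tokens that lemmatize
# in at least one of the target languages.
```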

Functions

in_target_language(text, lang, greedy=False, token_sampler=MostCommonTokenSampler())

Calculate the proportion of text in the target language(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text to analyze. | *required* |
| `lang` | `Union[str, Tuple[str, ...]]` | The target language(s) to compare against. | *required* |
| `greedy` | `bool` | Whether to use greedy lemmatization. | `False` |
| `token_sampler` | `TokenSampler` | The token sampling strategy to use. | `MostCommonTokenSampler()` |

Returns:

| Type | Description |
| --- | --- |
| `float` | The proportion of text in the target language(s). |

Source code in simplemma/language_detector.py
def in_target_language(
    text: str,
    lang: Union[str, Tuple[str, ...]],
    greedy: bool = False,
    token_sampler: TokenSampler = MostCommonTokenSampler(),
) -> float:
    """
    Calculate the proportion of text in the target language(s).

    Args:
        text (str): The input text to analyze.
        lang (Union[str, Tuple[str, ...]]): The target language(s) to compare against.
        greedy (bool, optional): Whether to use greedy lemmatization. Defaults to `False`.
        token_sampler (TokenSampler, optional): The token sampling strategy to use.
            Defaults to `MostCommonTokenSampler()`.

    Returns:
        float: The proportion of text in the target language(s).
    """

    return LanguageDetector(
        lang, token_sampler, DefaultStrategy(greedy)
    ).proportion_in_target_languages(text)
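
A minimal sketch of the convenience function, imported from the module documented on this page; the text is illustrative:

```python
from simplemma.language_detector import in_target_language

score = in_target_language("Die Katze sitzt auf dem Tisch.", lang="de")
# score is a float in [0.0, 1.0]; pass greedy=True for greedier lemma lookup.
```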

langdetect(text, lang, greedy=False, token_samplers=[MostCommonTokenSampler(), RelaxedMostCommonTokenSampler()])

Detect the language(s) of the given text and their proportions.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text to analyze. | *required* |
| `lang` | `Union[str, Tuple[str, ...]]` | The target language(s) to compare against. | *required* |
| `greedy` | `bool` | Whether to use greedy lemmatization. | `False` |
| `token_samplers` | `List[TokenSampler]` | The list of token sampling strategies to use. | `[MostCommonTokenSampler(), RelaxedMostCommonTokenSampler()]` |

Returns:

| Type | Description |
| --- | --- |
| `List[Tuple[str, float]]` | A list of tuples containing the detected language(s) and their respective proportions. |

Source code in simplemma/language_detector.py
def langdetect(
    text: str,
    lang: Union[str, Tuple[str, ...]],
    greedy: bool = False,
    token_samplers: List[TokenSampler] = [
        MostCommonTokenSampler(),
        RelaxedMostCommonTokenSampler(),
    ],
) -> List[Tuple[str, float]]:
    """
    Detect the language(s) of the given text and their proportions.

    Args:
        text (str): The input text to analyze.
        lang (Union[str, Tuple[str, ...]]): The target language(s) to compare against.
        greedy (bool, optional): Whether to use greedy lemmatization. Defaults to `False`.
        token_samplers (List[TokenSampler], optional): The list of token sampling strategies
            to use. Defaults to `[MostCommonTokenSampler(), RelaxedMostCommonTokenSampler()]`.

    Returns:
        List[Tuple[str, float]]: A list of tuples containing the detected language(s)
            and their respective proportions.
    """

    for token_sampler in token_samplers:
        results = LanguageDetector(
            lang, token_sampler, DefaultStrategy(greedy)
        ).proportion_in_each_language(text)

        # post-processing
        list_results = _as_list(results)
        if len(list_results) == 1 or list_results[0][1] != list_results[1][1]:
            return list_results
    return list_results
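
A hedged usage sketch; the values shown in the comment are illustrative, not actual output:

```python
from simplemma.language_detector import langdetect

results = langdetect("Una frase corta en español.", lang=("es", "en", "it"))
# e.g. [("es", 0.83), ("en", 0.0), ("it", 0.0), ("unk", 0.17)] -- a list of
# (language, proportion) pairs including the "unk" share.
```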