Skip to content

Trie Dictionary Factory

Classes

TrieDictionaryFactory

Bases: DictionaryFactory

Memory optimized DictionaryFactory backed by MARISA-tries.

This dictionary factory creates dictionaries that are backed by a MARISA-trie instead of a dict, which makes them consume very little memory compared to the DefaultDictionaryFactory. The trade-off is that lookup performance isn't as good as with dicts.

Source code in simplemma/strategies/dictionaries/trie_dictionary_factory.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class TrieDictionaryFactory(DictionaryFactory):
    """Memory optimized DictionaryFactory backed by MARISA-tries.

    This dictionary factory creates dictionaries that are backed by a
    MARISA-trie instead of a dict, which makes them consume very little
    memory compared to the DefaultDictionaryFactory. The trade-off is
    that lookup performance isn't as good as with dicts.
    """

    __slots__: list[str] = ["_cache_dir", "_use_disk_cache", "_get_dictionary"]

    def __init__(
        self,
        cache_max_size: int = 8,
        use_disk_cache: bool = True,
        disk_cache_dir: str | None = None,
    ) -> None:
        """Initialize the TrieDictionaryFactory.

        Args:
            cache_max_size (int): The maximum number of dictionaries to
                keep in memory. Defaults to `8`.
            use_disk_cache (bool): Whether to cache the tries on disk to
                speed up loading time. Defaults to `True`.
            disk_cache_dir (str | None): Path where the generated
                tries should be stored. Defaults to a Simplemma-specific
                subdirectory of the user's cache directory. An empty
                string is treated like `None`.
        """

        if disk_cache_dir:
            self._cache_dir = Path(disk_cache_dir)
        else:
            self._cache_dir = (
                Path(user_cache_dir("simplemma")) / "marisa_trie" / SIMPLEMMA_VERSION
            )
        self._use_disk_cache = use_disk_cache
        # The LRU cache wraps the bound loader, so every factory instance
        # gets its own in-memory cache sized by `cache_max_size`.
        self._get_dictionary = lru_cache(maxsize=cache_max_size)(
            self._get_dictionary_uncached
        )

    def _cache_file(self, lang: str) -> Path:
        """Return the on-disk location of the cached trie for `lang`."""
        return self._cache_dir / f"{lang}.dic"

    def _create_trie_from_pickled_dict(self, lang: str) -> BytesTrie:
        """Create a trie from a pickled dictionary.

        Args:
            lang (str): Language code of the dictionary to convert.

        Returns:
            BytesTrie: A trie mapping every key of the source dictionary
                to its encoded value.
        """
        # NOTE(review): cache_max_size=0 presumably keeps the default factory
        # from retaining the plain dict after conversion; confirm there.
        unpickled_dict = DefaultDictionaryFactory(cache_max_size=0).get_dictionary(lang)
        # Feed the (key, encoded value) pairs lazily so no second, fully
        # materialized list of all values has to live next to the dict.
        return BytesTrie(
            ((key, value.encode()) for key, value in unpickled_dict.items()),
            cache_size=HUGE_CACHE,
        )

    def _write_trie_to_disk(self, lang: str, trie: BytesTrie) -> None:
        """Persist the trie to disk for later usage.

        The persisted trie can be loaded by subsequent runs to speed up
        loading times.

        Args:
            lang (str): Language code used to name the cache file.
            trie (BytesTrie): The trie to store.
        """
        logger.debug("Caching trie on disk. This might take a second.")
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        # Pass a plain string, matching how the file is loaded again.
        trie.save(str(self._cache_file(lang)))

    def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]:
        """Get the dictionary for the given language.

        Loads the trie from the disk cache when enabled and present;
        otherwise builds it from the default dictionary and, if the disk
        cache is enabled, stores it for later runs.

        Args:
            lang (str): Language code of the requested dictionary.

        Returns:
            Mapping[str, str]: A read-only, dict-like view of the trie.

        Raises:
            ValueError: If `lang` is not in `SUPPORTED_LANGUAGES`.
        """
        if lang not in SUPPORTED_LANGUAGES:
            raise ValueError(f"Unsupported language: {lang}")

        cache_file = self._cache_file(lang)
        if self._use_disk_cache and cache_file.exists():
            trie = BytesTrie().load(str(cache_file))
        else:
            trie = self._create_trie_from_pickled_dict(lang)
            if self._use_disk_cache:
                self._write_trie_to_disk(lang, trie)

        return TrieWrapDict(trie)

    def get_dictionary(
        self,
        lang: str,
    ) -> Mapping[str, str]:
        """Retrieve a dictionary for the specified language.

        Results are served from the per-instance LRU cache when available.

        Args:
            lang (str): Language code of the requested dictionary.

        Returns:
            Mapping[str, str]: The dictionary for `lang`.

        Raises:
            ValueError: If `lang` is not a supported language.
        """
        return self._get_dictionary(lang)

Functions

__init__(cache_max_size=8, use_disk_cache=True, disk_cache_dir=None)

Initialize the TrieDictionaryFactory.

Parameters:

Name Type Description Default
cache_max_size int

The maximum number of dictionaries to keep in memory. Defaults to 8.

8
use_disk_cache bool

Whether to cache the tries on disk to speed up loading time. Defaults to True.

True
disk_cache_dir str | None

Path where the generated tries should be stored. Defaults to a Simplemma-specific subdirectory of the user's cache directory.

None
Source code in simplemma/strategies/dictionaries/trie_dictionary_factory.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def __init__(
    self,
    cache_max_size: int = 8,
    use_disk_cache: bool = True,
    disk_cache_dir: str | None = None,
) -> None:
    """Set up the disk cache location and the in-memory dictionary cache.

    Args:
        cache_max_size (int): Upper bound on the number of dictionaries
            held in memory at once. Defaults to `8`.
        use_disk_cache (bool): When true, generated tries are cached on
            disk to speed up loading time. Defaults to `True`.
        disk_cache_dir (str | None): Directory for the generated tries.
            When not given, a Simplemma-specific subdirectory of the
            user's cache directory is used.
    """
    # A falsy value (None or "") selects the versioned default location.
    self._cache_dir = (
        Path(disk_cache_dir)
        if disk_cache_dir
        else Path(user_cache_dir("simplemma")) / "marisa_trie" / SIMPLEMMA_VERSION
    )
    self._use_disk_cache = use_disk_cache

    # Memoize the uncached loader with a bounded LRU cache.
    memoize = lru_cache(maxsize=cache_max_size)
    self._get_dictionary = memoize(self._get_dictionary_uncached)
get_dictionary(lang)

Retrieves a dictionary for the specified language.

Source code in simplemma/strategies/dictionaries/trie_dictionary_factory.py
128
129
130
131
132
133
def get_dictionary(self, lang: str) -> Mapping[str, str]:
    """Return the dictionary for `lang` using the instance's cached loader."""
    cached_loader = self._get_dictionary
    return cached_loader(lang)

TrieWrapDict

Bases: MutableMapping[str, Any]

Wrapper around BytesTrie to make them behave like dicts.

Source code in simplemma/strategies/dictionaries/trie_dictionary_factory.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
class TrieWrapDict(MutableMapping[str, Any]):
    """Dict-like, read-only facade over a BytesTrie.

    Lookups return the first value stored for a key, decoded to `str`.
    Item assignment and deletion are not supported.
    """

    __slots__ = ("_trie",)

    def __init__(self, trie: BytesTrie) -> None:
        """Wrap `trie` without copying it."""
        self._trie = trie

    def __getitem__(self, item: str) -> Any:
        """Return the decoded first value stored under `item`."""
        stored_values = self._trie[item]
        first_value = stored_values[0]
        return first_value.decode()

    def __setitem__(self, key: Any, value: Any) -> None:
        """Unsupported; always raises NotImplementedError."""
        raise NotImplementedError

    def __delitem__(self, key: Any) -> None:
        """Unsupported; always raises NotImplementedError."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[str]:
        """Iterate over the keys of the underlying trie."""
        for key in self._trie.iterkeys():
            yield key

    def __len__(self) -> int:
        """Return the length reported by the underlying trie."""
        return len(self._trie)