Source code for pylelemmatize.fast_mapper

from collections import defaultdict
from typing import Dict, Optional, Tuple, Literal, Union
import numpy as np
from unidecode import unidecode
from .abstract_mapper import AbstractLemmatizer, GenericLemmatizer, fast_numpy_to_str, fast_str_to_numpy



[docs]
class LemmatizerBMP(GenericLemmatizer):

[docs]
    @staticmethod
    def alphabet_in_bmp(alphabet: Optional[str]) -> bool:
        """
        Tell whether every character of ``alphabet`` lies in the BMP.

        The BMP (Basic Multilingual Plane) is the range of Unicode code
        points 0..65535.

        Parameters
        ----------
        alphabet : Optional[str]
            The characters to check. ``None`` is accepted and treated as
            trivially valid.

        Returns
        -------
        bool
            True if no character has a code point above 65535 (or if
            ``alphabet`` is None), False otherwise.
        """
        if alphabet is None:
            return True
        code_points = fast_str_to_numpy(alphabet)
        # The NumPy comparison yields a numpy.bool_; coerce it so the method
        # really returns the plain ``bool`` promised by its annotation.
        return bool((code_points > 65535).sum() == 0)


    @staticmethod
    def __create_mappers(mapping_dict, unknown_chr) -> Tuple[str, str, np.ndarray, np.ndarray]:
        """
        Build the alphabets and lookup tables used for character transformation.

        Every source character gets a dense integer label: label 0 is reserved
        for ``unknown_chr`` and labels 1..N follow the sorted order of the
        source characters. Two 65536-entry ``uint16`` tables are filled in:
        one from code point to dense label, one from dense label to the code
        point of the destination character.

        Parameters
        ----------
        mapping_dict : dict
            Maps single source characters to single destination characters.
            All keys and values must be BMP characters. If ``unknown_chr`` is
            present as a key it is removed **in place**, so the caller's
            dictionary is modified.
        unknown_chr : str
            A single BMP character standing for unknown or unmapped characters.

        Returns
        -------
        Tuple[str, str, np.ndarray, np.ndarray]
            - src_alphabet_str : str
              All source characters (without ``unknown_chr``), sorted ascending.
            - dst_alphabet_str : str
              All unique destination characters, sorted ascending.
            - np_chrord2dense : np.ndarray
              Indexed by code point; holds the dense label, 0 for any
              character that is not a source character.
            - np_dense2chrord : np.ndarray
              Indexed by dense label; holds the destination code point.
              Entries for unused labels stay 0.

        Raises
        ------
        ValueError
            If any key or value of ``mapping_dict``, or ``unknown_chr``, lies
            outside the BMP.
        AssertionError
            If ``unknown_chr`` is a key of ``mapping_dict`` but does not map
            to itself.
        """
        involved_chars = [*mapping_dict.keys(), *mapping_dict.values(), unknown_chr]
        if any(ord(c) >= 65536 for c in involved_chars):
            raise ValueError("LemmatizerBMP can only handle BMP characters. Please use GenericLemmatizer for non-BMP characters.")

        if unknown_chr in mapping_dict:
            # Raised explicitly (rather than with ``assert``) so the check is
            # not stripped under ``python -O``; the exception type is kept.
            if mapping_dict[unknown_chr] != unknown_chr:
                raise AssertionError("unknown_chr must map to itself in the mapping_dict.")
            del mapping_dict[unknown_chr]  # Label 0 already represents the unknown character

        src_alphabet_str = ''.join(sorted(mapping_dict))
        dst_alphabet_str = ''.join(sorted(set(mapping_dict.values())))

        np_chrord2dense = np.zeros(65536, dtype=np.uint16)
        np_dense2chrord = np.zeros(65536, dtype=np.uint16)

        # Label 0: the unknown character maps to itself.
        np_chrord2dense[ord(unknown_chr)] = 0
        np_dense2chrord[0] = ord(unknown_chr)

        # Labels 1..N: source characters in sorted order.
        for label, src_chr in enumerate(src_alphabet_str, start=1):
            np_chrord2dense[ord(src_chr)] = label
            np_dense2chrord[label] = ord(mapping_dict[src_chr])
        return src_alphabet_str, dst_alphabet_str, np_chrord2dense, np_dense2chrord


[docs]
    def __init__(self, mapping_dict: Optional[Union[Dict[str, str], str]] = None, unknown_chr: str = "�", unicode_normalization: Literal["Dense", "Composite", None] = "Dense"):
        """
        Initialize the LemmatizerBMP instance.

        Parameters
        ----------
        mapping_dict : Optional[Union[Dict[str, str], str]], optional
            A dictionary mapping source characters to destination characters.
            If a string is provided, it is converted into a dictionary where
            each character maps to itself. ``None`` (the default) means an
            empty mapping.
        unknown_chr : str, optional
            The character to use for unknown mappings. Defaults to "�".
        unicode_normalization : Literal["Dense", "Composite", None], optional
            Forwarded unchanged to ``GenericLemmatizer.__init__``.
            Defaults to "Dense".

        Raises
        ------
        ValueError
            If the mapping or ``unknown_chr`` contains a non-BMP character.
        AssertionError
            If ``unknown_chr`` is a key of the mapping but does not map to itself.

        Notes
        -----
        The base class receives a copy of the mapping, so the caller's
        dictionary is not modified by this constructor. The lookup tables are
        then built from ``self.mapping_dict`` and ``self.unknown_chr``
        (presumably set by the base-class constructor).
        """
        # ``None`` replaces the former mutable ``{}`` default.
        if mapping_dict is None:
            mapping_dict = {}
        elif isinstance(mapping_dict, str):
            mapping_dict = {c: c for c in mapping_dict}
        super().__init__(unicode_normalization=unicode_normalization, unknown_chr=unknown_chr, mapping_dict=mapping_dict.copy())
        (
            self.__src_alphabet_str,
            self.__dst_alphabet_str,
            self.__np_chrord2dense,
            self.__np_dense2chrord,
        ) = self.__create_mappers(self.mapping_dict, self.unknown_chr)
        # NOTE(review): despite its name this is the largest *destination code
        # point* stored in the table, not the largest dense label - confirm intent.
        self.__max_label = self.__np_dense2chrord.max(0)



[docs]
    def __call__(self, text: str) -> str:
        """
        Apply the character mapping to ``text``.

        The text is first encoded as dense integer labels and then decoded
        back, which replaces every character by its destination character.

        Parameters
        ----------
        text : str
            The text to map.

        Returns
        -------
        str
            The mapped text.
        """
        return self.intlabel_seq_to_str(self.str_to_intlabel_seq(text))



[docs]
    def str_to_intlabel_seq(self, text: str) -> np.ndarray:
        """
        Convert a string to a sequence of integer labels.

        Each character is looked up by code point in the code-point-to-label
        table. Characters that are not in the source alphabet get label 0
        (the unknown label). This includes characters whose code point lies
        beyond the table (non-BMP), which would otherwise raise ``IndexError``.

        Parameters
        ----------
        text : str
            The input string to convert.

        Returns
        -------
        np.ndarray
            One dense integer label per element produced by
            ``fast_str_to_numpy(text)``.
        """
        lookup = self.__np_chrord2dense
        code_points = fast_str_to_numpy(text)

        # Fast path: everything fits in the table, a single gather suffices.
        if code_points.size == 0 or code_points.max() < lookup.size:
            return lookup[code_points]

        # Slow path: some code points have no table entry; leave them at
        # label 0 and only gather the ones that are in range.
        in_table = code_points < lookup.size
        dense_np_text = np.zeros(code_points.shape, dtype=lookup.dtype)
        dense_np_text[in_table] = lookup[code_points[in_table]]
        return dense_np_text



[docs]
    def intlabel_seq_to_str(self, dense_np_text: np.ndarray) -> str:
        """
        Decode a sequence of dense integer labels into a string.

        Parameters
        ----------
        dense_np_text : np.ndarray
            Integer labels to decode.

        Returns
        -------
        str
            The destination characters of the labels; labels whose table
            entry is 0 are rendered as the unknown character.
        """
        # Indexing with an integer array yields a fresh array, so the in-place
        # patch below does not touch the lookup table itself.
        code_points = self.__np_dense2chrord[dense_np_text]
        unmapped = code_points == 0
        code_points[unmapped] = ord(self.unknown_chr)
        return fast_numpy_to_str(code_points)



[docs]
    def get_unigram(self, text: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Count how often each dense label occurs in ``text``.

        The unknown character and the whole source alphabet are prepended to
        the text before counting, so every label shows up in the result; that
        one extra occurrence per label is subtracted again afterwards.

        Parameters
        ----------
        text : str
            The text to analyze.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray, np.ndarray]
            - values : np.ndarray
              The distinct integer labels, as returned by ``np.unique``.
            - counts : np.ndarray
              Number of occurrences of each label in ``text``.
            - labels : np.ndarray
              The character each label decodes to, as a ``'<U1'`` array.
        """
        seeded_text = self.unknown_chr + self.src_alphabet_str + text
        label_seq = self.str_to_intlabel_seq(seeded_text)
        values, seeded_counts = np.unique(label_seq, return_counts=True)
        decoded = self.intlabel_seq_to_str(values)
        label_chars = np.array(list(decoded), dtype='<U1')
        # Undo the single occurrence contributed by the prepended characters.
        return values, seeded_counts - 1, label_chars



[docs]
    def str_to_onehot(self, text: str, time_first: bool = True) -> np.ndarray:
        """
        Encode a string as a one-hot matrix of dense labels.

        Parameters
        ----------
        text : str
            The string to encode.
        time_first : bool, optional
            When True (default) the result has shape (T, C); when False it is
            transposed to (C, T). T is the number of labels produced for
            ``text`` and C is ``len(self)``.

        Returns
        -------
        np.ndarray
            A ``np.double`` array holding 1.0 at each (position, label) pair
            and 0.0 elsewhere.
        """
        labels = self.str_to_intlabel_seq(text)
        n_steps = len(labels)
        encoded = np.zeros((n_steps, len(self)), dtype=np.double)
        encoded[np.arange(n_steps), labels] = 1.0
        return encoded if time_first else encoded.T

    

[docs]
    def onehot_to_str(self, onehot: np.ndarray, time_first: bool = True) -> str:
        """
        Convert a one-hot encoded representation back to a string.

        Parameters
        ----------
        onehot : np.ndarray
            A one-hot (or score) array to decode. A 1-D array is interpreted
            as the class vector of a single time step.
        time_first : bool, optional
            If True, the input is expected to have shape (T, C). If False, it
            is expected to have shape (C, T). Defaults to True.

        Returns
        -------
        str
            The reconstructed string, one character per time step.
        """
        if onehot.ndim == 1:
            # Put the single class vector on the class axis of the chosen
            # layout: (1, C) when time comes first, (C, 1) otherwise.
            # Always using (1, C) made the time_first=False case take the
            # argmax over the time axis and return C labels instead of one.
            onehot = onehot.reshape(1, -1) if time_first else onehot.reshape(-1, 1)
        class_axis = 1 if time_first else 0
        dense_np_text = np.argmax(onehot, axis=class_axis)
        return self.intlabel_seq_to_str(dense_np_text)


    @property
    def dst_alphabet_str(self) -> str:
        """
        Destination alphabet of this lemmatizer.

        Returns
        -------
        str
            The destination alphabet string computed when the instance was
            constructed.
        """
        return self.__dst_alphabet_str

    @property
    def src_alphabet_str(self) -> str:
        """
        Source alphabet of this lemmatizer.

        Returns
        -------
        str
            The source alphabet string computed when the instance was
            constructed.
        """
        return self.__src_alphabet_str
    
    def __repr__(self):
        """
        Build a constructor-like textual form of this LemmatizerBMP.

        The text names the mapping dictionary and the unknown character, so it
        can serve as a string serialisation of those two settings.

        Returns
        -------
        str
            ``LemmatizerBMP(mapping_dict=..., unknown_chr=...)`` with both
            values rendered through ``repr``.
        """
        mapping_repr = repr(self.mapping_dict)
        unknown_repr = repr(self.unknown_chr)
        return "LemmatizerBMP(mapping_dict=" + mapping_repr + ", unknown_chr=" + unknown_repr + ")"