Source code for pylelemmatize.fast_mapper

from collections import defaultdict
from typing import Dict, Optional, Tuple, Literal, Union
import numpy as np
from unidecode import unidecode
from .abstract_mapper import AbstractLemmatizer, GenericLemmatizer, fast_numpy_to_str, fast_str_to_numpy


class LemmatizerBMP(GenericLemmatizer):
    """
    Character-to-character mapper backed by fixed-size NumPy lookup tables.

    Two 65536-entry ``uint16`` tables are built at construction time: one
    maps a Unicode code point to a dense integer label, the other maps a
    dense label to the code point of the destination character. Label 0
    is reserved for the unknown character. Because the tables have 65536
    entries, every character of the mapping must lie in the BMP
    (Basic Multilingual Plane, code points below 65536).
    """

    @staticmethod
    def alphabet_in_bmp(alphabet: Optional[str]) -> bool:
        """
        Check if all characters in the given alphabet are within the BMP.

        Parameters
        ----------
        alphabet : Optional[str]
            A string containing the alphabet to check. If None, the method
            returns True.

        Returns
        -------
        bool
            True if all characters are within the BMP, False otherwise.
        """
        if alphabet is None:
            return True
        # bool() so that callers get the annotated builtin type rather than
        # the NumPy boolean scalar produced by the comparison.
        return bool((fast_str_to_numpy(alphabet) > 65535).sum() == 0)

    @staticmethod
    def __create_mappers(mapping_dict, unknown_chr) -> Tuple[str, str, np.ndarray, np.ndarray]:
        """
        Create the alphabets and lookup tables for a character mapping.

        Parameters
        ----------
        mapping_dict : dict
            A dictionary where keys are source characters and values are
            destination characters. Both keys and values must be BMP
            characters. NOTE: if `unknown_chr` is one of the keys, that
            entry is deleted from `mapping_dict` in place.
        unknown_chr : str
            A single BMP character used to represent unknown or unmapped
            characters.

        Returns
        -------
        Tuple[str, str, np.ndarray, np.ndarray]
            - src_alphabet_str : str
                All source characters (without `unknown_chr`) in sorted order.
            - dst_alphabet_str : str
                All unique destination characters in sorted order.
            - np_chrord2dense : np.ndarray
                ``uint16`` array of length 65536 mapping a code point to its
                dense label; code points that are not source characters
                map to 0.
            - np_dense2chrord : np.ndarray
                ``uint16`` array of length 65536 mapping a dense label to the
                code point of its destination character; label 0 holds the
                code point of `unknown_chr`, unused labels hold 0.

        Raises
        ------
        ValueError
            If any key or value of `mapping_dict`, or `unknown_chr`, is not
            a BMP character.
        AssertionError
            If `unknown_chr` is a key of `mapping_dict` but does not map to
            itself.
        """
        if (
            any(ord(c) >= 65536 for c in mapping_dict.keys())
            or any(ord(c) >= 65536 for c in mapping_dict.values())
            or ord(unknown_chr) >= 65536
        ):
            raise ValueError(
                "LemmatizerBMP can only handle BMP characters. "
                "Please use GenericLemmatizer for non-BMP characters."
            )
        if unknown_chr in mapping_dict:
            # Raised explicitly (instead of using an `assert` statement) so the
            # check is still performed when Python runs with -O; the exception
            # type is the one the assert statement used to raise.
            if mapping_dict[unknown_chr] != unknown_chr:
                raise AssertionError("unknown_chr must map to itself in the mapping_dict.")
            # Remove the unknown character from the mapping to avoid confusion:
            # it always owns label 0 and must not receive a second label.
            del mapping_dict[unknown_chr]

        src_alphabet_str = ''.join(sorted(mapping_dict.keys()))
        dst_alphabet_str = ''.join(sorted(set(mapping_dict.values())))

        np_chrord2dense = np.zeros(65536, dtype=np.uint16)
        np_dense2chrord = np.zeros(65536, dtype=np.uint16)
        # Label 0 is the unknown label. np_chrord2dense is zero-initialised, so
        # every code point that is not a source character already maps to it.
        np_dense2chrord[0] = ord(unknown_chr)
        # Source characters receive labels 1..len(src_alphabet_str) in sorted order.
        for label, src_chr in enumerate(src_alphabet_str, start=1):
            np_chrord2dense[ord(src_chr)] = label
            np_dense2chrord[label] = ord(mapping_dict[src_chr])
        return src_alphabet_str, dst_alphabet_str, np_chrord2dense, np_dense2chrord

    def __init__(self, mapping_dict: Union[Dict[str, str], str] = {}, unknown_chr: str = "�",
                 unicode_normalization: Literal["Dense", "Composite", None] = "Dense"):
        """
        Initialize the LemmatizerBMP instance.

        Parameters
        ----------
        mapping_dict : Union[Dict[str, str], str], optional
            A dictionary mapping source characters to destination characters.
            If a string is provided, it is converted into a dictionary where
            each character maps to itself. Defaults to an empty dictionary.
        unknown_chr : str, optional
            The character to use for unknown mappings. Defaults to "�".
        unicode_normalization : Literal["Dense", "Composite", None], optional
            Forwarded unchanged to ``GenericLemmatizer.__init__``.
            Defaults to "Dense".

        Raises
        ------
        ValueError
            If the mapping or `unknown_chr` contains a non-BMP character.
        AssertionError
            If `unknown_chr` is a key of the mapping but does not map to
            itself.
        """
        if isinstance(mapping_dict, str):
            mapping_dict = {c: c for c in mapping_dict}
        # A copy is handed to the parent class; this method itself only reads
        # the argument, so the shared `{}` default object is not modified here.
        super().__init__(unicode_normalization=unicode_normalization, unknown_chr=unknown_chr,
                         mapping_dict=mapping_dict.copy())
        (
            self.__src_alphabet_str,
            self.__dst_alphabet_str,
            self.__np_chrord2dense,
            self.__np_dense2chrord,
        ) = self.__create_mappers(self.mapping_dict, self.unknown_chr)
        # NOTE(review): this is the largest value stored in the label -> code
        # point table, i.e. a code point, not the largest label index. Nothing
        # in this class reads it; confirm the intended meaning before using it.
        self.__max_label = self.__np_dense2chrord.max(0)

    def __call__(self, text: str) -> str:
        """
        Transform the input text using the lemmatizer.

        Parameters
        ----------
        text : str
            The input text to transform.

        Returns
        -------
        str
            The transformed text.
        """
        label_seq = self.str_to_intlabel_seq(text)
        return self.intlabel_seq_to_str(label_seq)

    def str_to_intlabel_seq(self, text: str) -> np.ndarray:
        """
        Convert a string to a sequence of integer labels.

        Characters that are not source characters of the mapping, including
        characters outside the BMP, receive label 0 (the unknown label).

        Parameters
        ----------
        text : str
            The input string to convert.

        Returns
        -------
        np.ndarray
            A NumPy array of integer labels representing the input string.
        """
        sparse_np_text = fast_str_to_numpy(text)
        out_of_bmp = sparse_np_text > 65535
        if not out_of_bmp.any():
            return self.__np_chrord2dense[sparse_np_text]
        # The lookup table has only 65536 entries, so a non-BMP code point
        # would index past its end. Look those positions up at a safe index
        # and then force them to the unknown label. Integer-array indexing
        # returns a new array, so the table itself is not modified.
        dense_np_text = self.__np_chrord2dense[np.where(out_of_bmp, 0, sparse_np_text)]
        dense_np_text[out_of_bmp] = 0
        return dense_np_text

    def intlabel_seq_to_str(self, dense_np_text: np.ndarray) -> str:
        """
        Convert a sequence of integer labels back to a string.

        Parameters
        ----------
        dense_np_text : np.ndarray
            A NumPy array of integer labels to convert.

        Returns
        -------
        str
            The reconstructed string.
        """
        output_sparse_text = self.__np_dense2chrord[dense_np_text]
        # Labels that were never assigned still hold the table's initial 0;
        # replace those with the unknown character ordinal.
        output_sparse_text[output_sparse_text == 0] = ord(self.unknown_chr)
        return fast_numpy_to_str(output_sparse_text)

    def get_unigram(self, text: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Compute unigram statistics for the input text.

        Parameters
        ----------
        text : str
            The input text to analyze.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray, np.ndarray]
            - values : np.ndarray
                Unique integer labels, covering the unknown label and every
                source character even when absent from `text`.
            - counts : np.ndarray
                Number of occurrences of each label in `text`.
            - labels : np.ndarray
                The destination character of each label, as an array of
                one-character strings.
        """
        # Prepend one occurrence of every label so that np.unique reports all
        # of them, including those that do not occur in `text`.
        np_text = self.str_to_intlabel_seq(self.unknown_chr + self.src_alphabet_str + text)
        values, counts = np.unique(np_text, return_counts=True)
        counts = counts - 1  # removing the counts of the added characters
        labels = self.intlabel_seq_to_str(values)
        labels = np.array(list(labels), dtype='<U1')
        return values, counts, labels

    def str_to_onehot(self, text: str, time_first: bool = True) -> np.ndarray:
        """
        Convert a string to a one-hot encoded representation.

        Parameters
        ----------
        text : str
            The input string to convert.
        time_first : bool, optional
            If True, the output array has shape (T, C), where T is the length
            of the label sequence and C is ``len(self)``. If False, the output
            has shape (C, T). Defaults to True.

        Returns
        -------
        np.ndarray
            A one-hot encoded NumPy array (dtype double) representing the
            input string.
        """
        seq = self.str_to_intlabel_seq(text)
        onehot = np.zeros((len(seq), len(self)), dtype=np.double)
        onehot[np.arange(len(seq)), seq] = 1.0
        if not time_first:
            onehot = onehot.T
        return onehot

    def onehot_to_str(self, onehot: np.ndarray, time_first: bool = True) -> str:
        """
        Convert a one-hot encoded representation back to a string.

        Parameters
        ----------
        onehot : np.ndarray
            A one-hot encoded NumPy array to convert. A 1-D array is treated
            as the class scores of a single time step.
        time_first : bool, optional
            If True, the input array is expected to have shape (T, C).
            If False, it is expected to have shape (C, T). Defaults to True.

        Returns
        -------
        str
            The reconstructed string.
        """
        if onehot.ndim == 1:
            # Place the class scores on the class axis of the requested layout;
            # otherwise the argmax below would run over the length-1 time axis
            # and yield one label per class instead of a single label.
            onehot = onehot.reshape(1, -1) if time_first else onehot.reshape(-1, 1)
        dense_np_text = np.argmax(onehot, axis=1 if time_first else 0)
        return self.intlabel_seq_to_str(dense_np_text)

    @property
    def dst_alphabet_str(self) -> str:
        """
        Get the destination alphabet as a string.

        Returns
        -------
        str
            The unique destination characters in sorted order.
        """
        return self.__dst_alphabet_str

    @property
    def src_alphabet_str(self) -> str:
        """
        Get the source alphabet as a string.

        Returns
        -------
        str
            The source characters in sorted order, without the unknown
            character.
        """
        return self.__src_alphabet_str

    def __repr__(self):
        """
        Return a string representation of the LemmatizerBMP instance.

        Can act as a string serialisation of the instance.

        Returns
        -------
        str
            A string representation of the instance.
        """
        return f"LemmatizerBMP(mapping_dict={repr(self.mapping_dict)}, unknown_chr={repr(self.unknown_chr)})"