Source code for tcutility.spell_check

import numpy as np
from tcutility import log
from typing import List


[docs] def naive_recursive(a: str, b: str) -> float: ''' The naïve recursive algorithm to obtain the Levenshtein distance between two strings. We do not recommend using this algorithm as it is quite slow and faster alternatives exist. Args: a, b: strings to compare. Returns: The Levenshtein distance between the strings ``a`` and ``b``. .. seealso:: :func:`wagner_fischer` A more efficient algorithm to obtain the Levenshtein distance (up to 25x faster). ''' if len(b) == 0: return len(a) if len(a) == 0: return len(b) def tail(s): return s[1:] if a[0] == b[0]: return naive_recursive(tail(a), tail(b)) d = 1 + min(naive_recursive(tail(a), b), naive_recursive(a, tail(b)), naive_recursive(tail(a), tail(b))) return d
[docs] def wagner_fischer(a: str, b: str, substitution_cost: float = 1, case_missmatch_cost: float = 1, insertion_cost: float = 1) -> float: ''' Return the Levenshtein distance using the Wagner-Fischer algorithm. You can also change the penalty for various errors for this algorithm. By default, all types of errors incur a penalty of 1. Args: a, b: strings to compare. substitution_cost: the penalty for the erroneous substitution of a character. case_missmatch_cost: the penalty for miss-matching the case of a character. insertion_cost: the cost for the erroneous insertion or deletion of a character. Returns: The Levenshtein distance between the strings ``a`` and ``b``. Example: .. code-block:: python >>> wagner_fischer('kitten', 'sitting') 3 .. seealso:: :func:`naive_recursive` An alternative (and slower) algorithm to obtain the Levenshtein distance. ''' # initialize an empty matrix d = np.zeros((len(a)+1, len(b)+1)).astype(float) # the top row and left column are always the same for i in range(len(a)): d[i+1, 0] = i+1 for i in range(len(b)): d[0, i+1] = i+1 # iteratively build the matrix for i in range(1, len(a)+1): for j in range(1, len(b)+1): # if the two characters are the same the cost is 0 if a[i-1] == b[j-1]: cost = 0 # case miss-match elif a[i-1].lower() == b[j-1].lower(): cost = case_missmatch_cost # substitution else: cost = substitution_cost # here determine if the substitution/miss-match cost is greater than the insertion or deletion cost d[i, j] = min(d[i-1, j] + insertion_cost, d[i, j-1] + insertion_cost, d[i-1, j-1] + cost) # return the bottom-right element, this is the final edit-distance return d[-1, -1]
[docs] def get_closest(a: str, others: List[str], compare_func=wagner_fischer, ignore_case: bool = False, ignore_chars: str = '', maximum_distance: int = None, **kwargs) -> List[str]: ''' Return strings that are similar to an input string using the Levenshtein distance. Args: a: the string to compare the rest to. others: a collection of strings to compare to a. The returned strings will be taken from this collection. compare_func: the function to use to compare the strings. Defaults to the efficient :func:`wagner_fischer` algorithm. ignore_case: whether the case of the strings is taken into account. If enabled, all strings are turned to lower-case before comparison. ignore_chars: a strings specifying characters that should be ignored. maximum_distance: the maximum Levenshtein distance to allow. If it is lower than the lowest distance for the collection of strings, we return the strings with the lowest distance. If set to ``None`` we return the lowest distance strings. Returns: A collection of strings that have a Levenshtein distance to ``a`` below ``maximum_distance`` or have the lowest distance to ``a`` if all strings have a distance greater than ``maximum_distance``. If the lowest distance is ``0``, return an empty list instead. Example: .. code-block:: python >>> closest = get_closest('kitten', ['mitten', 'bitten', 'sitting']) >>> print(closest) ['mitten', 'bitten'] ''' if ignore_case: a = a.lower() for char in ignore_chars: a = a.replace(char, '') dists = [] for other in others: if ignore_case: other = other.lower() for char in ignore_chars: other = other.replace(char, '') dists.append(compare_func(a, other, **kwargs)) if maximum_distance is None: maximum_distance = -1 lowest_strs = [other for dist, other in zip(dists, others) if 0 < dist <= max(maximum_distance, min(dists))] return lowest_strs
[docs] def make_suggestion(a: str, others: List[str], **kwargs): ''' Print a warning that gives suggestions for strings that are close to a given string. Example: .. code-block:: python >>> make_suggestion('kitten', ['mitten', 'bitten', 'sitting']) [2024/01/30 15:26:35] [WARNING](main): Could not find "kitten". Did you mean mitten or bitten? .. seealso:: :func:`get_closest` for a description of the function arguments. ''' closest = get_closest(a, others, **kwargs) if len(closest) > 1: closest_string = " or ".join([", ".join(closest[:-1]), closest[-1]]) else: closest_string = closest[0] # write a warning message log.warn(f'Could not find "{a}". Did you mean {closest_string}?', caller_level=3)
[docs] def check(a: str, others: List[str], caller_level=2, **kwargs): closest = get_closest(a, others, **kwargs) if len(closest) == 0: return if len(closest) > 1: closest_string = " or ".join([", ".join(closest[:-1]), closest[-1]]) else: closest_string = closest[0] # write a warning message msg = f"({log.caller_name(caller_level)}): " + f'Could not find "{a}". Did you mean {closest_string}?' raise ValueError(msg)