import numpy as np
from tcutility import log
from typing import List
[docs]
def naive_recursive(a: str, b: str) -> float:
'''
The naïve recursive algorithm to obtain the Levenshtein distance between two strings.
We do not recommend using this algorithm as it is quite slow and faster alternatives exist.
Args:
a, b: strings to compare.
Returns:
The Levenshtein distance between the strings ``a`` and ``b``.
.. seealso::
:func:`wagner_fischer`
A more efficient algorithm to obtain the Levenshtein distance (up to 25x faster).
'''
if len(b) == 0:
return len(a)
if len(a) == 0:
return len(b)
def tail(s):
return s[1:]
if a[0] == b[0]:
return naive_recursive(tail(a), tail(b))
d = 1 + min(naive_recursive(tail(a), b), naive_recursive(a, tail(b)), naive_recursive(tail(a), tail(b)))
return d
[docs]
def wagner_fischer(a: str, b: str, substitution_cost: float = 1, case_missmatch_cost: float = 1, insertion_cost: float = 1) -> float:
'''
Return the Levenshtein distance using the Wagner-Fischer algorithm.
You can also change the penalty for various errors for this algorithm.
By default, all types of errors incur a penalty of 1.
Args:
a, b: strings to compare.
substitution_cost: the penalty for the erroneous substitution of a character.
case_missmatch_cost: the penalty for miss-matching the case of a character.
insertion_cost: the cost for the erroneous insertion or deletion of a character.
Returns:
The Levenshtein distance between the strings ``a`` and ``b``.
Example:
.. code-block:: python
>>> wagner_fischer('kitten', 'sitting')
3
.. seealso::
:func:`naive_recursive`
An alternative (and slower) algorithm to obtain the Levenshtein distance.
'''
# initialize an empty matrix
d = np.zeros((len(a)+1, len(b)+1)).astype(float)
# the top row and left column are always the same
for i in range(len(a)):
d[i+1, 0] = i+1
for i in range(len(b)):
d[0, i+1] = i+1
# iteratively build the matrix
for i in range(1, len(a)+1):
for j in range(1, len(b)+1):
# if the two characters are the same the cost is 0
if a[i-1] == b[j-1]:
cost = 0
# case miss-match
elif a[i-1].lower() == b[j-1].lower():
cost = case_missmatch_cost
# substitution
else:
cost = substitution_cost
# here determine if the substitution/miss-match cost is greater than the insertion or deletion cost
d[i, j] = min(d[i-1, j] + insertion_cost, d[i, j-1] + insertion_cost, d[i-1, j-1] + cost)
# return the bottom-right element, this is the final edit-distance
return d[-1, -1]
[docs]
def get_closest(a: str, others: List[str], compare_func=wagner_fischer, ignore_case: bool = False, ignore_chars: str = '', maximum_distance: int = None, **kwargs) -> List[str]:
'''
Return strings that are similar to an input string using the Levenshtein distance.
Args:
a: the string to compare the rest to.
others: a collection of strings to compare to a. The returned strings will be taken from this collection.
compare_func: the function to use to compare the strings. Defaults to the efficient :func:`wagner_fischer` algorithm.
ignore_case: whether the case of the strings is taken into account. If enabled, all strings are turned to lower-case before comparison.
ignore_chars: a strings specifying characters that should be ignored.
maximum_distance: the maximum Levenshtein distance to allow.
If it is lower than the lowest distance for the collection of strings, we return the strings with the lowest distance.
If set to ``None`` we return the lowest distance strings.
Returns:
A collection of strings that have a Levenshtein distance to ``a`` below ``maximum_distance``
or have the lowest distance to ``a`` if all strings have a distance greater than ``maximum_distance``.
If the lowest distance is ``0``, return an empty list instead.
Example:
.. code-block:: python
>>> closest = get_closest('kitten', ['mitten', 'bitten', 'sitting'])
>>> print(closest)
['mitten', 'bitten']
'''
if ignore_case:
a = a.lower()
for char in ignore_chars:
a = a.replace(char, '')
dists = []
for other in others:
if ignore_case:
other = other.lower()
for char in ignore_chars:
other = other.replace(char, '')
dists.append(compare_func(a, other, **kwargs))
if maximum_distance is None:
maximum_distance = -1
lowest_strs = [other for dist, other in zip(dists, others) if 0 < dist <= max(maximum_distance, min(dists))]
return lowest_strs
[docs]
def make_suggestion(a: str, others: List[str], **kwargs):
'''
Print a warning that gives suggestions for strings that are close to a given string.
Example:
.. code-block:: python
>>> make_suggestion('kitten', ['mitten', 'bitten', 'sitting'])
[2024/01/30 15:26:35] [WARNING](main): Could not find "kitten". Did you mean mitten or bitten?
.. seealso::
:func:`get_closest` for a description of the function arguments.
'''
closest = get_closest(a, others, **kwargs)
if len(closest) > 1:
closest_string = " or ".join([", ".join(closest[:-1]), closest[-1]])
else:
closest_string = closest[0]
# write a warning message
log.warn(f'Could not find "{a}". Did you mean {closest_string}?', caller_level=3)
[docs]
def check(a: str, others: List[str], caller_level=2, **kwargs):
closest = get_closest(a, others, **kwargs)
if len(closest) == 0:
return
if len(closest) > 1:
closest_string = " or ".join([", ".join(closest[:-1]), closest[-1]])
else:
closest_string = closest[0]
# write a warning message
msg = f"({log.caller_name(caller_level)}): " + f'Could not find "{a}". Did you mean {closest_string}?'
raise ValueError(msg)