|
#!/usr/bin/env python
|
|
|
|
# credit: https://github.com/aslpavel/sweep.py/blob/master/sweep.py
|
|
|
|
from functools import partial
|
|
|
|
def fuzzy_scorer(niddle, haystack):
|
|
"""Fuzzy matching for `fzy` utility
|
|
source: https://github.com/jhawthorn/fzy/blob/master/src/match.c
|
|
"""
|
|
SCORE_MIN = float("-inf")
|
|
SCORE_MAX = float("inf")
|
|
SCORE_GAP_LEADING = -0.005
|
|
SCORE_GAP_TRAILING = -0.005
|
|
SCORE_GAP_INNER = -0.01
|
|
SCORE_MATCH_CONSECUTIVE = 1.0
|
|
|
|
def char_range_with(c_start, c_stop, v, d):
|
|
d = d.copy()
|
|
d.update((chr(c), v) for c in range(ord(c_start), ord(c_stop) + 1))
|
|
return d
|
|
|
|
lower_with = partial(char_range_with, "a", "z")
|
|
upper_with = partial(char_range_with, "A", "Z")
|
|
digit_with = partial(char_range_with, "0", "9")
|
|
|
|
SCORE_MATCH_SLASH = 0.9
|
|
SCORE_MATCH_WORD = 0.8
|
|
SCORE_MATCH_CAPITAL = 0.7
|
|
SCORE_MATCH_DOT = 0.6
|
|
BONUS_MAP = {
|
|
"/": SCORE_MATCH_SLASH,
|
|
"-": SCORE_MATCH_WORD,
|
|
"_": SCORE_MATCH_WORD,
|
|
" ": SCORE_MATCH_WORD,
|
|
".": SCORE_MATCH_DOT,
|
|
}
|
|
BONUS_STATES = [{}, BONUS_MAP, lower_with(SCORE_MATCH_CAPITAL, BONUS_MAP)]
|
|
BONUS_INDEX = digit_with(1, lower_with(1, upper_with(2, {})))
|
|
|
|
def bonus(haystack):
|
|
"""Additional bonus based on previous char in haystack
|
|
"""
|
|
c_prev = "/"
|
|
bonus = []
|
|
for c in haystack:
|
|
bonus.append(BONUS_STATES[BONUS_INDEX.get(c, 0)].get(c_prev, 0))
|
|
c_prev = c
|
|
return bonus
|
|
|
|
def subsequence(niddle, haystack):
|
|
"""Check if niddle is subsequence of haystack
|
|
"""
|
|
niddle, haystack = niddle.lower(), haystack.lower()
|
|
if not niddle:
|
|
True
|
|
offset = 0
|
|
for char in niddle:
|
|
offset = haystack.find(char, offset) + 1
|
|
if offset <= 0:
|
|
return False
|
|
return True
|
|
|
|
def score(niddle, haystack):
|
|
"""Calculate score, and positions of haystack
|
|
"""
|
|
n, m = len(niddle), len(haystack)
|
|
bonus_score = bonus(haystack)
|
|
niddle, haystack = niddle.lower(), haystack.lower()
|
|
|
|
if n == 0 or n == m:
|
|
return SCORE_MAX, list(range(n))
|
|
D = [[0] * m for _ in range(n)] # best score ending with `niddle[:i]`
|
|
M = [[0] * m for _ in range(n)] # best score for `niddle[:i]`
|
|
for i in range(n):
|
|
prev_score = SCORE_MIN
|
|
gap_score = SCORE_GAP_TRAILING if i == n - 1 else SCORE_GAP_INNER
|
|
|
|
for j in range(m):
|
|
if niddle[i] == haystack[j]:
|
|
score = SCORE_MIN
|
|
if i == 0:
|
|
score = j * SCORE_GAP_LEADING + bonus_score[j]
|
|
elif j != 0:
|
|
score = max(
|
|
M[i - 1][j - 1] + bonus_score[j],
|
|
D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE,
|
|
)
|
|
D[i][j] = score
|
|
M[i][j] = prev_score = max(score, prev_score + gap_score)
|
|
else:
|
|
D[i][j] = SCORE_MIN
|
|
M[i][j] = prev_score = prev_score + gap_score
|
|
|
|
match_required = False
|
|
position = [0] * n
|
|
i, j = n - 1, m - 1
|
|
while i >= 0:
|
|
while j >= 0:
|
|
if (match_required or D[i][j] == M[i][j]) and D[i][j] != SCORE_MIN:
|
|
match_required = (
|
|
i > 0
|
|
and j > 0
|
|
and M[i][j] == D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE
|
|
)
|
|
position[i] = j
|
|
j -= 1
|
|
break
|
|
else:
|
|
j -= 1
|
|
i -= 1
|
|
|
|
return M[n - 1][m - 1], position
|
|
|
|
if subsequence(niddle, haystack):
|
|
return score(niddle, haystack)
|
|
else:
|
|
return SCORE_MIN, None
|
|
|
|
def rank_candidates(query, candidates):
|
|
"""Score candidates against query in batch and return sorted result
|
|
"""
|
|
result = []
|
|
candidates = candidates if isinstance(candidates, list) else list(candidates)
|
|
for text in candidates:
|
|
score, indices = fuzzy_scorer(query, text)
|
|
if indices is None:
|
|
continue
|
|
result.append({"text": text, "score": score, "indices": indices})
|
|
|
|
result.sort(key=lambda x: x["score"], reverse=True)
|
|
return result
|