From 8c2871f89846f2c34f52061323dc7503855266ea Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Fri, 18 Oct 2019 10:39:27 +0700 Subject: Make it easier for user to search for tags #### Closes #231 Applying the algorithm for `Needles and Haystack` to find and match tag in tags, for example: ![Example](https://cdn.discordapp.com/attachments/634243438459486219/634592981915140107/unknown.png) This only applies to searching tag_name with more than 3 in length, and at least 80% of its letters are found, from left to right. There are 3 levels of checking, stop at first found: - Check if exact name ( case insensitive ) O(1) getting from a dictionary Dict[str, Tag] - Check for all tags that has 100% matching via algorithm - Check for all tags that has >= 80% matching If there are more than one hit, it will be shown as suggestions: ![Suggestions](https://cdn.discordapp.com/attachments/634243438459486219/634595369531211778/unknown.png) In order to avoid api being called multiple times, I've implemented a cache to only refresh itself when the is a gap of more than 5 minutes from the last api call to get all tags. Editing / Adding / Deleting tags will also modify the cache directly. ##### What about other solution like fuzzywuzzy? fuzzywuzzy was considered for using, but from testing, it was giving much lower scores than expected: Code used to test: ```py from fuzzywuzzy import fuzz def _fuzzy_search(search: str, target: str) -> bool: found = 0 index = 0 _search = search.lower().replace(' ', '') _target = target.lower().replace(' ', '') for letter in _search: index = _target.find(letter, index) if index == -1: break found += index > 0 # return found / len(_search) * 100 return ( found / len(_search) * 100, fuzz.ratio(search, target), fuzz.partial_ratio(search, target) ) tests = ( 'this-is-gonna-be-fun', 'this-too-will-be-fun' ) for test in tests: print(test, '->', _fuzzy_search('this too fun', test)) ``` Result from test: ```py this-is-gonna-be-fun -> (30.0, 50, 50) this-too-will-be-fun -> (90.0, 62, 58) ``` --- bot/cogs/tags.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index cd70e783a..1aea97b37 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -9,7 +9,6 @@ from bot.converters import TagContentConverter, TagNameConverter from bot.decorators import with_role from bot.pagination import LinePaginator - log = logging.getLogger(__name__) TEST_CHANNELS = ( @@ -26,6 +25,44 @@ class Tags(Cog): self.bot = bot self.tag_cooldowns = {} + self._cache = {} + self._last_fetch = None + + async def _get_tags(self, is_forced: bool = False) -> None: + """Getting all tags.""" + # Refresh only when there's a more than 5m gap from last call. + if is_forced or not self._last_fetch or time.time() - self._last_fetch > 5 * 60: + tags = await self.bot.api_client.get('bot/tags') + self._cache = {tag['title'].lower(): tag for tag in tags} + + @staticmethod + def _fuzzy_search(search: str, target: str) -> bool: + found = 0 + index = 0 + _search = search.lower().replace(' ', '') + _target = target.lower().replace(' ', '') + for letter in _search: + index = _target.find(letter, index) + if index == -1: + break + found += index > 0 + return found / len(_search) * 100 + + def _get_suggestions(self, tag_name: str, score: int) -> list: + return sorted( + (tag for tag in self._cache.values() if Tags._fuzzy_search(tag_name, tag['title']) >= score), + key=lambda tag: Tags._fuzzy_search(tag_name, tag['title']), + reverse=True + ) + + async def _get_tag(self, tag_name: str) -> list: + """Get a specific tag.""" + await self._get_tags() + found = [self._cache.get(tag_name.lower(), None)] + if not found[0]: + return self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80) + return found + @group(name='tags', aliases=('tag', 't'), invoke_without_command=True) async def tags_group(self, ctx: Context, *, tag_name: TagNameConverter = None) -> None: """Show all known tags, a single tag, or run a subcommand.""" @@ -59,17 +96,29 @@ class Tags(Cog): f"Cooldown ends in {time_left:.1f} seconds.") return + await self._get_tags() + if tag_name is not None: - tag = await self.bot.api_client.get(f'bot/tags/{tag_name}') - if ctx.channel.id not in TEST_CHANNELS: - self.tag_cooldowns[tag_name] = { - "time": time.time(), - "channel": ctx.channel.id - } - await ctx.send(embed=Embed.from_dict(tag['embed'])) + # tag = await self.bot.api_client.get(f'bot/tags/{tag_name}') + founds = await self._get_tag(tag_name) + + if len(founds) == 1: + tag = founds[0] + if ctx.channel.id not in TEST_CHANNELS: + self.tag_cooldowns[tag_name] = { + "time": time.time(), + "channel": ctx.channel.id + } + await ctx.send(embed=Embed.from_dict(tag['embed'])) + elif founds and len(tag_name) >= 3: + await ctx.send(embed=Embed( + title='Did you mean ...', + description='\n'.join(tag['title'] for tag in founds[:10]) + )) else: - tags = await self.bot.api_client.get('bot/tags') + # tags = await self.bot.api_client.get('bot/tags') + tags = self._cache.values() if not tags: await ctx.send(embed=Embed( description="**There are no tags in the database!**", @@ -105,6 +154,7 @@ class Tags(Cog): } await self.bot.api_client.post('bot/tags', json=body) + self._cache[tag_name.lower()] = await self.bot.api_client.get(f'bot/tags/{tag_name}') log.debug(f"{ctx.author} successfully added the following tag to our database: \n" f"tag_name: {tag_name}\n" @@ -134,6 +184,7 @@ class Tags(Cog): } await self.bot.api_client.patch(f'bot/tags/{tag_name}', json=body) + self._cache[tag_name.lower()] = await self.bot.api_client.get(f'bot/tags/{tag_name}') log.debug(f"{ctx.author} successfully edited the following tag in our database: \n" f"tag_name: {tag_name}\n" @@ -150,6 +201,7 @@ class Tags(Cog): async def delete_command(self, ctx: Context, *, tag_name: TagNameConverter) -> None: """Remove a tag from the database.""" await self.bot.api_client.delete(f'bot/tags/{tag_name}') + self._cache.pop(tag_name.lower(), None) log.debug(f"{ctx.author} successfully deleted the tag called '{tag_name}'") await ctx.send(embed=Embed( -- cgit v1.2.3 From 7f0e6733de8e2b6c3d13834916d790673547e1fb Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Tue, 4 Feb 2020 12:23:29 +0700 Subject: Fixed _last_fetch not being updated after each api call. - Changed type of `self._last_fetch` to `float` and give it the initial value of `0.0` instead of `None` - Assigned `time.time()` to `time_now` to avoid calling this function twice. - Added `self._last_fetch = time_now` after calling the api call. --- bot/cogs/tags.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index 7effaf754..9e06b702c 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -27,14 +27,16 @@ class Tags(Cog): self.tag_cooldowns = {} self._cache = {} - self._last_fetch = None + self._last_fetch: float = 0.0 async def _get_tags(self, is_forced: bool = False) -> None: - """Getting all tags.""" - # Refresh only when there's a more than 5m gap from last call. - if is_forced or not self._last_fetch or time.time() - self._last_fetch > 5 * 60: + """Get all tags.""" + # refresh only when there's a more than 5m gap from last call. + time_now: float = time.time() + if is_forced or not self._last_fetch or time_now - self._last_fetch > 5 * 60: tags = await self.bot.api_client.get('bot/tags') self._cache = {tag['title'].lower(): tag for tag in tags} + self._last_fetch = time_now @staticmethod def _fuzzy_search(search: str, target: str) -> bool: -- cgit v1.2.3 From 868de4716c5b6a3120f665d460a8987bd6f16302 Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Tue, 4 Feb 2020 12:26:27 +0700 Subject: Refactored _get_suggestions following Mark's suggestions about inefficiency. - Matching scores will be calculated once now and stored in the dict `scores`. - Allow `_get_suggestions()` to go through a list of score threshold and return the first list of matching tags that's not empty and above the threshold. This avoid calling the function multiple time like before ( `self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80)` for example, is calling this function twice, and is inefficient ) - Deleted commented line. - Added `typing` module for more typehints. --- bot/cogs/tags.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index 9e06b702c..8d3586b19 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -1,5 +1,6 @@ import logging import time +from typing import Dict, List, Optional from discord import Colour, Embed from discord.ext.commands import Cog, Context, group @@ -39,9 +40,9 @@ class Tags(Cog): self._last_fetch = time_now @staticmethod - def _fuzzy_search(search: str, target: str) -> bool: - found = 0 - index = 0 + def _fuzzy_search(search: str, target: str) -> int: + """A simple scoring algorithm based on how many letters are found / total, with order in mind.""" + found, index = 0, 0 _search = search.lower().replace(' ', '') _target = target.lower().replace(' ', '') for letter in _search: @@ -51,19 +52,32 @@ class Tags(Cog): found += index > 0 return found / len(_search) * 100 - def _get_suggestions(self, tag_name: str, score: int) -> list: - return sorted( - (tag for tag in self._cache.values() if Tags._fuzzy_search(tag_name, tag['title']) >= score), - key=lambda tag: Tags._fuzzy_search(tag_name, tag['title']), - reverse=True - ) + def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]: + """Return a list of suggested tags.""" + scores: Dict[str, int] = { + tag_title: Tags._fuzzy_search(tag_name, tag['title']) + for tag_title, tag in self._cache.items() + } + + thresholds = thresholds or [100, 80] + + for threshold in thresholds: + suggestions = [ + self._cache[tag_title] + for tag_title, matching_score in scores.items() + if matching_score >= threshold + ] + if suggestions: + return suggestions + + return [] async def _get_tag(self, tag_name: str) -> list: """Get a specific tag.""" await self._get_tags() found = [self._cache.get(tag_name.lower(), None)] if not found[0]: - return self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80) + return self._get_suggestions(tag_name, thresholds=[100, 80]) return found @group(name='tags', aliases=('tag', 't'), invoke_without_command=True) @@ -102,7 +116,6 @@ class Tags(Cog): await self._get_tags() if tag_name is not None: - # tag = await self.bot.api_client.get(f'bot/tags/{tag_name}') founds = await self._get_tag(tag_name) if len(founds) == 1: @@ -120,7 +133,6 @@ class Tags(Cog): )) else: - # tags = await self.bot.api_client.get('bot/tags') tags = self._cache.values() if not tags: await ctx.send(embed=Embed( -- cgit v1.2.3 From a38926fe797cdcc13d64d836776f56db09e9efd2 Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Wed, 5 Feb 2020 04:00:46 +0700 Subject: Removed non-alphabets from both search and tag_name when scoring. - Added a regex to remove non-alphabet ( `[^a-z]` with `re.IGNORECASE` ) --- bot/cogs/tags.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index 8d3586b19..0e8cf0278 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -1,4 +1,5 @@ import logging +import re import time from typing import Dict, List, Optional @@ -19,6 +20,8 @@ TEST_CHANNELS = ( Channels.helpers ) +REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.IGNORECASE & re.MULTILINE) + class Tags(Cog): """Save new tags and fetch existing tags.""" @@ -43,8 +46,8 @@ class Tags(Cog): def _fuzzy_search(search: str, target: str) -> int: """A simple scoring algorithm based on how many letters are found / total, with order in mind.""" found, index = 0, 0 - _search = search.lower().replace(' ', '') - _target = target.lower().replace(' ', '') + _search = REGEX_NON_ALPHABET.sub('', search.lower()) + _target = REGEX_NON_ALPHABET.sub('', target.lower()) for letter in _search: index = _target.find(letter, index) if index == -1: -- cgit v1.2.3 From a6341b13cb3b3fc3ea95942a51478e875205abc6 Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Wed, 5 Feb 2020 04:01:41 +0700 Subject: Increased default thresholds from just [100, 80] to [100, 90, 80, 70, 60] - Since it is returning as soon as there are suggestions found for a threshold, this will give a better reflection of what the bot thinks user is searching for. --- bot/cogs/tags.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index 0e8cf0278..8122f739e 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -62,7 +62,7 @@ class Tags(Cog): for tag_title, tag in self._cache.items() } - thresholds = thresholds or [100, 80] + thresholds = thresholds or [100, 90, 80, 70, 60] for threshold in thresholds: suggestions = [ @@ -80,7 +80,7 @@ class Tags(Cog): await self._get_tags() found = [self._cache.get(tag_name.lower(), None)] if not found[0]: - return self._get_suggestions(tag_name, thresholds=[100, 80]) + return self._get_suggestions(tag_name) return found @group(name='tags', aliases=('tag', 't'), invoke_without_command=True) -- cgit v1.2.3 From c054790975670ee9e2b1855590d01491f3732b33 Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Wed, 5 Feb 2020 11:48:07 +0700 Subject: Removed regex, implemented a stricter letter searching. --- bot/cogs/tags.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index 8122f739e..eaf307569 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -1,5 +1,4 @@ import logging -import re import time from typing import Dict, List, Optional @@ -20,8 +19,6 @@ TEST_CHANNELS = ( Channels.helpers ) -REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.IGNORECASE & re.MULTILINE) - class Tags(Cog): """Save new tags and fetch existing tags.""" @@ -46,13 +43,18 @@ class Tags(Cog): def _fuzzy_search(search: str, target: str) -> int: """A simple scoring algorithm based on how many letters are found / total, with order in mind.""" found, index = 0, 0 - _search = REGEX_NON_ALPHABET.sub('', search.lower()) - _target = REGEX_NON_ALPHABET.sub('', target.lower()) - for letter in _search: - index = _target.find(letter, index) - if index == -1: - break - found += index > 0 + _search = search.lower().replace(' ', '') + _targets = iter(target.lower()) + _target = next(_targets) + try: + for letter in _search: + index = _target.find(letter, index) + while index == -1: + _target = next(_targets) + index = _target.find(letter) + found += 1 + except StopIteration: + pass return found / len(_search) * 100 def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]: -- cgit v1.2.3 From 8dd66bc12ecae678c2f17819b298b60823806b95 Mon Sep 17 00:00:00 2001 From: Shirayuki Nekomata Date: Wed, 5 Feb 2020 14:03:54 +0700 Subject: Made searching even stricter by searching from start of each word - Added regex back to sub and split by non-alphabet. - Now use two pointers to move from words to words. --- bot/cogs/tags.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py index eaf307569..54a51921c 100644 --- a/bot/cogs/tags.py +++ b/bot/cogs/tags.py @@ -1,4 +1,5 @@ import logging +import re import time from typing import Dict, List, Optional @@ -19,6 +20,8 @@ TEST_CHANNELS = ( Channels.helpers ) +REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.MULTILINE & re.IGNORECASE) + class Tags(Cog): """Save new tags and fetch existing tags.""" @@ -42,20 +45,19 @@ class Tags(Cog): @staticmethod def _fuzzy_search(search: str, target: str) -> int: """A simple scoring algorithm based on how many letters are found / total, with order in mind.""" - found, index = 0, 0 - _search = search.lower().replace(' ', '') - _targets = iter(target.lower()) + current, index = 0, 0 + _search = REGEX_NON_ALPHABET.sub('', search.lower()) + _targets = iter(REGEX_NON_ALPHABET.split(target.lower())) _target = next(_targets) try: - for letter in _search: - index = _target.find(letter, index) - while index == -1: - _target = next(_targets) - index = _target.find(letter) - found += 1 - except StopIteration: + while True: + while index < len(_target) and _search[current] == _target[index]: + current += 1 + index += 1 + index, _target = 0, next(_targets) + except (StopIteration, IndexError): pass - return found / len(_search) * 100 + return current / len(_search) * 100 def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]: """Return a list of suggested tags.""" -- cgit v1.2.3