From 8c2871f89846f2c34f52061323dc7503855266ea Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Fri, 18 Oct 2019 10:39:27 +0700
Subject: Make it easier for user to search for tags

#### Closes #231

Applying the algorithm for `Needles and Haystack` to find and match tag in tags, for example:

![Example](https://cdn.discordapp.com/attachments/634243438459486219/634592981915140107/unknown.png)

This only applies to searching tag_name with more than 3 in length, and at least 80% of its letters are found, from left to right.

There are 3 levels of checking, stop at first found:
- Check if exact name ( case insensitive ) O(1) getting from a dictionary Dict[str, Tag]
- Check for all tags that has 100% matching via algorithm
- Check for all tags that has >= 80% matching

If there are more than one hit, it will be shown as suggestions:

![Suggestions](https://cdn.discordapp.com/attachments/634243438459486219/634595369531211778/unknown.png)

In order to avoid api being called multiple times, I've implemented a cache to only refresh itself when the is a gap of more than 5 minutes from the last api call to get all tags.

Editing / Adding / Deleting tags will also modify the cache directly.

##### What about other solution like fuzzywuzzy?

fuzzywuzzy was considered for using, but from testing, it was giving much lower scores than expected:

Code used to test:

```py
from fuzzywuzzy import fuzz

def _fuzzy_search(search: str, target: str) -> bool:
    found = 0
    index = 0
    _search = search.lower().replace(' ', '')
    _target = target.lower().replace(' ', '')
    for letter in _search:
        index = _target.find(letter, index)
        if index == -1:
            break
        found += index > 0
    # return found / len(_search) * 100
    return (
        found / len(_search) * 100,
        fuzz.ratio(search, target),
        fuzz.partial_ratio(search, target)
    )

tests = (
    'this-is-gonna-be-fun',
    'this-too-will-be-fun'
)

for test in tests:
    print(test, '->', _fuzzy_search('this too fun', test))
```

Result from test:
```py
this-is-gonna-be-fun -> (30.0, 50, 50)
this-too-will-be-fun -> (90.0, 62, 58)
```
---
 bot/cogs/tags.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index cd70e783a..1aea97b37 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -9,7 +9,6 @@ from bot.converters import TagContentConverter, TagNameConverter
 from bot.decorators import with_role
 from bot.pagination import LinePaginator
 
-
 log = logging.getLogger(__name__)
 
 TEST_CHANNELS = (
@@ -26,6 +25,44 @@ class Tags(Cog):
         self.bot = bot
         self.tag_cooldowns = {}
 
+        self._cache = {}
+        self._last_fetch = None
+
+    async def _get_tags(self, is_forced: bool = False) -> None:
+        """Getting all tags."""
+        # Refresh only when there's a more than 5m gap from last call.
+        if is_forced or not self._last_fetch or time.time() - self._last_fetch > 5 * 60:
+            tags = await self.bot.api_client.get('bot/tags')
+            self._cache = {tag['title'].lower(): tag for tag in tags}
+
+    @staticmethod
+    def _fuzzy_search(search: str, target: str) -> bool:
+        found = 0
+        index = 0
+        _search = search.lower().replace(' ', '')
+        _target = target.lower().replace(' ', '')
+        for letter in _search:
+            index = _target.find(letter, index)
+            if index == -1:
+                break
+            found += index > 0
+        return found / len(_search) * 100
+
+    def _get_suggestions(self, tag_name: str, score: int) -> list:
+        return sorted(
+            (tag for tag in self._cache.values() if Tags._fuzzy_search(tag_name, tag['title']) >= score),
+            key=lambda tag: Tags._fuzzy_search(tag_name, tag['title']),
+            reverse=True
+        )
+
+    async def _get_tag(self, tag_name: str) -> list:
+        """Get a specific tag."""
+        await self._get_tags()
+        found = [self._cache.get(tag_name.lower(), None)]
+        if not found[0]:
+            return self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80)
+        return found
+
     @group(name='tags', aliases=('tag', 't'), invoke_without_command=True)
     async def tags_group(self, ctx: Context, *, tag_name: TagNameConverter = None) -> None:
         """Show all known tags, a single tag, or run a subcommand."""
@@ -59,17 +96,29 @@ class Tags(Cog):
                         f"Cooldown ends in {time_left:.1f} seconds.")
             return
 
+        await self._get_tags()
+
         if tag_name is not None:
-            tag = await self.bot.api_client.get(f'bot/tags/{tag_name}')
-            if ctx.channel.id not in TEST_CHANNELS:
-                self.tag_cooldowns[tag_name] = {
-                    "time": time.time(),
-                    "channel": ctx.channel.id
-                }
-            await ctx.send(embed=Embed.from_dict(tag['embed']))
+            # tag = await self.bot.api_client.get(f'bot/tags/{tag_name}')
+            founds = await self._get_tag(tag_name)
+
+            if len(founds) == 1:
+                tag = founds[0]
+                if ctx.channel.id not in TEST_CHANNELS:
+                    self.tag_cooldowns[tag_name] = {
+                        "time": time.time(),
+                        "channel": ctx.channel.id
+                    }
+                await ctx.send(embed=Embed.from_dict(tag['embed']))
+            elif founds and len(tag_name) >= 3:
+                await ctx.send(embed=Embed(
+                    title='Did you mean ...',
+                    description='\n'.join(tag['title'] for tag in founds[:10])
+                ))
 
         else:
-            tags = await self.bot.api_client.get('bot/tags')
+            # tags = await self.bot.api_client.get('bot/tags')
+            tags = self._cache.values()
             if not tags:
                 await ctx.send(embed=Embed(
                     description="**There are no tags in the database!**",
@@ -105,6 +154,7 @@ class Tags(Cog):
         }
 
         await self.bot.api_client.post('bot/tags', json=body)
+        self._cache[tag_name.lower()] = await self.bot.api_client.get(f'bot/tags/{tag_name}')
 
         log.debug(f"{ctx.author} successfully added the following tag to our database: \n"
                   f"tag_name: {tag_name}\n"
@@ -134,6 +184,7 @@ class Tags(Cog):
         }
 
         await self.bot.api_client.patch(f'bot/tags/{tag_name}', json=body)
+        self._cache[tag_name.lower()] = await self.bot.api_client.get(f'bot/tags/{tag_name}')
 
         log.debug(f"{ctx.author} successfully edited the following tag in our database: \n"
                   f"tag_name: {tag_name}\n"
@@ -150,6 +201,7 @@ class Tags(Cog):
     async def delete_command(self, ctx: Context, *, tag_name: TagNameConverter) -> None:
         """Remove a tag from the database."""
         await self.bot.api_client.delete(f'bot/tags/{tag_name}')
+        self._cache.pop(tag_name.lower(), None)
 
         log.debug(f"{ctx.author} successfully deleted the tag called '{tag_name}'")
         await ctx.send(embed=Embed(
-- 
cgit v1.2.3


From 7f0e6733de8e2b6c3d13834916d790673547e1fb Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Tue, 4 Feb 2020 12:23:29 +0700
Subject: Fixed _last_fetch not being updated after each api call.

- Changed type of `self._last_fetch` to `float` and give it the initial value of `0.0` instead of `None`
- Assigned `time.time()` to `time_now` to avoid calling this function twice.
- Added `self._last_fetch = time_now` after calling the api call.
---
 bot/cogs/tags.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index 7effaf754..9e06b702c 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -27,14 +27,16 @@ class Tags(Cog):
         self.tag_cooldowns = {}
 
         self._cache = {}
-        self._last_fetch = None
+        self._last_fetch: float = 0.0
 
     async def _get_tags(self, is_forced: bool = False) -> None:
-        """Getting all tags."""
-        # Refresh only when there's a more than 5m gap from last call.
-        if is_forced or not self._last_fetch or time.time() - self._last_fetch > 5 * 60:
+        """Get all tags."""
+        # refresh only when there's a more than 5m gap from last call.
+        time_now: float = time.time()
+        if is_forced or not self._last_fetch or time_now - self._last_fetch > 5 * 60:
             tags = await self.bot.api_client.get('bot/tags')
             self._cache = {tag['title'].lower(): tag for tag in tags}
+            self._last_fetch = time_now
 
     @staticmethod
     def _fuzzy_search(search: str, target: str) -> bool:
-- 
cgit v1.2.3


From 868de4716c5b6a3120f665d460a8987bd6f16302 Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Tue, 4 Feb 2020 12:26:27 +0700
Subject: Refactored _get_suggestions following Mark's suggestions about
 inefficiency.

- Matching scores will be calculated once now and stored in the dict `scores`.
- Allow `_get_suggestions()` to go through a list of score threshold and return the first list of matching tags that's not empty and above the threshold. This avoid calling the function multiple time like before ( `self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80)` for example, is calling this function twice, and is inefficient )
- Deleted commented line.
- Added `typing` module for more typehints.
---
 bot/cogs/tags.py | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index 9e06b702c..8d3586b19 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -1,5 +1,6 @@
 import logging
 import time
+from typing import Dict, List, Optional
 
 from discord import Colour, Embed
 from discord.ext.commands import Cog, Context, group
@@ -39,9 +40,9 @@ class Tags(Cog):
             self._last_fetch = time_now
 
     @staticmethod
-    def _fuzzy_search(search: str, target: str) -> bool:
-        found = 0
-        index = 0
+    def _fuzzy_search(search: str, target: str) -> int:
+        """A simple scoring algorithm based on how many letters are found / total, with order in mind."""
+        found, index = 0, 0
         _search = search.lower().replace(' ', '')
         _target = target.lower().replace(' ', '')
         for letter in _search:
@@ -51,19 +52,32 @@ class Tags(Cog):
             found += index > 0
         return found / len(_search) * 100
 
-    def _get_suggestions(self, tag_name: str, score: int) -> list:
-        return sorted(
-            (tag for tag in self._cache.values() if Tags._fuzzy_search(tag_name, tag['title']) >= score),
-            key=lambda tag: Tags._fuzzy_search(tag_name, tag['title']),
-            reverse=True
-        )
+    def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]:
+        """Return a list of suggested tags."""
+        scores: Dict[str, int] = {
+            tag_title: Tags._fuzzy_search(tag_name, tag['title'])
+            for tag_title, tag in self._cache.items()
+        }
+
+        thresholds = thresholds or [100, 80]
+
+        for threshold in thresholds:
+            suggestions = [
+                self._cache[tag_title]
+                for tag_title, matching_score in scores.items()
+                if matching_score >= threshold
+            ]
+            if suggestions:
+                return suggestions
+
+        return []
 
     async def _get_tag(self, tag_name: str) -> list:
         """Get a specific tag."""
         await self._get_tags()
         found = [self._cache.get(tag_name.lower(), None)]
         if not found[0]:
-            return self._get_suggestions(tag_name, 100) or self._get_suggestions(tag_name, 80)
+            return self._get_suggestions(tag_name, thresholds=[100, 80])
         return found
 
     @group(name='tags', aliases=('tag', 't'), invoke_without_command=True)
@@ -102,7 +116,6 @@ class Tags(Cog):
         await self._get_tags()
 
         if tag_name is not None:
-            # tag = await self.bot.api_client.get(f'bot/tags/{tag_name}')
             founds = await self._get_tag(tag_name)
 
             if len(founds) == 1:
@@ -120,7 +133,6 @@ class Tags(Cog):
                 ))
 
         else:
-            # tags = await self.bot.api_client.get('bot/tags')
             tags = self._cache.values()
             if not tags:
                 await ctx.send(embed=Embed(
-- 
cgit v1.2.3


From a38926fe797cdcc13d64d836776f56db09e9efd2 Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Wed, 5 Feb 2020 04:00:46 +0700
Subject: Removed non-alphabets from both search and tag_name when scoring.

- Added a regex to remove non-alphabet ( `[^a-z]` with `re.IGNORECASE` )
---
 bot/cogs/tags.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index 8d3586b19..0e8cf0278 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from typing import Dict, List, Optional
 
@@ -19,6 +20,8 @@ TEST_CHANNELS = (
     Channels.helpers
 )
 
+REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.IGNORECASE & re.MULTILINE)
+
 
 class Tags(Cog):
     """Save new tags and fetch existing tags."""
@@ -43,8 +46,8 @@ class Tags(Cog):
     def _fuzzy_search(search: str, target: str) -> int:
         """A simple scoring algorithm based on how many letters are found / total, with order in mind."""
         found, index = 0, 0
-        _search = search.lower().replace(' ', '')
-        _target = target.lower().replace(' ', '')
+        _search = REGEX_NON_ALPHABET.sub('', search.lower())
+        _target = REGEX_NON_ALPHABET.sub('', target.lower())
         for letter in _search:
             index = _target.find(letter, index)
             if index == -1:
-- 
cgit v1.2.3


From a6341b13cb3b3fc3ea95942a51478e875205abc6 Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Wed, 5 Feb 2020 04:01:41 +0700
Subject: Increased default thresholds from just [100, 80] to [100, 90, 80, 70,
 60]

- Since it is returning as soon as there are suggestions found for a threshold, this will give a better reflection of what the bot thinks user is searching for.
---
 bot/cogs/tags.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index 0e8cf0278..8122f739e 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -62,7 +62,7 @@ class Tags(Cog):
             for tag_title, tag in self._cache.items()
         }
 
-        thresholds = thresholds or [100, 80]
+        thresholds = thresholds or [100, 90, 80, 70, 60]
 
         for threshold in thresholds:
             suggestions = [
@@ -80,7 +80,7 @@ class Tags(Cog):
         await self._get_tags()
         found = [self._cache.get(tag_name.lower(), None)]
         if not found[0]:
-            return self._get_suggestions(tag_name, thresholds=[100, 80])
+            return self._get_suggestions(tag_name)
         return found
 
     @group(name='tags', aliases=('tag', 't'), invoke_without_command=True)
-- 
cgit v1.2.3


From c054790975670ee9e2b1855590d01491f3732b33 Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Wed, 5 Feb 2020 11:48:07 +0700
Subject: Removed regex, implemented a stricter letter searching.

---
 bot/cogs/tags.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index 8122f739e..eaf307569 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -1,5 +1,4 @@
 import logging
-import re
 import time
 from typing import Dict, List, Optional
 
@@ -20,8 +19,6 @@ TEST_CHANNELS = (
     Channels.helpers
 )
 
-REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.IGNORECASE & re.MULTILINE)
-
 
 class Tags(Cog):
     """Save new tags and fetch existing tags."""
@@ -46,13 +43,18 @@ class Tags(Cog):
     def _fuzzy_search(search: str, target: str) -> int:
         """A simple scoring algorithm based on how many letters are found / total, with order in mind."""
         found, index = 0, 0
-        _search = REGEX_NON_ALPHABET.sub('', search.lower())
-        _target = REGEX_NON_ALPHABET.sub('', target.lower())
-        for letter in _search:
-            index = _target.find(letter, index)
-            if index == -1:
-                break
-            found += index > 0
+        _search = search.lower().replace(' ', '')
+        _targets = iter(target.lower())
+        _target = next(_targets)
+        try:
+            for letter in _search:
+                index = _target.find(letter, index)
+                while index == -1:
+                    _target = next(_targets)
+                    index = _target.find(letter)
+                found += 1
+        except StopIteration:
+            pass
         return found / len(_search) * 100
 
     def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]:
-- 
cgit v1.2.3


From 8dd66bc12ecae678c2f17819b298b60823806b95 Mon Sep 17 00:00:00 2001
From: Shirayuki Nekomata <lethienphuoc1991@gmail.com>
Date: Wed, 5 Feb 2020 14:03:54 +0700
Subject: Made searching even stricter by searching from start of each word

- Added regex back to sub and split by non-alphabet.
- Now use two pointers to move from words to words.
---
 bot/cogs/tags.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/bot/cogs/tags.py b/bot/cogs/tags.py
index eaf307569..54a51921c 100644
--- a/bot/cogs/tags.py
+++ b/bot/cogs/tags.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from typing import Dict, List, Optional
 
@@ -19,6 +20,8 @@ TEST_CHANNELS = (
     Channels.helpers
 )
 
+REGEX_NON_ALPHABET = re.compile(r"[^a-z]", re.MULTILINE & re.IGNORECASE)
+
 
 class Tags(Cog):
     """Save new tags and fetch existing tags."""
@@ -42,20 +45,19 @@ class Tags(Cog):
     @staticmethod
     def _fuzzy_search(search: str, target: str) -> int:
         """A simple scoring algorithm based on how many letters are found / total, with order in mind."""
-        found, index = 0, 0
-        _search = search.lower().replace(' ', '')
-        _targets = iter(target.lower())
+        current, index = 0, 0
+        _search = REGEX_NON_ALPHABET.sub('', search.lower())
+        _targets = iter(REGEX_NON_ALPHABET.split(target.lower()))
         _target = next(_targets)
         try:
-            for letter in _search:
-                index = _target.find(letter, index)
-                while index == -1:
-                    _target = next(_targets)
-                    index = _target.find(letter)
-                found += 1
-        except StopIteration:
+            while True:
+                while index < len(_target) and _search[current] == _target[index]:
+                    current += 1
+                    index += 1
+                index, _target = 0, next(_targets)
+        except (StopIteration, IndexError):
             pass
-        return found / len(_search) * 100
+        return current / len(_search) * 100
 
     def _get_suggestions(self, tag_name: str, thresholds: Optional[List[int]] = None) -> List[str]:
         """Return a list of suggested tags."""
-- 
cgit v1.2.3