From b5af23252fe9186a6b1412cf67a935380f616555 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 17 Jun 2020 19:42:25 +0200 Subject: Resolve relative href urls in a html elements. Most docs will use relative urls to link across their pages, without resolving them ourselves the links remain unusable in discord's markdown and break out of codeblocks on mobile. --- bot/cogs/doc.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 204cffb37..51fb2cb82 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -7,6 +7,7 @@ from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace from typing import Any, Callable, Optional, Tuple +from urllib.parse import urljoin import discord from bs4 import BeautifulSoup @@ -98,6 +99,10 @@ def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: class DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + def convert_code(self, el: PageElement, text: str) -> str: """Undo `markdownify`s underscore escaping.""" return f"`{text}`".replace('\\', '') @@ -107,10 +112,15 @@ class DocMarkdownConverter(MarkdownConverter): code = ''.join(el.strings) return f"```py\n{code}```" + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + -def markdownify(html: str) -> DocMarkdownConverter: +def markdownify(html: str, *, url: str = "") -> DocMarkdownConverter: """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•').convert(html) + return DocMarkdownConverter(bullets='•', page_url=url).convert(html) class 
InventoryURL(commands.Converter): @@ -293,7 +303,7 @@ class Doc(commands.Cog): signatures = scraped_html[0] permalink = self.inventories[symbol] - description = markdownify(scraped_html[1]) + description = markdownify(scraped_html[1], url=permalink) # Truncate the description of the embed to the last occurrence # of a double newline (interpreted as a paragraph) before index 1000. -- cgit v1.2.3 From 5dfbec9d589f62bb1270b162d734749d5b7b069d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 17 Jun 2020 21:41:04 +0200 Subject: Make doc get greedy. This allows us to find docs for symbols with spaces in them. --- bot/cogs/doc.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 51fb2cb82..010cb9f4c 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -353,12 +353,12 @@ class Doc(commands.Cog): return embed @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: + async def docs_group(self, ctx: commands.Context, *, symbol: str) -> None: """Lookup documentation for Python symbols.""" - await ctx.invoke(self.get_command, symbol) + await ctx.invoke(self.get_command, symbol=symbol) @docs_group.command(name='get', aliases=('g',)) - async def get_command(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: + async def get_command(self, ctx: commands.Context, *, symbol: str) -> None: """ Return a documentation embed for a given symbol. 
@@ -370,7 +370,7 @@ class Doc(commands.Cog): !docs aiohttp.ClientSession !docs get aiohttp.ClientSession """ - if symbol is None: + if not symbol: inventory_embed = discord.Embed( title=f"All inventories (`{len(self.base_urls)}` total)", colour=discord.Colour.blue() @@ -392,8 +392,9 @@ class Doc(commands.Cog): doc_embed = await self.get_symbol_embed(symbol) if doc_embed is None: + symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) error_embed = discord.Embed( - description=f"Sorry, I could not find any documentation for `{symbol}`.", + description=f"Sorry, I could not find any documentation for `{(symbol)}`.", colour=discord.Colour.red() ) error_message = await ctx.send(embed=error_embed) -- cgit v1.2.3 From 39aa2fbe0d19edcb61080e49d591a370820bce47 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 17 Jun 2020 21:48:55 +0200 Subject: Skip symbols with slashes in them. The symbols mostly point to autogenerated pages, and do not link to specific symbols on their pages and are thus unreachable with the current implementation. --- bot/cogs/doc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 010cb9f4c..59c3cc729 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -191,6 +191,8 @@ class Doc(commands.Cog): for group, value in package.items(): for symbol, (package_name, _version, relative_doc_url, _) in value.items(): + if "/" in symbol: + continue # skip unreachable symbols with slashes absolute_doc_url = base_url + relative_doc_url if symbol in self.inventories: -- cgit v1.2.3 From 41e906d6b978f0745f0aff5e7065ce142282a44f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 18 Jun 2020 00:20:25 +0200 Subject: Move symbol parsing into separate methods. 
--- bot/cogs/doc.py | 66 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 59c3cc729..a1364dd8b 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -6,7 +6,7 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from urllib.parse import urljoin import discord @@ -265,30 +265,14 @@ class Doc(commands.Cog): return None if symbol_id == f"module-{symbol}": - # Get page content from the module headerlink to the - # first tag that has its class in `SEARCH_END_TAG_ATTRS` - start_tag = symbol_heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return [], "" - - end_tag = start_tag.find_next(self._match_end_tag) - if end_tag is None: - return [], "" - - description_start_index = search_html.find(str(start_tag.parent)) + len(str(start_tag.parent)) - description_end_index = search_html.find(str(end_tag)) - description = search_html[description_start_index:description_end_index] - signatures = None + parsed_module = self.parse_module_symbol(symbol_heading, search_html) + if parsed_module is None: + return None + else: + signatures, description = parsed_module else: - signatures = [] - description = str(symbol_heading.find_next_sibling("dd")) - description_pos = search_html.find(description) - # Get text of up to 3 signatures, remove unwanted symbols - for element in [symbol_heading] + symbol_heading.find_next_siblings("dt", limit=2): - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - if signature and search_html.find(str(element)) < description_pos: - signatures.append(signature) + signatures, description = self.parse_symbol(symbol_heading, search_html) return signatures, description.replace('¶', '') @@ -354,6 +338,42 @@ class Doc(commands.Cog): ) return 
embed + @classmethod + def parse_module_symbol(cls, heading: PageElement, html: str) -> Optional[Tuple[None, str]]: + """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" + start_tag = heading.find("a", attrs={"class": "headerlink"}) + if start_tag is None: + return None + + end_tag = start_tag.find_next(cls._match_end_tag) + if end_tag is None: + return None + + description_start_index = html.find(str(start_tag.parent)) + len(str(start_tag.parent)) + description_end_index = html.find(str(end_tag)) + description = html[description_start_index:description_end_index] + + return None, description + + @staticmethod + def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: + """ + Parse the signatures and description of a symbol. + + Collects up to 3 signatures from dt tags and a description from their sibling dd tag. + """ + signatures = [] + description = str(heading.find_next_sibling("dd")) + description_pos = html.find(description) + + for element in [heading] + heading.find_next_siblings("dt", limit=2): + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature and html.find(str(element)) < description_pos: + signatures.append(signature) + + return signatures, description + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: str) -> None: """Lookup documentation for Python symbols.""" -- cgit v1.2.3 From b0f46ace7b2d4997d5002eb75199490f7828d829 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 18 Jun 2020 03:58:27 +0200 Subject: Make sure only class contents are included, without methods. When parsing classes, methods would sometimes get included causing bad looking markdown to be included in the description, this is solved by collecting all text *up to* the next dt tag. 
fixes: #990 --- bot/cogs/doc.py | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index a1364dd8b..51323e64f 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -6,7 +6,7 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple, Union from urllib.parse import urljoin import discord @@ -265,7 +265,7 @@ class Doc(commands.Cog): return None if symbol_id == f"module-{symbol}": - parsed_module = self.parse_module_symbol(symbol_heading, search_html) + parsed_module = self.parse_module_symbol(symbol_heading) if parsed_module is None: return None else: @@ -339,32 +339,29 @@ class Doc(commands.Cog): return embed @classmethod - def parse_module_symbol(cls, heading: PageElement, html: str) -> Optional[Tuple[None, str]]: + def parse_module_symbol(cls, heading: PageElement) -> Optional[Tuple[None, str]]: """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" start_tag = heading.find("a", attrs={"class": "headerlink"}) if start_tag is None: return None - end_tag = start_tag.find_next(cls._match_end_tag) - if end_tag is None: + description = cls.find_all_text_until_tag(start_tag, cls._match_end_tag) + if description is None: return None - description_start_index = html.find(str(start_tag.parent)) + len(str(start_tag.parent)) - description_end_index = html.find(str(end_tag)) - description = html[description_start_index:description_end_index] - return None, description - @staticmethod - def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: + @classmethod + def parse_symbol(cls, heading: PageElement, html: str) -> Tuple[List[str], str]: """ Parse the signatures and description of a symbol. 
Collects up to 3 signatures from dt tags and a description from their sibling dd tag. """ signatures = [] - description = str(heading.find_next_sibling("dd")) - description_pos = html.find(description) + description_element = heading.find_next_sibling("dd") + description_pos = html.find(str(description_element)) + description = "".join(cls.find_all_text_until_tag(description_element, ("dt",))) for element in [heading] + heading.find_next_siblings("dt", limit=2): signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) @@ -374,6 +371,38 @@ class Doc(commands.Cog): return signatures, description + @staticmethod + def find_all_text_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str], Callable[[Tag], bool]] + ) -> Optional[str]: + """ + Get all text from

elements until a tag matching `tag_filter` is found, max 1000 elements searched. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to the tags. + If no matching end tag is found, None is returned. + """ + text = "" + element = start_element + for _ in range(1000): + if element is None: + break + + element = element.find_next() + if element.name == "p": + text += str(element) + + elif isinstance(tag_filter, tuple): + if element.name in tag_filter: + break + else: + if tag_filter(element): + break + else: + return None + return text + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: str) -> None: """Lookup documentation for Python symbols.""" -- cgit v1.2.3 From 8756c741035d007a5d3f3309b877f56b9ccd0ef1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 00:59:32 +0200 Subject: Account for `NavigableString`s when gathering text. `find_next()` only goes to tags, leaving out text outside of them when parsing. --- bot/cogs/doc.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 51323e64f..d64e6692f 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -11,7 +11,7 @@ from urllib.parse import urljoin import discord from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag +from bs4.element import NavigableString, PageElement, Tag from discord.errors import NotFound from discord.ext import commands from markdownify import MarkdownConverter @@ -377,7 +377,9 @@ class Doc(commands.Cog): tag_filter: Union[Tuple[str], Callable[[Tag], bool]] ) -> Optional[str]: """ - Get all text from

elements until a tag matching `tag_filter` is found, max 1000 elements searched. + Get all text from

elements and strings until a tag matching `tag_filter` is found. + + Max 1000 elements are searched to avoid going through whole pages when no matching tag is found. `tag_filter` can be either a tuple of string names to check against, or a filtering callable that's applied to the tags. @@ -389,7 +391,11 @@ class Doc(commands.Cog): if element is None: break - element = element.find_next() + element = element.next + while isinstance(element, NavigableString): + text += element + element = element.next + if element.name == "p": text += str(element) -- cgit v1.2.3 From e11c5a35f8f494f13323d53c0c514524902b2ae7 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 01:45:54 +0200 Subject: Also check signatures before selected symbol when collecting 3 signatures. --- bot/cogs/doc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index d64e6692f..b0adc52ba 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -363,7 +363,11 @@ class Doc(commands.Cog): description_pos = html.find(str(description_element)) description = "".join(cls.find_all_text_until_tag(description_element, ("dt",))) - for element in [heading] + heading.find_next_siblings("dt", limit=2): + for element in ( + *reversed(heading.find_previous_siblings("dt", limit=2)), + heading, + *heading.find_next_siblings("dt", limit=2), + )[-3:]: signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature and html.find(str(element)) < description_pos: -- cgit v1.2.3 From bdccd72747829560eddecc2ae247e5da3a936237 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 01:46:46 +0200 Subject: Remove unnecessary join. `find_all_text_until_tag` already returns a string so a join is not needed. 
--- bot/cogs/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index b0adc52ba..35139a050 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -361,7 +361,7 @@ class Doc(commands.Cog): signatures = [] description_element = heading.find_next_sibling("dd") description_pos = html.find(str(description_element)) - description = "".join(cls.find_all_text_until_tag(description_element, ("dt",))) + description = cls.find_all_text_until_tag(description_element, ("dt",)) for element in ( *reversed(heading.find_previous_siblings("dt", limit=2)), -- cgit v1.2.3 From d1900d537086b5d195da320cdc949e64afb99cd0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 01:52:02 +0200 Subject: Add symbol group name to symbol inventory entries. --- bot/cogs/doc.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 35139a050..741fd0ddd 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -6,7 +6,7 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union from urllib.parse import urljoin import discord @@ -67,6 +67,13 @@ FAILED_REQUEST_RETRY_AMOUNT = 3 NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay +class DocItem(NamedTuple): + """Holds inventory symbol information.""" + + url: str + group: str + + def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: """ LRU cache implementation for coroutines. 
@@ -194,10 +201,10 @@ class Doc(commands.Cog): if "/" in symbol: continue # skip unreachable symbols with slashes absolute_doc_url = base_url + relative_doc_url + group_name = group.split(":")[1] if symbol in self.inventories: - group_name = group.split(":")[1] - symbol_base_url = self.inventories[symbol].split("/", 3)[2] + symbol_base_url = self.inventories[symbol].url.split("/", 3)[2] if ( group_name in NO_OVERRIDE_GROUPS or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) @@ -209,11 +216,11 @@ class Doc(commands.Cog): # Split `package_name` because of packages like Pillow that have spaces in them. symbol = f"{package_name.split()[0]}.{symbol}" - self.inventories[symbol] = absolute_doc_url + self.inventories[symbol] = DocItem(absolute_doc_url, group_name) self.renamed_symbols.add(symbol) continue - self.inventories[symbol] = absolute_doc_url + self.inventories[symbol] = DocItem(absolute_doc_url, group_name) log.trace(f"Fetched inventory for {package_name}.") @@ -248,15 +255,15 @@ class Doc(commands.Cog): If the given symbol is a module, returns a tuple `(None, str)` else if the symbol could not be found, returns `None`. """ - url = self.inventories.get(symbol) - if url is None: + symbol_info = self.inventories.get(symbol) + if symbol_info is None: return None - async with self.bot.http_session.get(url) as response: + async with self.bot.http_session.get(symbol_info.url) as response: html = await response.text(encoding='utf-8') # Find the signature header and parse the relevant parts. 
- symbol_id = url.split('#')[-1] + symbol_id = symbol_info.url.split('#')[-1] soup = BeautifulSoup(html, 'lxml') symbol_heading = soup.find(id=symbol_id) search_html = str(soup) @@ -288,7 +295,7 @@ class Doc(commands.Cog): return None signatures = scraped_html[0] - permalink = self.inventories[symbol] + permalink = self.inventories[symbol].url description = markdownify(scraped_html[1], url=permalink) # Truncate the description of the embed to the last occurrence -- cgit v1.2.3 From d790c404ca3dba3843f351d6f42e766956aa73a1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 02:37:32 +0200 Subject: Renamed existing symbols from `NO_OVERRIDE_GROUPS` instead of replacing. Before, when a symbol from the group shared the name with a symbol outside of it the symbol was simply replaced and lost. The new implementation renames the old symbols to the group_name.symbol format before the new symbol takes their place. --- bot/cogs/doc.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 741fd0ddd..4eea06386 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -209,16 +209,21 @@ class Doc(commands.Cog): group_name in NO_OVERRIDE_GROUPS or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) ): - symbol = f"{group_name}.{symbol}" - # If renamed `symbol` already exists, add library name in front to differentiate between them. - if symbol in self.renamed_symbols: - # Split `package_name` because of packages like Pillow that have spaces in them. 
- symbol = f"{package_name.split()[0]}.{symbol}" - self.inventories[symbol] = DocItem(absolute_doc_url, group_name) + elif (overridden_symbol_group := self.inventories[symbol].group) in NO_OVERRIDE_GROUPS: + overridden_symbol = f"{overridden_symbol_group}.{symbol}" + if overridden_symbol in self.renamed_symbols: + overridden_symbol = f"{package_name.split()[0]}.{overridden_symbol}" + + self.inventories[overridden_symbol] = self.inventories[symbol] + self.renamed_symbols.add(overridden_symbol) + + # If renamed `symbol` already exists, add library name in front to differentiate between them. + if symbol in self.renamed_symbols: + # Split `package_name` because of packages like Pillow that have spaces in them. + symbol = f"{package_name.split()[0]}.{symbol}" self.renamed_symbols.add(symbol) - continue self.inventories[symbol] = DocItem(absolute_doc_url, group_name) -- cgit v1.2.3 From bca55c25ffb3631ba05889a88908a02ccb2beb2a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 02:42:26 +0200 Subject: Fix typehint. --- bot/cogs/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 4eea06386..a01f6d64d 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -125,7 +125,7 @@ class DocMarkdownConverter(MarkdownConverter): return super().convert_a(el, text) -def markdownify(html: str, *, url: str = "") -> DocMarkdownConverter: +def markdownify(html: str, *, url: str = "") -> str: """Create a DocMarkdownConverter object from the input html.""" return DocMarkdownConverter(bullets='•', page_url=url).convert(html) -- cgit v1.2.3 From 38991027a38b1adc4be3c99d126dae76a3a62036 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 03:09:23 +0200 Subject: Correct return when a module symbol could not be parsed. 
--- bot/cogs/doc.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index a01f6d64d..1c9d80e47 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -279,7 +279,7 @@ class Doc(commands.Cog): if symbol_id == f"module-{symbol}": parsed_module = self.parse_module_symbol(symbol_heading) if parsed_module is None: - return None + return [], "" else: signatures, description = parsed_module @@ -538,14 +538,13 @@ class Doc(commands.Cog): old_inventories = set(self.base_urls) with ctx.typing(): await self.refresh_inventory() - # Get differences of added and removed inventories - added = ', '.join(inv for inv in self.base_urls if inv not in old_inventories) - if added: - added = f"+ {added}" - - removed = ', '.join(inv for inv in old_inventories if inv not in self.base_urls) - if removed: - removed = f"- {removed}" + new_inventories = set(self.base_urls) + + if added := ", ".join(new_inventories - old_inventories): + added = "+ " + added + + if removed := ", ".join(old_inventories - new_inventories): + removed = "- " + removed embed = discord.Embed( title="Inventories refreshed", -- cgit v1.2.3 From a28ae5dfb610151060eab9856c44b2d192131f0d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 15:58:55 +0200 Subject: Strip backticks from symbol input. This allows the user to wrap symbols in codeblocks to avoid markdown. --- bot/cogs/doc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 1c9d80e47..0dc1713a3 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -458,6 +458,7 @@ class Doc(commands.Cog): await ctx.send(embed=inventory_embed) else: + symbol = symbol.strip("`") # Fetching documentation for a symbol (at least for the first time, since # caching is used) takes quite some time, so let's send typing to indicate # that we got the command, but are still working on it. 
-- cgit v1.2.3 From c461bef250cd3d44fac2c0e64da21072f963909d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 27 Jun 2020 15:46:47 +0200 Subject: Redesign `find_all_text_until_tag` to search through all direct children. The previous approach didn't work for arbitrary tags with text. --- bot/cogs/doc.py | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 0dc1713a3..e4b54f0a5 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -11,7 +11,7 @@ from urllib.parse import urljoin import discord from bs4 import BeautifulSoup -from bs4.element import NavigableString, PageElement, Tag +from bs4.element import PageElement, Tag from discord.errors import NotFound from discord.ext import commands from markdownify import MarkdownConverter @@ -357,7 +357,7 @@ class Doc(commands.Cog): if start_tag is None: return None - description = cls.find_all_text_until_tag(start_tag, cls._match_end_tag) + description = cls.find_all_children_until_tag(start_tag, cls._match_end_tag) if description is None: return None @@ -373,7 +373,7 @@ class Doc(commands.Cog): signatures = [] description_element = heading.find_next_sibling("dd") description_pos = html.find(str(description_element)) - description = cls.find_all_text_until_tag(description_element, ("dt",)) + description = cls.find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) for element in ( *reversed(heading.find_previous_siblings("dt", limit=2)), @@ -388,41 +388,26 @@ class Doc(commands.Cog): return signatures, description @staticmethod - def find_all_text_until_tag( + def find_all_children_until_tag( start_element: PageElement, - tag_filter: Union[Tuple[str], Callable[[Tag], bool]] + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] ) -> Optional[str]: """ - Get all text from

elements and strings until a tag matching `tag_filter` is found. - - Max 1000 elements are searched to avoid going through whole pages when no matching tag is found. + Get all direct children until a child matching `tag_filter` is found. `tag_filter` can be either a tuple of string names to check against, or a filtering callable that's applied to the tags. - If no matching end tag is found, None is returned. """ text = "" - element = start_element - for _ in range(1000): - if element is None: - break - - element = element.next - while isinstance(element, NavigableString): - text += element - element = element.next - if element.name == "p": - text += str(element) - - elif isinstance(tag_filter, tuple): + for element in start_element.find_next().find_next_siblings(): + if isinstance(tag_filter, tuple): if element.name in tag_filter: break - else: - if tag_filter(element): - break - else: - return None + elif tag_filter(element): + break + text += str(element) + return text @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) -- cgit v1.2.3 From ff3afe58548a8f1ed675c1933545e481e99bfc78 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 27 Jun 2020 15:48:28 +0200 Subject: Only include one newline for `p` tags in `li` elements. 
--- bot/cogs/doc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index e4b54f0a5..c1e8cebcf 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -124,6 +124,13 @@ class DocMarkdownConverter(MarkdownConverter): el["href"] = urljoin(self.page_url, el["href"]) return super().convert_a(el, text) + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) + def markdownify(html: str, *, url: str = "") -> str: """Create a DocMarkdownConverter object from the input html.""" -- cgit v1.2.3 From 6532618a503a55653499089a2d6a4ca43be7e2bf Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 28 Jun 2020 01:45:17 +0200 Subject: Only update added inventory instead of all. --- bot/cogs/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index c1e8cebcf..7c4beb075 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -504,7 +504,7 @@ class Doc(commands.Cog): # Rebuilding the inventory can take some time, so lets send out a # typing event to show that the Bot is still working. async with ctx.typing(): - await self.refresh_inventory() + await self.update_single(package_name, base_url, inventory_url) await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) -- cgit v1.2.3 From fd839ef3f193586c204f52ca76a84c18a8f3ba1e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 29 Jun 2020 02:39:00 +0200 Subject: Add stat for packages of fetched symbols. An additional variable is added to the DocItem named tuple to accommodate this. 
The `_package_name` is separated from `api_package_name` it previously overwrote and is now used for the stats and renamed symbols because the names are in a friendlier format. --- bot/cogs/doc.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 7c4beb075..e1c25d173 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -6,7 +6,7 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Union from urllib.parse import urljoin import discord @@ -70,6 +70,7 @@ NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay class DocItem(NamedTuple): """Holds inventory symbol information.""" + package: str url: str group: str @@ -174,7 +175,7 @@ class Doc(commands.Cog): def __init__(self, bot: Bot): self.base_urls = {} self.bot = bot - self.inventories = {} + self.inventories: Dict[str, DocItem] = {} self.renamed_symbols = set() self.bot.loop.create_task(self.init_refresh_inventory()) @@ -185,7 +186,7 @@ class Doc(commands.Cog): await self.refresh_inventory() async def update_single( - self, package_name: str, base_url: str, inventory_url: str + self, api_package_name: str, base_url: str, inventory_url: str ) -> None: """ Rebuild the inventory for a single package. 
@@ -197,14 +198,14 @@ class Doc(commands.Cog): * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running `intersphinx.fetch_inventory` in an executor on the bot's event loop """ - self.base_urls[package_name] = base_url + self.base_urls[api_package_name] = base_url package = await self._fetch_inventory(inventory_url) if not package: return None for group, value in package.items(): - for symbol, (package_name, _version, relative_doc_url, _) in value.items(): + for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): if "/" in symbol: continue # skip unreachable symbols with slashes absolute_doc_url = base_url + relative_doc_url @@ -221,7 +222,7 @@ class Doc(commands.Cog): elif (overridden_symbol_group := self.inventories[symbol].group) in NO_OVERRIDE_GROUPS: overridden_symbol = f"{overridden_symbol_group}.{symbol}" if overridden_symbol in self.renamed_symbols: - overridden_symbol = f"{package_name.split()[0]}.{overridden_symbol}" + overridden_symbol = f"{api_package_name}.{overridden_symbol}" self.inventories[overridden_symbol] = self.inventories[symbol] self.renamed_symbols.add(overridden_symbol) @@ -229,12 +230,12 @@ class Doc(commands.Cog): # If renamed `symbol` already exists, add library name in front to differentiate between them. if symbol in self.renamed_symbols: # Split `package_name` because of packages like Pillow that have spaces in them. 
- symbol = f"{package_name.split()[0]}.{symbol}" + symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - self.inventories[symbol] = DocItem(absolute_doc_url, group_name) + self.inventories[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) - log.trace(f"Fetched inventory for {package_name}.") + log.trace(f"Fetched inventory for {api_package_name}.") async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" @@ -306,8 +307,10 @@ class Doc(commands.Cog): if scraped_html is None: return None + symbol_obj = self.inventories[symbol] + self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") signatures = scraped_html[0] - permalink = self.inventories[symbol].url + permalink = symbol_obj.url description = markdownify(scraped_html[1], url=permalink) # Truncate the description of the embed to the last occurrence -- cgit v1.2.3 From b6dc7536fd90e27f5dfdf3204dc2f17917d78ee2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 29 Jun 2020 02:42:27 +0200 Subject: Trigger typing in converter instead of command. The converter does a web request so triggering typing in the command itself left out a period where the bot seemed inactive. --- bot/cogs/doc.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index e1c25d173..50aa9bbad 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -151,6 +151,7 @@ class InventoryURL(commands.Converter): @staticmethod async def convert(ctx: commands.Context, url: str) -> str: """Convert url to Intersphinx inventory URL.""" + await ctx.trigger_typing() try: intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) except AttributeError: @@ -504,10 +505,7 @@ class Doc(commands.Cog): f"Inventory URL: {inventory_url}" ) - # Rebuilding the inventory can take some time, so lets send out a - # typing event to show that the Bot is still working. 
- async with ctx.typing(): - await self.update_single(package_name, base_url, inventory_url) + await self.update_single(package_name, base_url, inventory_url) await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) -- cgit v1.2.3 From 782cd1771ce9254761a70bbfbfa8e883c1330c6c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 29 Jun 2020 16:27:24 +0200 Subject: Add option for user to delete the not found message before it's auto deleted. --- bot/cogs/doc.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 50aa9bbad..b288a92b1 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -12,7 +12,6 @@ from urllib.parse import urljoin import discord from bs4 import BeautifulSoup from bs4.element import PageElement, Tag -from discord.errors import NotFound from discord.ext import commands from markdownify import MarkdownConverter from requests import ConnectTimeout, ConnectionError, HTTPError @@ -24,6 +23,7 @@ from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import ValidPythonIdentifier, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator +from bot.utils.messages import wait_for_deletion log = logging.getLogger(__name__) @@ -468,9 +468,16 @@ class Doc(commands.Cog): colour=discord.Colour.red() ) error_message = await ctx.send(embed=error_embed) - with suppress(NotFound): - await error_message.delete(delay=NOT_FOUND_DELETE_DELAY) - await ctx.message.delete(delay=NOT_FOUND_DELETE_DELAY) + await wait_for_deletion( + error_message, + (ctx.author.id,), + timeout=NOT_FOUND_DELETE_DELAY, + client=self.bot + ) + with suppress(discord.NotFound): + await ctx.message.delete() + with suppress(discord.NotFound): + await error_message.delete() else: await ctx.send(embed=doc_embed) -- cgit v1.2.3 From 
fa60e51243c56e6658a91ea63be67a42e22f1512 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 6 Jul 2020 21:23:41 +0200 Subject: Intern `group_names` --- bot/cogs/doc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index b288a92b1..0975285e8 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -2,6 +2,7 @@ import asyncio import functools import logging import re +import sys import textwrap from collections import OrderedDict from contextlib import suppress @@ -210,7 +211,9 @@ class Doc(commands.Cog): if "/" in symbol: continue # skip unreachable symbols with slashes absolute_doc_url = base_url + relative_doc_url - group_name = group.split(":")[1] + # Intern the group names since they're reused in all the DocItems + # to remove unnecessary memory consumption from them being unique objects + group_name = sys.intern(group.split(":")[1]) if symbol in self.inventories: symbol_base_url = self.inventories[symbol].url.split("/", 3)[2] -- cgit v1.2.3 From 09987afb9b1e39fc5618b4217e1f33860cdd4bb4 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 7 Jul 2020 01:25:14 +0200 Subject: Create method to fetch and create a BeautifulSoup object from an url. Moving this part of the logic into a separate method allows us to put a cache on it, which caches the whole HTML document from the given url, removing the need to do requests to the same URL for every symbol behind it. 
--- bot/cogs/doc.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 0975285e8..71bfcfd4a 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -275,13 +275,9 @@ class Doc(commands.Cog): symbol_info = self.inventories.get(symbol) if symbol_info is None: return None + request_url, symbol_id = symbol_info.url.rsplit('#') - async with self.bot.http_session.get(symbol_info.url) as response: - html = await response.text(encoding='utf-8') - - # Find the signature header and parse the relevant parts. - symbol_id = symbol_info.url.split('#')[-1] - soup = BeautifulSoup(html, 'lxml') + soup = await self._get_soup_from_url(request_url) symbol_heading = soup.find(id=symbol_id) search_html = str(soup) @@ -424,6 +420,15 @@ class Doc(commands.Cog): return text + @async_cache(arg_offset=1) + async def _get_soup_from_url(self, url: str) -> BeautifulSoup: + """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" + log.trace(f"Sending a request to {url}.") + async with self.bot.http_session.get(url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') + soup.find("head").decompose() # the head contains no useful data so we can remove it + return soup + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: str) -> None: """Lookup documentation for Python symbols.""" -- cgit v1.2.3 From 8462abaa15e0f9eb7b4f861d0485686ec7470ed0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 7 Jul 2020 01:26:34 +0200 Subject: Use the group attribute instead of checking the symbol name. 
--- bot/cogs/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 71bfcfd4a..5ebfb6c25 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -284,7 +284,7 @@ class Doc(commands.Cog): if symbol_heading is None: return None - if symbol_id == f"module-{symbol}": + if symbol_info.group == "module": parsed_module = self.parse_module_symbol(symbol_heading) if parsed_module is None: return [], "" -- cgit v1.2.3 From 03dbddfcae35e47d57222343817ea779d6b67ab2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 10 Jul 2020 22:36:19 +0200 Subject: Remove codeblock from symbol embed title. The code block caused the url to not highlight the title text on mobile --- bot/cogs/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 5ebfb6c25..e2e3adb4e 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -350,7 +350,7 @@ class Doc(commands.Cog): embed_description += f"\n{description}" embed = discord.Embed( - title=f'`{symbol}`', + title=discord.utils.escape_markdown(symbol), url=permalink, description=embed_description ) -- cgit v1.2.3 From b59e39557ae97ac6bbc4e294651d1fe654bb2d21 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 14 Jul 2020 00:13:42 +0200 Subject: Add doc suffix to doc commands. The `set` command shadowed the `set` symbol, causing the command to seemingly not work. 
A suffix was added to all commands to keep them consistent and future proof; the shorthands were kept unchanged --- bot/cogs/doc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index e2e3adb4e..7f1fb6135 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -434,7 +434,7 @@ class Doc(commands.Cog): """Lookup documentation for Python symbols.""" await ctx.invoke(self.get_command, symbol=symbol) - @docs_group.command(name='get', aliases=('g',)) + @docs_group.command(name='getdoc', aliases=('g',)) async def get_command(self, ctx: commands.Context, *, symbol: str) -> None: """ Return a documentation embed for a given symbol. @@ -489,7 +489,7 @@ class Doc(commands.Cog): else: await ctx.send(embed=doc_embed) - @docs_group.command(name='set', aliases=('s',)) + @docs_group.command(name='setdoc', aliases=('s',)) @with_role(*MODERATION_ROLES) async def set_command( self, ctx: commands.Context, package_name: ValidPythonIdentifier, @@ -523,7 +523,7 @@ class Doc(commands.Cog): await self.update_single(package_name, base_url, inventory_url) await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") - @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) + @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @with_role(*MODERATION_ROLES) async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: """ @@ -540,7 +540,7 @@ class Doc(commands.Cog): await self.refresh_inventory() await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") - @docs_group.command(name="refresh", aliases=("rfsh", "r")) + @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @with_role(*MODERATION_ROLES) async def refresh_command(self, ctx: commands.Context) -> None: """Refresh inventories and send differences to channel.""" -- cgit v1.2.3 From ea0dcabbca10c5fe2afcee2b9451e1494bc069a2 Mon Sep 17 00:00:00 2001 From: 
Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 14 Jul 2020 00:18:58 +0200 Subject: Make the symbol parameter optional. The commands were changed to be greedy, this however made them required arguments breaking the access to the default listing of the available inventories --- bot/cogs/doc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 7f1fb6135..66c4b4ea8 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -430,12 +430,12 @@ class Doc(commands.Cog): return soup @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, *, symbol: str) -> None: + async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: """Lookup documentation for Python symbols.""" await ctx.invoke(self.get_command, symbol=symbol) @docs_group.command(name='getdoc', aliases=('g',)) - async def get_command(self, ctx: commands.Context, *, symbol: str) -> None: + async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: """ Return a documentation embed for a given symbol. -- cgit v1.2.3 From 40d831fb7b5ca7192fb1bdca8be9157f206eb2bc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 14 Jul 2020 03:40:52 +0200 Subject: Change package name converter to only accept _a-z. Package names are now directly used for stats, where the lowercase a-z characters and _ are used. 
--- bot/cogs/doc.py | 6 +++--- bot/converters.py | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 66c4b4ea8..09bddb02c 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -21,7 +21,7 @@ from urllib3.exceptions import ProtocolError from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import ValidPythonIdentifier, ValidURL +from bot.converters import PackageName, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion @@ -492,7 +492,7 @@ class Doc(commands.Cog): @docs_group.command(name='setdoc', aliases=('s',)) @with_role(*MODERATION_ROLES) async def set_command( - self, ctx: commands.Context, package_name: ValidPythonIdentifier, + self, ctx: commands.Context, package_name: PackageName, base_url: ValidURL, inventory_url: InventoryURL ) -> None: """ @@ -525,7 +525,7 @@ class Doc(commands.Cog): @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @with_role(*MODERATION_ROLES) - async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: + async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: """ Removes the specified package from the database. diff --git a/bot/converters.py b/bot/converters.py index 72c46fdf0..fac94e9d0 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -34,22 +34,20 @@ def allowed_strings(*values, preserve_case: bool = False) -> t.Callable[[str], s return converter -class ValidPythonIdentifier(Converter): +class PackageName(Converter): """ - A converter that checks whether the given string is a valid Python identifier. + A converter that checks whether the given string is a valid package name. - This is used to have package names that correspond to how you would use the package in your - code, e.g. `import package`. 
- - Raises `BadArgument` if the argument is not a valid Python identifier, and simply passes through - the given argument otherwise. + Package names are used for stats and are restricted to the a-z and _ characters. """ - @staticmethod - async def convert(ctx: Context, argument: str) -> str: - """Checks whether the given string is a valid Python identifier.""" - if not argument.isidentifier(): - raise BadArgument(f"`{argument}` is not a valid Python identifier") + PACKAGE_NAME_RE = re.compile(r"[^a-z_]") + + @classmethod + async def convert(cls, ctx: Context, argument: str) -> str: + """Checks whether the given string is a valid package name.""" + if cls.PACKAGE_NAME_RE.search(argument): + raise BadArgument("The provided package name is not valid, please only use the _ and a-z characters.") return argument -- cgit v1.2.3 From 68805bb77d56f22854508f7912d00bdaab5daf5c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 14 Jul 2020 03:49:18 +0200 Subject: Change docstrings to use suffixed command names. --- bot/cogs/doc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 09bddb02c..673a1156f 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -445,7 +445,7 @@ class Doc(commands.Cog): !docs !docs aiohttp !docs aiohttp.ClientSession - !docs get aiohttp.ClientSession + !docs getdoc aiohttp.ClientSession """ if not symbol: inventory_embed = discord.Embed( @@ -501,7 +501,7 @@ class Doc(commands.Cog): The database will update the object, should an existing item with the specified `package_name` already exist. Example: - !docs set \ + !docs setdoc \ python \ https://docs.python.org/3/ \ https://docs.python.org/3/objects.inv @@ -530,7 +530,7 @@ class Doc(commands.Cog): Removes the specified package from the database. 
Examples: - !docs delete aiohttp + !docs deletedoc aiohttp """ await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') -- cgit v1.2.3 From d1413409f3cbfaaec94060df5c0fea7827fe874b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 14 Jul 2020 23:54:03 +0200 Subject: Rename inventories to doc_symbols. --- bot/cogs/doc.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 673a1156f..526747bf4 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -177,7 +177,7 @@ class Doc(commands.Cog): def __init__(self, bot: Bot): self.base_urls = {} self.bot = bot - self.inventories: Dict[str, DocItem] = {} + self.doc_symbols: Dict[str, DocItem] = {} self.renamed_symbols = set() self.bot.loop.create_task(self.init_refresh_inventory()) @@ -215,20 +215,20 @@ class Doc(commands.Cog): # to remove unnecessary memory consumption from them being unique objects group_name = sys.intern(group.split(":")[1]) - if symbol in self.inventories: - symbol_base_url = self.inventories[symbol].url.split("/", 3)[2] + if symbol in self.doc_symbols: + symbol_base_url = self.doc_symbols[symbol].url.split("/", 3)[2] if ( group_name in NO_OVERRIDE_GROUPS or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) ): symbol = f"{group_name}.{symbol}" - elif (overridden_symbol_group := self.inventories[symbol].group) in NO_OVERRIDE_GROUPS: + elif (overridden_symbol_group := self.doc_symbols[symbol].group) in NO_OVERRIDE_GROUPS: overridden_symbol = f"{overridden_symbol_group}.{symbol}" if overridden_symbol in self.renamed_symbols: overridden_symbol = f"{api_package_name}.{overridden_symbol}" - self.inventories[overridden_symbol] = self.inventories[symbol] + self.doc_symbols[overridden_symbol] = self.doc_symbols[symbol] self.renamed_symbols.add(overridden_symbol) # If renamed `symbol` already exists, add library name in front to differentiate between them. 
@@ -237,7 +237,7 @@ class Doc(commands.Cog): symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - self.inventories[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) + self.doc_symbols[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) log.trace(f"Fetched inventory for {api_package_name}.") @@ -245,11 +245,11 @@ class Doc(commands.Cog): """Refresh internal documentation inventory.""" log.debug("Refreshing documentation inventory...") - # Clear the old base URLS and inventories to ensure + # Clear the old base URLS and doc symbols to ensure # that we start from a fresh local dataset. # Also, reset the cache used for fetching documentation. self.base_urls.clear() - self.inventories.clear() + self.doc_symbols.clear() self.renamed_symbols.clear() async_cache.cache = OrderedDict() @@ -272,7 +272,7 @@ class Doc(commands.Cog): If the given symbol is a module, returns a tuple `(None, str)` else if the symbol could not be found, returns `None`. """ - symbol_info = self.inventories.get(symbol) + symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: return None request_url, symbol_id = symbol_info.url.rsplit('#') @@ -307,7 +307,7 @@ class Doc(commands.Cog): if scraped_html is None: return None - symbol_obj = self.inventories[symbol] + symbol_obj = self.doc_symbols[symbol] self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") signatures = scraped_html[0] permalink = symbol_obj.url -- cgit v1.2.3 From daa46eccc6518e567777240d7b94f121c5eacf57 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 18 Jul 2020 15:52:25 +0200 Subject: Create a package for the Doc cog. 
--- bot/cogs/doc.py | 603 ----------------------------------------------- bot/cogs/doc/__init__.py | 7 + bot/cogs/doc/cog.py | 598 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 605 insertions(+), 603 deletions(-) delete mode 100644 bot/cogs/doc.py create mode 100644 bot/cogs/doc/__init__.py create mode 100644 bot/cogs/doc/cog.py diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py deleted file mode 100644 index 526747bf4..000000000 --- a/bot/cogs/doc.py +++ /dev/null @@ -1,603 +0,0 @@ -import asyncio -import functools -import logging -import re -import sys -import textwrap -from collections import OrderedDict -from contextlib import suppress -from types import SimpleNamespace -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Union -from urllib.parse import urljoin - -import discord -from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag -from discord.ext import commands -from markdownify import MarkdownConverter -from requests import ConnectTimeout, ConnectionError, HTTPError -from sphinx.ext import intersphinx -from urllib3.exceptions import ProtocolError - -from bot.bot import Bot -from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import PackageName, ValidURL -from bot.decorators import with_role -from bot.pagination import LinePaginator -from bot.utils.messages import wait_for_deletion - - -log = logging.getLogger(__name__) -logging.getLogger('urllib3').setLevel(logging.WARNING) - -# Since Intersphinx is intended to be used with Sphinx, -# we need to mock its configuration. 
-SPHINX_MOCK_APP = SimpleNamespace( - config=SimpleNamespace( - intersphinx_timeout=3, - tls_verify=True, - user_agent="python3:python-discord/bot:1.0.0" - ) -) - -NO_OVERRIDE_GROUPS = ( - "2to3fixer", - "token", - "label", - "pdbcommand", - "term", -) -NO_OVERRIDE_PACKAGES = ( - "python", -) - -SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") - -FAILED_REQUEST_RETRY_AMOUNT = 3 -NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay - - -class DocItem(NamedTuple): - """Holds inventory symbol information.""" - - package: str - url: str - group: str - - -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: - """ - LRU cache implementation for coroutines. - - Once the cache exceeds the maximum size, keys are deleted in FIFO order. - - An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. - """ - # Assign the cache to the function itself so we can clear it from outside. 
- async_cache.cache = OrderedDict() - - def decorator(function: Callable) -> Callable: - """Define the async_cache decorator.""" - @functools.wraps(function) - async def wrapper(*args) -> Any: - """Decorator wrapper for the caching logic.""" - key = ':'.join(args[arg_offset:]) - - value = async_cache.cache.get(key) - if value is None: - if len(async_cache.cache) > max_size: - async_cache.cache.popitem(last=False) - - async_cache.cache[key] = await function(*args) - return async_cache.cache[key] - return wrapper - return decorator - - -class DocMarkdownConverter(MarkdownConverter): - """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - - def __init__(self, *, page_url: str, **options): - super().__init__(**options) - self.page_url = page_url - - def convert_code(self, el: PageElement, text: str) -> str: - """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') - - def convert_pre(self, el: PageElement, text: str) -> str: - """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) - return f"```py\n{code}```" - - def convert_a(self, el: PageElement, text: str) -> str: - """Resolve relative URLs to `self.page_url`.""" - el["href"] = urljoin(self.page_url, el["href"]) - return super().convert_a(el, text) - - def convert_p(self, el: PageElement, text: str) -> str: - """Include only one newline instead of two when the parent is a li tag.""" - parent = el.parent - if parent is not None and parent.name == "li": - return f"{text}\n" - return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•', page_url=url).convert(html) - - -class InventoryURL(commands.Converter): - """ - Represents an Intersphinx inventory URL. - - This converter checks whether intersphinx accepts the given inventory URL, and raises - `BadArgument` if that is not the case. 
- - Otherwise, it simply passes through the given URL. - """ - - @staticmethod - async def convert(ctx: commands.Context, url: str) -> str: - """Convert url to Intersphinx inventory URL.""" - await ctx.trigger_typing() - try: - intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) - except AttributeError: - raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") - except ConnectionError: - if url.startswith('https'): - raise commands.BadArgument( - f"Cannot establish a connection to `{url}`. Does it support HTTPS?" - ) - raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") - except ValueError: - raise commands.BadArgument( - f"Failed to read Intersphinx inventory from URL `{url}`. " - "Are you sure that it's a valid inventory file?" - ) - return url - - -class Doc(commands.Cog): - """A set of commands for querying & displaying documentation.""" - - def __init__(self, bot: Bot): - self.base_urls = {} - self.bot = bot - self.doc_symbols: Dict[str, DocItem] = {} - self.renamed_symbols = set() - - self.bot.loop.create_task(self.init_refresh_inventory()) - - async def init_refresh_inventory(self) -> None: - """Refresh documentation inventory on cog initialization.""" - await self.bot.wait_until_guild_available() - await self.refresh_inventory() - - async def update_single( - self, api_package_name: str, base_url: str, inventory_url: str - ) -> None: - """ - Rebuild the inventory for a single package. 
- - Where: - * `package_name` is the package name to use, appears in the log - * `base_url` is the root documentation URL for the specified package, used to build - absolute paths that link to specific symbols - * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running - `intersphinx.fetch_inventory` in an executor on the bot's event loop - """ - self.base_urls[api_package_name] = base_url - - package = await self._fetch_inventory(inventory_url) - if not package: - return None - - for group, value in package.items(): - for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): - if "/" in symbol: - continue # skip unreachable symbols with slashes - absolute_doc_url = base_url + relative_doc_url - # Intern the group names since they're reused in all the DocItems - # to remove unnecessary memory consumption from them being unique objects - group_name = sys.intern(group.split(":")[1]) - - if symbol in self.doc_symbols: - symbol_base_url = self.doc_symbols[symbol].url.split("/", 3)[2] - if ( - group_name in NO_OVERRIDE_GROUPS - or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) - ): - symbol = f"{group_name}.{symbol}" - - elif (overridden_symbol_group := self.doc_symbols[symbol].group) in NO_OVERRIDE_GROUPS: - overridden_symbol = f"{overridden_symbol_group}.{symbol}" - if overridden_symbol in self.renamed_symbols: - overridden_symbol = f"{api_package_name}.{overridden_symbol}" - - self.doc_symbols[overridden_symbol] = self.doc_symbols[symbol] - self.renamed_symbols.add(overridden_symbol) - - # If renamed `symbol` already exists, add library name in front to differentiate between them. - if symbol in self.renamed_symbols: - # Split `package_name` because of packages like Pillow that have spaces in them. 
- symbol = f"{api_package_name}.{symbol}" - self.renamed_symbols.add(symbol) - - self.doc_symbols[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) - - log.trace(f"Fetched inventory for {api_package_name}.") - - async def refresh_inventory(self) -> None: - """Refresh internal documentation inventory.""" - log.debug("Refreshing documentation inventory...") - - # Clear the old base URLS and doc symbols to ensure - # that we start from a fresh local dataset. - # Also, reset the cache used for fetching documentation. - self.base_urls.clear() - self.doc_symbols.clear() - self.renamed_symbols.clear() - async_cache.cache = OrderedDict() - - # Run all coroutines concurrently - since each of them performs a HTTP - # request, this speeds up fetching the inventory data heavily. - coros = [ - self.update_single( - package["package"], package["base_url"], package["inventory_url"] - ) for package in await self.bot.api_client.get('bot/documentation-links') - ] - await asyncio.gather(*coros) - - async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: - """ - Given a Python symbol, return its signature and description. - - The first tuple element is the signature of the given symbol as a markup-free string, and - the second tuple element is the description of the given symbol with HTML markup included. - - If the given symbol is a module, returns a tuple `(None, str)` - else if the symbol could not be found, returns `None`. 
- """ - symbol_info = self.doc_symbols.get(symbol) - if symbol_info is None: - return None - request_url, symbol_id = symbol_info.url.rsplit('#') - - soup = await self._get_soup_from_url(request_url) - symbol_heading = soup.find(id=symbol_id) - search_html = str(soup) - - if symbol_heading is None: - return None - - if symbol_info.group == "module": - parsed_module = self.parse_module_symbol(symbol_heading) - if parsed_module is None: - return [], "" - else: - signatures, description = parsed_module - - else: - signatures, description = self.parse_symbol(symbol_heading, search_html) - - return signatures, description.replace('¶', '') - - @async_cache(arg_offset=1) - async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: - """ - Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. - - If the symbol is known, an Embed with documentation about it is returned. - """ - scraped_html = await self.get_symbol_html(symbol) - if scraped_html is None: - return None - - symbol_obj = self.doc_symbols[symbol] - self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") - signatures = scraped_html[0] - permalink = symbol_obj.url - description = markdownify(scraped_html[1], url=permalink) - - # Truncate the description of the embed to the last occurrence - # of a double newline (interpreted as a paragraph) before index 1000. - if len(description) > 1000: - shortened = description[:1000] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". 
", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = 1000 - description = description[:description_cutoff] - - # If there is an incomplete code block, cut it out - if description.count("```") % 2: - codeblock_start = description.rfind('```py') - description = description[:codeblock_start].rstrip() - description += f"... [read more]({permalink})" - - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) - if signatures is None: - # If symbol is a module, don't show signature. - embed_description = description - - elif not signatures: - # It's some "meta-page", for example: - # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views - embed_description = "This appears to be a generic page not tied to a specific symbol." - - else: - embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) - embed_description += f"\n{description}" - - embed = discord.Embed( - title=discord.utils.escape_markdown(symbol), - url=permalink, - description=embed_description - ) - # Show all symbols with the same name that were renamed in the footer. - embed.set_footer( - text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) - ) - return embed - - @classmethod - def parse_module_symbol(cls, heading: PageElement) -> Optional[Tuple[None, str]]: - """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" - start_tag = heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return None - - description = cls.find_all_children_until_tag(start_tag, cls._match_end_tag) - if description is None: - return None - - return None, description - - @classmethod - def parse_symbol(cls, heading: PageElement, html: str) -> Tuple[List[str], str]: - """ - Parse the signatures and description of a symbol. 
- - Collects up to 3 signatures from dt tags and a description from their sibling dd tag. - """ - signatures = [] - description_element = heading.find_next_sibling("dd") - description_pos = html.find(str(description_element)) - description = cls.find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) - - for element in ( - *reversed(heading.find_previous_siblings("dt", limit=2)), - heading, - *heading.find_next_siblings("dt", limit=2), - )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - - if signature and html.find(str(element)) < description_pos: - signatures.append(signature) - - return signatures, description - - @staticmethod - def find_all_children_until_tag( - start_element: PageElement, - tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] - ) -> Optional[str]: - """ - Get all direct children until a child matching `tag_filter` is found. - - `tag_filter` can be either a tuple of string names to check against, - or a filtering callable that's applied to the tags. 
- """ - text = "" - - for element in start_element.find_next().find_next_siblings(): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: - break - elif tag_filter(element): - break - text += str(element) - - return text - - @async_cache(arg_offset=1) - async def _get_soup_from_url(self, url: str) -> BeautifulSoup: - """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" - log.trace(f"Sending a request to {url}.") - async with self.bot.http_session.get(url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') - soup.find("head").decompose() # the head contains no useful data so we can remove it - return soup - - @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: - """Lookup documentation for Python symbols.""" - await ctx.invoke(self.get_command, symbol=symbol) - - @docs_group.command(name='getdoc', aliases=('g',)) - async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: - """ - Return a documentation embed for a given symbol. - - If no symbol is given, return a list of all available inventories. - - Examples: - !docs - !docs aiohttp - !docs aiohttp.ClientSession - !docs getdoc aiohttp.ClientSession - """ - if not symbol: - inventory_embed = discord.Embed( - title=f"All inventories (`{len(self.base_urls)}` total)", - colour=discord.Colour.blue() - ) - - lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) - if self.base_urls: - await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) - - else: - inventory_embed.description = "Hmmm, seems like there's nothing here yet." 
- await ctx.send(embed=inventory_embed) - - else: - symbol = symbol.strip("`") - # Fetching documentation for a symbol (at least for the first time, since - # caching is used) takes quite some time, so let's send typing to indicate - # that we got the command, but are still working on it. - async with ctx.typing(): - doc_embed = await self.get_symbol_embed(symbol) - - if doc_embed is None: - symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) - error_embed = discord.Embed( - description=f"Sorry, I could not find any documentation for `{(symbol)}`.", - colour=discord.Colour.red() - ) - error_message = await ctx.send(embed=error_embed) - await wait_for_deletion( - error_message, - (ctx.author.id,), - timeout=NOT_FOUND_DELETE_DELAY, - client=self.bot - ) - with suppress(discord.NotFound): - await ctx.message.delete() - with suppress(discord.NotFound): - await error_message.delete() - else: - await ctx.send(embed=doc_embed) - - @docs_group.command(name='setdoc', aliases=('s',)) - @with_role(*MODERATION_ROLES) - async def set_command( - self, ctx: commands.Context, package_name: PackageName, - base_url: ValidURL, inventory_url: InventoryURL - ) -> None: - """ - Adds a new documentation metadata object to the site's database. - - The database will update the object, should an existing item with the specified `package_name` already exist. 
- - Example: - !docs setdoc \ - python \ - https://docs.python.org/3/ \ - https://docs.python.org/3/objects.inv - """ - body = { - 'package': package_name, - 'base_url': base_url, - 'inventory_url': inventory_url - } - await self.bot.api_client.post('bot/documentation-links', json=body) - - log.info( - f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" - f"Package name: {package_name}\n" - f"Base url: {base_url}\n" - f"Inventory URL: {inventory_url}" - ) - - await self.update_single(package_name, base_url, inventory_url) - await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") - - @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) - @with_role(*MODERATION_ROLES) - async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: - """ - Removes the specified package from the database. - - Examples: - !docs deletedoc aiohttp - """ - await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') - - async with ctx.typing(): - # Rebuild the inventory to ensure that everything - # that was from this package is properly deleted. 
- await self.refresh_inventory() - await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") - - @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) - @with_role(*MODERATION_ROLES) - async def refresh_command(self, ctx: commands.Context) -> None: - """Refresh inventories and send differences to channel.""" - old_inventories = set(self.base_urls) - with ctx.typing(): - await self.refresh_inventory() - new_inventories = set(self.base_urls) - - if added := ", ".join(new_inventories - old_inventories): - added = "+ " + added - - if removed := ", ".join(old_inventories - new_inventories): - removed = "- " + removed - - embed = discord.Embed( - title="Inventories refreshed", - description=f"```diff\n{added}\n{removed}```" if added or removed else "" - ) - await ctx.send(embed=embed) - - async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: - """Get and return inventory from `inventory_url`. If fetching fails, return None.""" - fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) - for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): - try: - package = await self.bot.loop.run_in_executor(None, fetch_func) - except ConnectTimeout: - log.error( - f"Fetching of inventory {inventory_url} timed out," - f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except ProtocolError: - log.error( - f"Connection lost while fetching inventory {inventory_url}," - f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except HTTPError as e: - log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") - return None - except ConnectionError: - log.error(f"Couldn't establish connection to inventory {inventory_url}.") - return None - else: - return package - log.error(f"Fetching of inventory {inventory_url} failed.") - return None - - @staticmethod - def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True - - return tag.name == "table" - - -def setup(bot: Bot) -> None: - """Load the Doc cog.""" - bot.add_cog(Doc(bot)) diff --git a/bot/cogs/doc/__init__.py b/bot/cogs/doc/__init__.py new file mode 100644 index 000000000..19a71ee66 --- /dev/null +++ b/bot/cogs/doc/__init__.py @@ -0,0 +1,7 @@ +from bot.bot import Bot +from .cog import DocCog + + +def setup(bot: Bot) -> None: + """Load the Doc cog.""" + bot.add_cog(DocCog(bot)) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py new file mode 100644 index 000000000..463e4ebc6 --- /dev/null +++ b/bot/cogs/doc/cog.py @@ -0,0 +1,598 @@ +import asyncio +import functools +import logging +import re +import sys +import textwrap +from collections import OrderedDict +from contextlib import suppress +from types import SimpleNamespace +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Union +from urllib.parse import urljoin + +import discord +from bs4 import BeautifulSoup +from bs4.element import PageElement, Tag +from discord.ext import commands +from markdownify import MarkdownConverter +from requests import ConnectTimeout, ConnectionError, HTTPError +from sphinx.ext import intersphinx +from urllib3.exceptions import ProtocolError + +from bot.bot import Bot +from bot.constants import MODERATION_ROLES, RedirectOutput +from bot.converters import PackageName, ValidURL +from 
bot.decorators import with_role +from bot.pagination import LinePaginator +from bot.utils.messages import wait_for_deletion + + +log = logging.getLogger(__name__) +logging.getLogger('urllib3').setLevel(logging.WARNING) + +# Since Intersphinx is intended to be used with Sphinx, +# we need to mock its configuration. +SPHINX_MOCK_APP = SimpleNamespace( + config=SimpleNamespace( + intersphinx_timeout=3, + tls_verify=True, + user_agent="python3:python-discord/bot:1.0.0" + ) +) + +NO_OVERRIDE_GROUPS = ( + "2to3fixer", + "token", + "label", + "pdbcommand", + "term", +) +NO_OVERRIDE_PACKAGES = ( + "python", +) + +SEARCH_END_TAG_ATTRS = ( + "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) +UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") + +FAILED_REQUEST_RETRY_AMOUNT = 3 +NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay + + +class DocItem(NamedTuple): + """Holds inventory symbol information.""" + + package: str + url: str + group: str + + +def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: + """ + LRU cache implementation for coroutines. + + Once the cache exceeds the maximum size, keys are deleted in FIFO order. + + An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. + """ + # Assign the cache to the function itself so we can clear it from outside. 
+ async_cache.cache = OrderedDict() + + def decorator(function: Callable) -> Callable: + """Define the async_cache decorator.""" + @functools.wraps(function) + async def wrapper(*args) -> Any: + """Decorator wrapper for the caching logic.""" + key = ':'.join(args[arg_offset:]) + + value = async_cache.cache.get(key) + if value is None: + if len(async_cache.cache) > max_size: + async_cache.cache.popitem(last=False) + + async_cache.cache[key] = await function(*args) + return async_cache.cache[key] + return wrapper + return decorator + + +class DocMarkdownConverter(MarkdownConverter): + """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + + def convert_code(self, el: PageElement, text: str) -> str: + """Undo `markdownify`s underscore escaping.""" + return f"`{text}`".replace('\\', '') + + def convert_pre(self, el: PageElement, text: str) -> str: + """Wrap any codeblocks in `py` for syntax highlighting.""" + code = ''.join(el.strings) + return f"```py\n{code}```" + + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) + + +def markdownify(html: str, *, url: str = "") -> str: + """Create a DocMarkdownConverter object from the input html.""" + return DocMarkdownConverter(bullets='•', page_url=url).convert(html) + + +class InventoryURL(commands.Converter): + """ + Represents an Intersphinx inventory URL. + + This converter checks whether intersphinx accepts the given inventory URL, and raises + `BadArgument` if that is not the case. 
+ + Otherwise, it simply passes through the given URL. + """ + + @staticmethod + async def convert(ctx: commands.Context, url: str) -> str: + """Convert url to Intersphinx inventory URL.""" + await ctx.trigger_typing() + try: + intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) + except AttributeError: + raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") + except ConnectionError: + if url.startswith('https'): + raise commands.BadArgument( + f"Cannot establish a connection to `{url}`. Does it support HTTPS?" + ) + raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") + except ValueError: + raise commands.BadArgument( + f"Failed to read Intersphinx inventory from URL `{url}`. " + "Are you sure that it's a valid inventory file?" + ) + return url + + +class DocCog(commands.Cog): + """A set of commands for querying & displaying documentation.""" + + def __init__(self, bot: Bot): + self.base_urls = {} + self.bot = bot + self.doc_symbols: Dict[str, DocItem] = {} + self.renamed_symbols = set() + + self.bot.loop.create_task(self.init_refresh_inventory()) + + async def init_refresh_inventory(self) -> None: + """Refresh documentation inventory on cog initialization.""" + await self.bot.wait_until_guild_available() + await self.refresh_inventory() + + async def update_single( + self, api_package_name: str, base_url: str, inventory_url: str + ) -> None: + """ + Rebuild the inventory for a single package. 
+ + Where: + * `package_name` is the package name to use, appears in the log + * `base_url` is the root documentation URL for the specified package, used to build + absolute paths that link to specific symbols + * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running + `intersphinx.fetch_inventory` in an executor on the bot's event loop + """ + self.base_urls[api_package_name] = base_url + + package = await self._fetch_inventory(inventory_url) + if not package: + return None + + for group, value in package.items(): + for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): + if "/" in symbol: + continue # skip unreachable symbols with slashes + absolute_doc_url = base_url + relative_doc_url + # Intern the group names since they're reused in all the DocItems + # to remove unnecessary memory consumption from them being unique objects + group_name = sys.intern(group.split(":")[1]) + + if symbol in self.doc_symbols: + symbol_base_url = self.doc_symbols[symbol].url.split("/", 3)[2] + if ( + group_name in NO_OVERRIDE_GROUPS + or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) + ): + symbol = f"{group_name}.{symbol}" + + elif (overridden_symbol_group := self.doc_symbols[symbol].group) in NO_OVERRIDE_GROUPS: + overridden_symbol = f"{overridden_symbol_group}.{symbol}" + if overridden_symbol in self.renamed_symbols: + overridden_symbol = f"{api_package_name}.{overridden_symbol}" + + self.doc_symbols[overridden_symbol] = self.doc_symbols[symbol] + self.renamed_symbols.add(overridden_symbol) + + # If renamed `symbol` already exists, add library name in front to differentiate between them. + if symbol in self.renamed_symbols: + # Split `package_name` because of packages like Pillow that have spaces in them. 
+ symbol = f"{api_package_name}.{symbol}" + self.renamed_symbols.add(symbol) + + self.doc_symbols[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) + + log.trace(f"Fetched inventory for {api_package_name}.") + + async def refresh_inventory(self) -> None: + """Refresh internal documentation inventory.""" + log.debug("Refreshing documentation inventory...") + + # Clear the old base URLS and doc symbols to ensure + # that we start from a fresh local dataset. + # Also, reset the cache used for fetching documentation. + self.base_urls.clear() + self.doc_symbols.clear() + self.renamed_symbols.clear() + async_cache.cache = OrderedDict() + + # Run all coroutines concurrently - since each of them performs a HTTP + # request, this speeds up fetching the inventory data heavily. + coros = [ + self.update_single( + package["package"], package["base_url"], package["inventory_url"] + ) for package in await self.bot.api_client.get('bot/documentation-links') + ] + await asyncio.gather(*coros) + + async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: + """ + Given a Python symbol, return its signature and description. + + The first tuple element is the signature of the given symbol as a markup-free string, and + the second tuple element is the description of the given symbol with HTML markup included. + + If the given symbol is a module, returns a tuple `(None, str)` + else if the symbol could not be found, returns `None`. 
+ """ + symbol_info = self.doc_symbols.get(symbol) + if symbol_info is None: + return None + request_url, symbol_id = symbol_info.url.rsplit('#') + + soup = await self._get_soup_from_url(request_url) + symbol_heading = soup.find(id=symbol_id) + search_html = str(soup) + + if symbol_heading is None: + return None + + if symbol_info.group == "module": + parsed_module = self.parse_module_symbol(symbol_heading) + if parsed_module is None: + return [], "" + else: + signatures, description = parsed_module + + else: + signatures, description = self.parse_symbol(symbol_heading, search_html) + + return signatures, description.replace('¶', '') + + @async_cache(arg_offset=1) + async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: + """ + Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. + + If the symbol is known, an Embed with documentation about it is returned. + """ + scraped_html = await self.get_symbol_html(symbol) + if scraped_html is None: + return None + + symbol_obj = self.doc_symbols[symbol] + self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") + signatures = scraped_html[0] + permalink = symbol_obj.url + description = markdownify(scraped_html[1], url=permalink) + + # Truncate the description of the embed to the last occurrence + # of a double newline (interpreted as a paragraph) before index 1000. + if len(description) > 1000: + shortened = description[:1000] + description_cutoff = shortened.rfind('\n\n', 100) + if description_cutoff == -1: + # Search the shortened version for cutoff points in decreasing desirability, + # cutoff at 1000 if none are found. + for string in (". 
", ", ", ",", " "): + description_cutoff = shortened.rfind(string) + if description_cutoff != -1: + break + else: + description_cutoff = 1000 + description = description[:description_cutoff] + + # If there is an incomplete code block, cut it out + if description.count("```") % 2: + codeblock_start = description.rfind('```py') + description = description[:codeblock_start].rstrip() + description += f"... [read more]({permalink})" + + description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + if signatures is None: + # If symbol is a module, don't show signature. + embed_description = description + + elif not signatures: + # It's some "meta-page", for example: + # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views + embed_description = "This appears to be a generic page not tied to a specific symbol." + + else: + embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) + embed_description += f"\n{description}" + + embed = discord.Embed( + title=discord.utils.escape_markdown(symbol), + url=permalink, + description=embed_description + ) + # Show all symbols with the same name that were renamed in the footer. + embed.set_footer( + text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) + ) + return embed + + @classmethod + def parse_module_symbol(cls, heading: PageElement) -> Optional[Tuple[None, str]]: + """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" + start_tag = heading.find("a", attrs={"class": "headerlink"}) + if start_tag is None: + return None + + description = cls.find_all_children_until_tag(start_tag, cls._match_end_tag) + if description is None: + return None + + return None, description + + @classmethod + def parse_symbol(cls, heading: PageElement, html: str) -> Tuple[List[str], str]: + """ + Parse the signatures and description of a symbol. 
+ + Collects up to 3 signatures from dt tags and a description from their sibling dd tag. + """ + signatures = [] + description_element = heading.find_next_sibling("dd") + description_pos = html.find(str(description_element)) + description = cls.find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) + + for element in ( + *reversed(heading.find_previous_siblings("dt", limit=2)), + heading, + *heading.find_next_siblings("dt", limit=2), + )[-3:]: + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature and html.find(str(element)) < description_pos: + signatures.append(signature) + + return signatures, description + + @staticmethod + def find_all_children_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] + ) -> Optional[str]: + """ + Get all direct children until a child matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to the tags. 
+ """ + text = "" + + for element in start_element.find_next().find_next_siblings(): + if isinstance(tag_filter, tuple): + if element.name in tag_filter: + break + elif tag_filter(element): + break + text += str(element) + + return text + + @async_cache(arg_offset=1) + async def _get_soup_from_url(self, url: str) -> BeautifulSoup: + """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" + log.trace(f"Sending a request to {url}.") + async with self.bot.http_session.get(url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') + soup.find("head").decompose() # the head contains no useful data so we can remove it + return soup + + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) + async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """Lookup documentation for Python symbols.""" + await ctx.invoke(self.get_command, symbol=symbol) + + @docs_group.command(name='getdoc', aliases=('g',)) + async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """ + Return a documentation embed for a given symbol. + + If no symbol is given, return a list of all available inventories. + + Examples: + !docs + !docs aiohttp + !docs aiohttp.ClientSession + !docs getdoc aiohttp.ClientSession + """ + if not symbol: + inventory_embed = discord.Embed( + title=f"All inventories (`{len(self.base_urls)}` total)", + colour=discord.Colour.blue() + ) + + lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) + if self.base_urls: + await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) + + else: + inventory_embed.description = "Hmmm, seems like there's nothing here yet." 
+ await ctx.send(embed=inventory_embed) + + else: + symbol = symbol.strip("`") + # Fetching documentation for a symbol (at least for the first time, since + # caching is used) takes quite some time, so let's send typing to indicate + # that we got the command, but are still working on it. + async with ctx.typing(): + doc_embed = await self.get_symbol_embed(symbol) + + if doc_embed is None: + symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) + error_embed = discord.Embed( + description=f"Sorry, I could not find any documentation for `{(symbol)}`.", + colour=discord.Colour.red() + ) + error_message = await ctx.send(embed=error_embed) + await wait_for_deletion( + error_message, + (ctx.author.id,), + timeout=NOT_FOUND_DELETE_DELAY, + client=self.bot + ) + with suppress(discord.NotFound): + await ctx.message.delete() + with suppress(discord.NotFound): + await error_message.delete() + else: + await ctx.send(embed=doc_embed) + + @docs_group.command(name='setdoc', aliases=('s',)) + @with_role(*MODERATION_ROLES) + async def set_command( + self, ctx: commands.Context, package_name: PackageName, + base_url: ValidURL, inventory_url: InventoryURL + ) -> None: + """ + Adds a new documentation metadata object to the site's database. + + The database will update the object, should an existing item with the specified `package_name` already exist. 
+ + Example: + !docs setdoc \ + python \ + https://docs.python.org/3/ \ + https://docs.python.org/3/objects.inv + """ + body = { + 'package': package_name, + 'base_url': base_url, + 'inventory_url': inventory_url + } + await self.bot.api_client.post('bot/documentation-links', json=body) + + log.info( + f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" + f"Package name: {package_name}\n" + f"Base url: {base_url}\n" + f"Inventory URL: {inventory_url}" + ) + + await self.update_single(package_name, base_url, inventory_url) + await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") + + @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) + @with_role(*MODERATION_ROLES) + async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: + """ + Removes the specified package from the database. + + Examples: + !docs deletedoc aiohttp + """ + await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + + async with ctx.typing(): + # Rebuild the inventory to ensure that everything + # that was from this package is properly deleted. 
+ await self.refresh_inventory() + await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") + + @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) + @with_role(*MODERATION_ROLES) + async def refresh_command(self, ctx: commands.Context) -> None: + """Refresh inventories and send differences to channel.""" + old_inventories = set(self.base_urls) + with ctx.typing(): + await self.refresh_inventory() + new_inventories = set(self.base_urls) + + if added := ", ".join(new_inventories - old_inventories): + added = "+ " + added + + if removed := ", ".join(old_inventories - new_inventories): + removed = "- " + removed + + embed = discord.Embed( + title="Inventories refreshed", + description=f"```diff\n{added}\n{removed}```" if added or removed else "" + ) + await ctx.send(embed=embed) + + async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: + """Get and return inventory from `inventory_url`. If fetching fails, return None.""" + fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) + for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): + try: + package = await self.bot.loop.run_in_executor(None, fetch_func) + except ConnectTimeout: + log.error( + f"Fetching of inventory {inventory_url} timed out," + f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" + ) + except ProtocolError: + log.error( + f"Connection lost while fetching inventory {inventory_url}," + f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" + ) + except HTTPError as e: + log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") + return None + except ConnectionError: + log.error(f"Couldn't establish connection to inventory {inventory_url}.") + return None + else: + return package + log.error(f"Fetching of inventory {inventory_url} failed.") + return None + + @staticmethod + def _match_end_tag(tag: Tag) -> bool: + """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" + for attr in SEARCH_END_TAG_ATTRS: + if attr in tag.get("class", ()): + return True + + return tag.name == "table" -- cgit v1.2.3 From c3bda11a10e3706d7e457f727e57e6a92f604d1e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 18 Jul 2020 16:16:49 +0200 Subject: Move async_cache into a separate module --- bot/cogs/doc/cache.py | 32 ++++++++++++++++++++++++++++++++ bot/cogs/doc/cog.py | 33 ++------------------------------- 2 files changed, 34 insertions(+), 31 deletions(-) create mode 100644 bot/cogs/doc/cache.py diff --git a/bot/cogs/doc/cache.py b/bot/cogs/doc/cache.py new file mode 100644 index 000000000..9da2a1dab --- /dev/null +++ b/bot/cogs/doc/cache.py @@ -0,0 +1,32 @@ +import functools +from collections import OrderedDict +from typing import Any, Callable + + +def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: + """ + LRU cache implementation for coroutines. + + Once the cache exceeds the maximum size, keys are deleted in FIFO order. + + An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. + """ + # Assign the cache to the function itself so we can clear it from outside. 
+ async_cache.cache = OrderedDict() + + def decorator(function: Callable) -> Callable: + """Define the async_cache decorator.""" + @functools.wraps(function) + async def wrapper(*args) -> Any: + """Decorator wrapper for the caching logic.""" + key = ':'.join(args[arg_offset:]) + + value = async_cache.cache.get(key) + if value is None: + if len(async_cache.cache) > max_size: + async_cache.cache.popitem(last=False) + + async_cache.cache[key] = await function(*args) + return async_cache.cache[key] + return wrapper + return decorator diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 463e4ebc6..2627951e8 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -7,7 +7,7 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union from urllib.parse import urljoin import discord @@ -25,7 +25,7 @@ from bot.converters import PackageName, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion - +from .cache import async_cache log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -76,35 +76,6 @@ class DocItem(NamedTuple): group: str -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: - """ - LRU cache implementation for coroutines. - - Once the cache exceeds the maximum size, keys are deleted in FIFO order. - - An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. - """ - # Assign the cache to the function itself so we can clear it from outside. 
- async_cache.cache = OrderedDict() - - def decorator(function: Callable) -> Callable: - """Define the async_cache decorator.""" - @functools.wraps(function) - async def wrapper(*args) -> Any: - """Decorator wrapper for the caching logic.""" - key = ':'.join(args[arg_offset:]) - - value = async_cache.cache.get(key) - if value is None: - if len(async_cache.cache) > max_size: - async_cache.cache.popitem(last=False) - - async_cache.cache[key] = await function(*args) - return async_cache.cache[key] - return wrapper - return decorator - - class DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" -- cgit v1.2.3 From 53213ec69208370342498cdc417f3c90d35b8f3e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 18 Jul 2020 16:37:19 +0200 Subject: Move main parsing methods into a new module --- bot/cogs/doc/cog.py | 102 +++---------------------------------------------- bot/cogs/doc/parser.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 96 deletions(-) create mode 100644 bot/cogs/doc/parser.py diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 2627951e8..4a275c7c6 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -7,12 +7,11 @@ import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import Dict, NamedTuple, Optional, Tuple from urllib.parse import urljoin import discord -from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag +from bs4.element import PageElement from discord.ext import commands from markdownify import MarkdownConverter from requests import ConnectTimeout, ConnectionError, HTTPError @@ -26,6 +25,7 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import 
wait_for_deletion from .cache import async_cache +from .parser import get_soup_from_url, parse_module_symbol, parse_symbol log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -51,19 +51,7 @@ NO_OVERRIDE_PACKAGES = ( "python", ) -SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") - FAILED_REQUEST_RETRY_AMOUNT = 3 NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay @@ -248,7 +236,7 @@ class DocCog(commands.Cog): return None request_url, symbol_id = symbol_info.url.rsplit('#') - soup = await self._get_soup_from_url(request_url) + soup = await get_soup_from_url(self.bot.http_session, request_url) symbol_heading = soup.find(id=symbol_id) search_html = str(soup) @@ -256,14 +244,14 @@ class DocCog(commands.Cog): return None if symbol_info.group == "module": - parsed_module = self.parse_module_symbol(symbol_heading) + parsed_module = parse_module_symbol(symbol_heading) if parsed_module is None: return [], "" else: signatures, description = parsed_module else: - signatures, description = self.parse_symbol(symbol_heading, search_html) + signatures, description = parse_symbol(symbol_heading, search_html) return signatures, description.replace('¶', '') @@ -331,75 +319,6 @@ class DocCog(commands.Cog): ) return embed - @classmethod - def parse_module_symbol(cls, heading: PageElement) -> Optional[Tuple[None, str]]: - """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" - start_tag = heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return None - - description = cls.find_all_children_until_tag(start_tag, cls._match_end_tag) - if description is None: - return None - - return None, description - - @classmethod - def parse_symbol(cls, heading: PageElement, 
html: str) -> Tuple[List[str], str]: - """ - Parse the signatures and description of a symbol. - - Collects up to 3 signatures from dt tags and a description from their sibling dd tag. - """ - signatures = [] - description_element = heading.find_next_sibling("dd") - description_pos = html.find(str(description_element)) - description = cls.find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) - - for element in ( - *reversed(heading.find_previous_siblings("dt", limit=2)), - heading, - *heading.find_next_siblings("dt", limit=2), - )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - - if signature and html.find(str(element)) < description_pos: - signatures.append(signature) - - return signatures, description - - @staticmethod - def find_all_children_until_tag( - start_element: PageElement, - tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] - ) -> Optional[str]: - """ - Get all direct children until a child matching `tag_filter` is found. - - `tag_filter` can be either a tuple of string names to check against, - or a filtering callable that's applied to the tags. 
- """ - text = "" - - for element in start_element.find_next().find_next_siblings(): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: - break - elif tag_filter(element): - break - text += str(element) - - return text - - @async_cache(arg_offset=1) - async def _get_soup_from_url(self, url: str) -> BeautifulSoup: - """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" - log.trace(f"Sending a request to {url}.") - async with self.bot.http_session.get(url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') - soup.find("head").decompose() # the head contains no useful data so we can remove it - return soup - @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: """Lookup documentation for Python symbols.""" @@ -558,12 +477,3 @@ class DocCog(commands.Cog): return package log.error(f"Fetching of inventory {inventory_url} failed.") return None - - @staticmethod - def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True - - return tag.name == "table" diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py new file mode 100644 index 000000000..67621591b --- /dev/null +++ b/bot/cogs/doc/parser.py @@ -0,0 +1,102 @@ +import logging +import re +from typing import Callable, List, Optional, Tuple, Union + +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from bs4.element import PageElement, Tag + +from .cache import async_cache + +log = logging.getLogger(__name__) + +UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +SEARCH_END_TAG_ATTRS = ( + "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + + +def parse_module_symbol(heading: 
PageElement) -> Optional[Tuple[None, str]]: + """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" + start_tag = heading.find("a", attrs={"class": "headerlink"}) + if start_tag is None: + return None + + description = find_all_children_until_tag(start_tag, _match_end_tag) + if description is None: + return None + + return None, description + + +def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: + """ + Parse the signatures and description of a symbol. + + Collects up to 3 signatures from dt tags and a description from their sibling dd tag. + """ + signatures = [] + description_element = heading.find_next_sibling("dd") + description_pos = html.find(str(description_element)) + description = find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) + + for element in ( + *reversed(heading.find_previous_siblings("dt", limit=2)), + heading, + *heading.find_next_siblings("dt", limit=2), + )[-3:]: + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature and html.find(str(element)) < description_pos: + signatures.append(signature) + + return signatures, description + + +def find_all_children_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] +) -> Optional[str]: + """ + Get all direct children until a child matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to the tags. 
+ """ + text = "" + + for element in start_element.find_next().find_next_siblings(): + if isinstance(tag_filter, tuple): + if element.name in tag_filter: + break + elif tag_filter(element): + break + text += str(element) + + return text + + +@async_cache(arg_offset=1) +async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: + """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" + log.trace(f"Sending a request to {url}.") + async with http_session.get(url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') + soup.find("head").decompose() # the head contains no useful data so we can remove it + return soup + + +def _match_end_tag(tag: Tag) -> bool: + """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" + for attr in SEARCH_END_TAG_ATTRS: + if attr in tag.get("class", ()): + return True + + return tag.name == "table" -- cgit v1.2.3 From eb8361d7fa9d0eb0dd5982c6df0fd35b80d40ba6 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 19 Jul 2020 03:13:02 +0200 Subject: Move markdown truncation into parser module --- bot/cogs/doc/cog.py | 27 ++------------------------- bot/cogs/doc/parser.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 4a275c7c6..bd4e9d4d1 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -25,7 +25,7 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion from .cache import async_cache -from .parser import get_soup_from_url, parse_module_symbol, parse_symbol +from .parser import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -270,30 +270,7 @@ class DocCog(commands.Cog): 
self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") signatures = scraped_html[0] permalink = symbol_obj.url - description = markdownify(scraped_html[1], url=permalink) - - # Truncate the description of the embed to the last occurrence - # of a double newline (interpreted as a paragraph) before index 1000. - if len(description) > 1000: - shortened = description[:1000] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = 1000 - description = description[:description_cutoff] - - # If there is an incomplete code block, cut it out - if description.count("```") % 2: - codeblock_start = description.rfind('```py') - description = description[:codeblock_start].rstrip() - description += f"... [read more]({permalink})" - + description = truncate_markdown(markdownify(scraped_html[1], url=permalink), permalink, 1000) description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is None: # If symbol is a module, don't show signature. diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py index 67621591b..010826a96 100644 --- a/bot/cogs/doc/parser.py +++ b/bot/cogs/doc/parser.py @@ -83,6 +83,35 @@ def find_all_children_until_tag( return text +def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: + """ + Truncate `markdown` to be at most `max_length` characters. + + The markdown string is searched for substrings to cut at, to keep its structure, + but if none are found the string is simply sliced. 
+ """ + if len(markdown) > max_length: + shortened = markdown[:max_length] + description_cutoff = shortened.rfind('\n\n', 100) + if description_cutoff == -1: + # Search the shortened version for cutoff points in decreasing desirability, + # cutoff at 1000 if none are found. + for string in (". ", ", ", ",", " "): + description_cutoff = shortened.rfind(string) + if description_cutoff != -1: + break + else: + description_cutoff = max_length + markdown = markdown[:description_cutoff] + + # If there is an incomplete code block, cut it out + if markdown.count("```") % 2: + codeblock_start = markdown.rfind('```py') + markdown = markdown[:codeblock_start].rstrip() + markdown += f"... [read more]({permalink})" + return markdown + + @async_cache(arg_offset=1) async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" -- cgit v1.2.3 From 0f8b991fffce8b808bf25f1ad9ed710bb1ff4919 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 02:24:19 +0200 Subject: Rename parser.py to parsing.py. Parser is a stdlib module name, a rename avoids shadowing it. 
--- bot/cogs/doc/cog.py | 2 +- bot/cogs/doc/parser.py | 131 ------------------------------------------------ bot/cogs/doc/parsing.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 132 deletions(-) delete mode 100644 bot/cogs/doc/parser.py create mode 100644 bot/cogs/doc/parsing.py diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index bd4e9d4d1..4e4f3b737 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -25,7 +25,7 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion from .cache import async_cache -from .parser import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown +from .parsing import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py deleted file mode 100644 index 010826a96..000000000 --- a/bot/cogs/doc/parser.py +++ /dev/null @@ -1,131 +0,0 @@ -import logging -import re -from typing import Callable, List, Optional, Tuple, Union - -from aiohttp import ClientSession -from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag - -from .cache import async_cache - -log = logging.getLogger(__name__) - -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) - - -def parse_module_symbol(heading: PageElement) -> Optional[Tuple[None, str]]: - """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" - start_tag = heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return None - - description = find_all_children_until_tag(start_tag, _match_end_tag) - if description is None: - return None - - return None, description - 
- -def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: - """ - Parse the signatures and description of a symbol. - - Collects up to 3 signatures from dt tags and a description from their sibling dd tag. - """ - signatures = [] - description_element = heading.find_next_sibling("dd") - description_pos = html.find(str(description_element)) - description = find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) - - for element in ( - *reversed(heading.find_previous_siblings("dt", limit=2)), - heading, - *heading.find_next_siblings("dt", limit=2), - )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - - if signature and html.find(str(element)) < description_pos: - signatures.append(signature) - - return signatures, description - - -def find_all_children_until_tag( - start_element: PageElement, - tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] -) -> Optional[str]: - """ - Get all direct children until a child matching `tag_filter` is found. - - `tag_filter` can be either a tuple of string names to check against, - or a filtering callable that's applied to the tags. - """ - text = "" - - for element in start_element.find_next().find_next_siblings(): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: - break - elif tag_filter(element): - break - text += str(element) - - return text - - -def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: - """ - Truncate `markdown` to be at most `max_length` characters. - - The markdown string is searched for substrings to cut at, to keep its structure, - but if none are found the string is simply sliced. - """ - if len(markdown) > max_length: - shortened = markdown[:max_length] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". 
", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = max_length - markdown = markdown[:description_cutoff] - - # If there is an incomplete code block, cut it out - if markdown.count("```") % 2: - codeblock_start = markdown.rfind('```py') - markdown = markdown[:codeblock_start].rstrip() - markdown += f"... [read more]({permalink})" - return markdown - - -@async_cache(arg_offset=1) -async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: - """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" - log.trace(f"Sending a request to {url}.") - async with http_session.get(url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') - soup.find("head").decompose() # the head contains no useful data so we can remove it - return soup - - -def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True - - return tag.name == "table" diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py new file mode 100644 index 000000000..010826a96 --- /dev/null +++ b/bot/cogs/doc/parsing.py @@ -0,0 +1,131 @@ +import logging +import re +from typing import Callable, List, Optional, Tuple, Union + +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from bs4.element import PageElement, Tag + +from .cache import async_cache + +log = logging.getLogger(__name__) + +UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +SEARCH_END_TAG_ATTRS = ( + "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + + +def parse_module_symbol(heading: PageElement) -> Optional[Tuple[None, str]]: + """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" + 
start_tag = heading.find("a", attrs={"class": "headerlink"}) + if start_tag is None: + return None + + description = find_all_children_until_tag(start_tag, _match_end_tag) + if description is None: + return None + + return None, description + + +def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: + """ + Parse the signatures and description of a symbol. + + Collects up to 3 signatures from dt tags and a description from their sibling dd tag. + """ + signatures = [] + description_element = heading.find_next_sibling("dd") + description_pos = html.find(str(description_element)) + description = find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) + + for element in ( + *reversed(heading.find_previous_siblings("dt", limit=2)), + heading, + *heading.find_next_siblings("dt", limit=2), + )[-3:]: + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature and html.find(str(element)) < description_pos: + signatures.append(signature) + + return signatures, description + + +def find_all_children_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] +) -> Optional[str]: + """ + Get all direct children until a child matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to the tags. + """ + text = "" + + for element in start_element.find_next().find_next_siblings(): + if isinstance(tag_filter, tuple): + if element.name in tag_filter: + break + elif tag_filter(element): + break + text += str(element) + + return text + + +def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: + """ + Truncate `markdown` to be at most `max_length` characters. + + The markdown string is searched for substrings to cut at, to keep its structure, + but if none are found the string is simply sliced. 
+ """ + if len(markdown) > max_length: + shortened = markdown[:max_length] + description_cutoff = shortened.rfind('\n\n', 100) + if description_cutoff == -1: + # Search the shortened version for cutoff points in decreasing desirability, + # cutoff at 1000 if none are found. + for string in (". ", ", ", ",", " "): + description_cutoff = shortened.rfind(string) + if description_cutoff != -1: + break + else: + description_cutoff = max_length + markdown = markdown[:description_cutoff] + + # If there is an incomplete code block, cut it out + if markdown.count("```") % 2: + codeblock_start = markdown.rfind('```py') + markdown = markdown[:codeblock_start].rstrip() + markdown += f"... [read more]({permalink})" + return markdown + + +@async_cache(arg_offset=1) +async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: + """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" + log.trace(f"Sending a request to {url}.") + async with http_session.get(url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') + soup.find("head").decompose() # the head contains no useful data so we can remove it + return soup + + +def _match_end_tag(tag: Tag) -> bool: + """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" + for attr in SEARCH_END_TAG_ATTRS: + if attr in tag.get("class", ()): + return True + + return tag.name == "table" -- cgit v1.2.3 From 4560f0f89b52cfcb8b18abeb1efa707c334a86d4 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 02:28:25 +0200 Subject: Remove permalink from truncated markdown. The permalink serves no functional purpose in the embed, as it is already included in the title. But it does add the complexity of passing in the url to the parser. 
--- bot/cogs/doc/cog.py | 2 +- bot/cogs/doc/parsing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 4e4f3b737..36fbe9010 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -270,7 +270,7 @@ class DocCog(commands.Cog): self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") signatures = scraped_html[0] permalink = symbol_obj.url - description = truncate_markdown(markdownify(scraped_html[1], url=permalink), permalink, 1000) + description = truncate_markdown(markdownify(scraped_html[1], url=permalink), 1000) description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is None: # If symbol is a module, don't show signature. diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 010826a96..3b79e0a93 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -83,7 +83,7 @@ def find_all_children_until_tag( return text -def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: +def truncate_markdown(markdown: str, max_length: int) -> str: """ Truncate `markdown` to be at most `max_length` characters. @@ -108,7 +108,7 @@ def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: if markdown.count("```") % 2: codeblock_start = markdown.rfind('```py') markdown = markdown[:codeblock_start].rstrip() - markdown += f"... [read more]({permalink})" + markdown += "... read more" return markdown -- cgit v1.2.3 From cecd2c8e320a2a0ff0095cd1fa197552d43c6684 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 02:31:56 +0200 Subject: Simplify cutoff text. "read more" seemed out of place with no permalink over it. 
--- bot/cogs/doc/parsing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 3b79e0a93..994124e92 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -1,5 +1,6 @@ import logging import re +import string from typing import Callable, List, Optional, Tuple, Union from aiohttp import ClientSession @@ -96,8 +97,8 @@ def truncate_markdown(markdown: str, max_length: int) -> str: if description_cutoff == -1: # Search the shortened version for cutoff points in decreasing desirability, # cutoff at 1000 if none are found. - for string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(string) + for cutoff_string in (". ", ", ", ",", " "): + description_cutoff = shortened.rfind(cutoff_string) if description_cutoff != -1: break else: @@ -108,7 +109,7 @@ def truncate_markdown(markdown: str, max_length: int) -> str: if markdown.count("```") % 2: codeblock_start = markdown.rfind('```py') markdown = markdown[:codeblock_start].rstrip() - markdown += "... read more" + markdown = markdown.rstrip(string.punctuation) + "..." return markdown -- cgit v1.2.3 From 2b24579b49ced873e05e375051bbbb4ec2855b12 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 03:55:31 +0200 Subject: Add function for finding tags until a matching tag This will allow flexibility in the future when collecting tags for the description and signature of symbols. The base is a function which accepts a callable which is called and iterated over, but 3 names with a partial function that has the callable supplied are provided to keep the outside interface neater. 
--- bot/cogs/doc/parsing.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 994124e92..5e5a5be66 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -1,6 +1,7 @@ import logging import re import string +from functools import partial from typing import Callable, List, Optional, Tuple, Union from aiohttp import ClientSession @@ -24,6 +25,40 @@ SEARCH_END_TAG_ATTRS = ( ) +def find_elements_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], + *, + func: Callable, + limit: int = None, +) -> List[str]: + """ + Get all tags until a tag matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering t.Callable that's applied to the tags. + + `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. + That method is then iterated over and all tags until the matching tag are added to the return list as strings. 
+ """ + elements = [] + + for element in func(start_element, limit=limit): + if isinstance(tag_filter, tuple): + if element.name in tag_filter: + break + elif tag_filter(element): + break + elements.append(str(element)) + + return elements + + +find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + def parse_module_symbol(heading: PageElement) -> Optional[Tuple[None, str]]: """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" start_tag = heading.find("a", attrs={"class": "headerlink"}) -- cgit v1.2.3 From 9f78dbafc3bc532bbfb5ffa0ef110fdeb0c3e8a5 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 03:57:27 +0200 Subject: Simplify module parsing method. Instead of returning None and multiple values, the method now only returns the string of the description. Previously the parsing returned None and quit when appropriate tags for shortening the description were not found, but the new implementation simply defaults to the provided start tag if a better alternative is not found. 
--- bot/cogs/doc/parsing.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 5e5a5be66..368feeb68 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -59,17 +59,18 @@ find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSo find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def parse_module_symbol(heading: PageElement) -> Optional[Tuple[None, str]]: - """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.""" - start_tag = heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return None +def get_module_description(start_element: PageElement) -> Optional[str]: + """ + Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - description = find_all_children_until_tag(start_tag, _match_end_tag) - if description is None: - return None + A headerlink a tag is attempted to be found to skip repeating the module name in the description, + if it's found it's used as the tag to search from instead of the `start_element`. + """ + header = start_element.find("a", attrs={"class": "headerlink"}) + start_tag = header.parent if header is not None else start_element + description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag)) - return None, description + return description def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: -- cgit v1.2.3 From 082867253cd19c70516102a3d4972da6d501ff6f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 20 Jul 2020 17:35:07 +0200 Subject: Create a function for collecting signatures. 
By getting the signatures without the description we get more flexibility of parsing different symbol groups and decouple the logic from the description which can be parsed directly with the new `find_elements_until_tag` based function. --- bot/cogs/doc/parsing.py | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 368feeb68..5b60f1609 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -73,51 +73,25 @@ def get_module_description(start_element: PageElement) -> Optional[str]: return description -def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]: +def get_signatures(start_signature: PageElement) -> List[str]: """ - Parse the signatures and description of a symbol. + Collect up to 3 signatures from dt tags around the `start_signature` dt tag. - Collects up to 3 signatures from dt tags and a description from their sibling dd tag. + First the signatures under the `start_signature` are included; + if less than 2 are found, tags above the start signature are added to the result if any are present. 
""" signatures = [] - description_element = heading.find_next_sibling("dd") - description_pos = html.find(str(description_element)) - description = find_all_children_until_tag(description_element, tag_filter=("dt", "dl")) - for element in ( - *reversed(heading.find_previous_siblings("dt", limit=2)), - heading, - *heading.find_next_siblings("dt", limit=2), + *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + start_signature, + *find_next_siblings_until_tag(start_signature, ("dd",), limit=2), )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element) - if signature and html.find(str(element)) < description_pos: + if signature: signatures.append(signature) - return signatures, description - - -def find_all_children_until_tag( - start_element: PageElement, - tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]] -) -> Optional[str]: - """ - Get all direct children until a child matching `tag_filter` is found. - - `tag_filter` can be either a tuple of string names to check against, - or a filtering callable that's applied to the tags. - """ - text = "" - - for element in start_element.find_next().find_next_siblings(): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: - break - elif tag_filter(element): - break - text += str(element) - - return text + return signatures def truncate_markdown(markdown: str, max_length: int) -> str: -- cgit v1.2.3 From caedfb0c16bc98eb94d723caff42dfe0799f8f17 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 01:38:00 +0200 Subject: Remove conversion to str when finding elements. The tags need to be processed down the line, which is not viable on strings. 
--- bot/cogs/doc/parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 5b60f1609..acf3a0804 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -31,7 +31,7 @@ def find_elements_until_tag( *, func: Callable, limit: int = None, -) -> List[str]: +) -> List[Tag]: """ Get all tags until a tag matching `tag_filter` is found. @@ -49,7 +49,7 @@ def find_elements_until_tag( break elif tag_filter(element): break - elements.append(str(element)) + elements.append(element) return elements -- cgit v1.2.3 From 1c997846f282f76d17700f0f16c0a0abb5c49a30 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 01:39:43 +0200 Subject: Fix handling of elements when fetching signatures. After the change to `find_elements_until_tag`, the text contentsneed to be extracted from the tags instead of passing them directly to re. --- bot/cogs/doc/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index acf3a0804..725fe47cd 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -86,7 +86,7 @@ def get_signatures(start_signature: PageElement) -> List[str]: start_signature, *find_next_siblings_until_tag(start_signature, ("dd",), limit=2), )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element) + signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature: signatures.append(signature) -- cgit v1.2.3 From e10def8a3d79dffd8cc53acd6b30fa43741d140c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 02:03:31 +0200 Subject: Move DocMarkdownConverter to parsing. 
--- bot/cogs/doc/cog.py | 34 ---------------------------------- bot/cogs/doc/parsing.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 36fbe9010..a7dcd9020 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -64,40 +64,6 @@ class DocItem(NamedTuple): group: str -class DocMarkdownConverter(MarkdownConverter): - """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - - def __init__(self, *, page_url: str, **options): - super().__init__(**options) - self.page_url = page_url - - def convert_code(self, el: PageElement, text: str) -> str: - """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') - - def convert_pre(self, el: PageElement, text: str) -> str: - """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) - return f"```py\n{code}```" - - def convert_a(self, el: PageElement, text: str) -> str: - """Resolve relative URLs to `self.page_url`.""" - el["href"] = urljoin(self.page_url, el["href"]) - return super().convert_a(el, text) - - def convert_p(self, el: PageElement, text: str) -> str: - """Include only one newline instead of two when the parent is a li tag.""" - parent = el.parent - if parent is not None and parent.name == "li": - return f"{text}\n" - return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•', page_url=url).convert(html) - - class InventoryURL(commands.Converter): """ Represents an Intersphinx inventory URL. 
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 725fe47cd..8f6688bd2 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -25,6 +25,40 @@ SEARCH_END_TAG_ATTRS = ( ) +class DocMarkdownConverter(MarkdownConverter): + """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + + def convert_code(self, el: PageElement, text: str) -> str: + """Undo `markdownify`s underscore escaping.""" + return f"`{text}`".replace('\\', '') + + def convert_pre(self, el: PageElement, text: str) -> str: + """Wrap any codeblocks in `py` for syntax highlighting.""" + code = ''.join(el.strings) + return f"```py\n{code}```" + + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) + + +def markdownify(html: str, *, url: str = "") -> str: + """Create a DocMarkdownConverter object from the input html.""" + return DocMarkdownConverter(bullets='•', page_url=url).convert(html) + + def find_elements_until_tag( start_element: PageElement, tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], -- cgit v1.2.3 From 6795a7f05e3720f375a9195182b996a14d754ea0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 02:06:50 +0200 Subject: Fix ordered list indices in markdown converter. markdownify relies on the parent tag's index method, which goes through all of its contents, if there is anything else in the contents apart from the li tags, those indices are then shifted. 
--- bot/cogs/doc/parsing.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 8f6688bd2..25001b83d 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -3,10 +3,12 @@ import re import string from functools import partial from typing import Callable, List, Optional, Tuple, Union +from urllib.parse import urljoin from aiohttp import ClientSession from bs4 import BeautifulSoup from bs4.element import PageElement, Tag +from markdownify import MarkdownConverter from .cache import async_cache @@ -32,6 +34,22 @@ class DocMarkdownConverter(MarkdownConverter): super().__init__(**options) self.page_url = page_url + def convert_li(self, el: PageElement, text: str) -> str: + """Fix markdownify's erroneous indexing in ol tags.""" + parent = el.parent + if parent is not None and parent.name == 'ol': + li_tags = parent.find_all("li") + bullet = '%s.' % (li_tags.index(el)+1) + else: + depth = -1 + while el: + if el.name == 'ul': + depth += 1 + el = el.parent + bullets = self.options['bullets'] + bullet = bullets[depth % len(bullets)] + return '%s %s\n' % (bullet, text or '') + def convert_code(self, el: PageElement, text: str) -> str: """Undo `markdownify`s underscore escaping.""" return f"`{text}`".replace('\\', '') -- cgit v1.2.3 From 4e9ffb210f6a8f0184ac97cb16703777cc1e0ca0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 02:34:11 +0200 Subject: Create a function for getting the result markdown. 
--- bot/cogs/doc/parsing.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 25001b83d..8756e0694 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -1,6 +1,7 @@ import logging import re import string +import textwrap from functools import partial from typing import Callable, List, Optional, Tuple, Union from urllib.parse import urljoin @@ -15,6 +16,8 @@ from .cache import async_cache log = logging.getLogger(__name__) UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") + SEARCH_END_TAG_ATTRS = ( "data", "function", @@ -175,6 +178,24 @@ def truncate_markdown(markdown: str, max_length: int) -> str: return markdown +def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: + """ + Create a markdown string with the signatures at the top, and the converted html description below them. + + The signatures are wrapped in python codeblocks, separated from the description by a newline. + The result string is truncated to be max 1000 symbols long. + """ + description = truncate_markdown(markdownify(description, url=url), 1000) + description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + if signatures is not None: + formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) + else: + formatted_markdown = "" + formatted_markdown += f"\n{description}" + + return formatted_markdown + + @async_cache(arg_offset=1) async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" -- cgit v1.2.3 From f562c4b4551caa8ed3710ac5e9841150cb8a2492 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 02:35:13 +0200 Subject: Create the parsing interface function. 
Other functions from the module are not intended to be used directly, with the interface of it being the added function which accepts the symbol and calls internals. All other names except imports and log had the underscore prefix added to accommodate this. --- bot/cogs/doc/parsing.py | 92 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 21 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 8756e0694..a2c6564b3 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -3,7 +3,7 @@ import re import string import textwrap from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union from urllib.parse import urljoin from aiohttp import ClientSession @@ -12,13 +12,15 @@ from bs4.element import PageElement, Tag from markdownify import MarkdownConverter from .cache import async_cache +if TYPE_CHECKING: + from .cog import DocItem log = logging.getLogger(__name__) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") -SEARCH_END_TAG_ATTRS = ( +_SEARCH_END_TAG_ATTRS = ( "data", "function", "class", @@ -29,8 +31,17 @@ SEARCH_END_TAG_ATTRS = ( "sphinxsidebar", ) +_NO_SIGNATURE_GROUPS = { + "attribute", + "envvar", + "setting", + "tempaltefilter", + "templatetag", + "term", +} -class DocMarkdownConverter(MarkdownConverter): + +class _DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" def __init__(self, *, page_url: str, **options): @@ -75,12 +86,12 @@ class DocMarkdownConverter(MarkdownConverter): return super().convert_p(el, text) -def markdownify(html: str, *, url: str = "") -> str: +def _markdownify(html: str, *, url: str = "") 
-> str: """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•', page_url=url).convert(html) + return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) -def find_elements_until_tag( +def _find_elements_until_tag( start_element: PageElement, tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], *, @@ -109,9 +120,9 @@ def find_elements_until_tag( return elements -find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) -find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings) -find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) def get_module_description(start_element: PageElement) -> Optional[str]: @@ -123,12 +134,19 @@ def get_module_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag)) + description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag)) return description -def get_signatures(start_signature: PageElement) -> List[str]: +def _get_symbol_description(symbol: PageElement) -> str: + """Get the string contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl")) + return "".join(str(tag) for tag in 
description_contents) + + +def _get_signatures(start_signature: PageElement) -> List[str]: """ Collect up to 3 signatures from dt tags around the `start_signature` dt tag. @@ -137,11 +155,11 @@ def get_signatures(start_signature: PageElement) -> List[str]: """ signatures = [] for element in ( - *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), start_signature, - *find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature: signatures.append(signature) @@ -149,7 +167,7 @@ def get_signatures(start_signature: PageElement) -> List[str]: return signatures -def truncate_markdown(markdown: str, max_length: int) -> str: +def _truncate_markdown(markdown: str, max_length: int) -> str: """ Truncate `markdown` to be at most `max_length` characters. @@ -185,8 +203,8 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. 
""" - description = truncate_markdown(markdownify(description, url=url), 1000) - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + description = _truncate_markdown(_markdownify(description, url=url), 1000) + description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) else: @@ -197,7 +215,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: @async_cache(arg_offset=1) -async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: +async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" log.trace(f"Sending a request to {url}.") async with http_session.get(url) as response: @@ -208,8 +226,40 @@ async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulS def _match_end_tag(tag: Tag) -> bool: """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: + for attr in _SEARCH_END_TAG_ATTRS: if attr in tag.get("class", ()): return True return tag.name == "table" + + +async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str: + """ + Return parsed markdown of the passed symbol, truncated to 1000 characters. + + A request through `http_session` is made to the url associated with `symbol_data` for the html contents; + the contents are then parsed depending on what group the symbol belongs to. 
+ """ + if "#" in symbol_data.url: + request_url, symbol_id = symbol_data.url.rsplit('#') + else: + request_url = symbol_data.url + symbol_id = None + + soup = await _get_soup_from_url(http_session, request_url) + symbol_heading = soup.find(id=symbol_id) + + # Handle doc symbols as modules, because they either link to the page of a module, + # or don't contain any useful info to be parsed. + signature = None + if symbol_data.group in {"module", "doc"}: + description = get_module_description(symbol_heading) + + elif symbol_data.group in _NO_SIGNATURE_GROUPS: + description = _get_symbol_description(symbol_heading) + + else: + signature = _get_signatures(symbol_heading) + description = _get_symbol_description(symbol_heading) + + return _parse_into_markdown(signature, description, symbol_data.url) -- cgit v1.2.3 From 6f4731714aa9df086ec287f768556a4c4443b635 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 02:50:49 +0200 Subject: Change DocCog to use the new parsing module fully. The parsing module provides an interface for fetching the markdown from the symbol data provided to it. Because it's now fully done in an another module we can remove the needless parts from the cog. 
--- bot/cogs/doc/cog.py | 69 ++++++----------------------------------------------- 1 file changed, 7 insertions(+), 62 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index a7dcd9020..6cd066f1b 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -3,17 +3,13 @@ import functools import logging import re import sys -import textwrap from collections import OrderedDict from contextlib import suppress from types import SimpleNamespace -from typing import Dict, NamedTuple, Optional, Tuple -from urllib.parse import urljoin +from typing import Dict, NamedTuple, Optional import discord -from bs4.element import PageElement from discord.ext import commands -from markdownify import MarkdownConverter from requests import ConnectTimeout, ConnectionError, HTTPError from sphinx.ext import intersphinx from urllib3.exceptions import ProtocolError @@ -25,7 +21,7 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion from .cache import async_cache -from .parsing import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown +from .parsing import get_symbol_markdown log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -187,40 +183,6 @@ class DocCog(commands.Cog): ] await asyncio.gather(*coros) - async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: - """ - Given a Python symbol, return its signature and description. - - The first tuple element is the signature of the given symbol as a markup-free string, and - the second tuple element is the description of the given symbol with HTML markup included. - - If the given symbol is a module, returns a tuple `(None, str)` - else if the symbol could not be found, returns `None`. 
- """ - symbol_info = self.doc_symbols.get(symbol) - if symbol_info is None: - return None - request_url, symbol_id = symbol_info.url.rsplit('#') - - soup = await get_soup_from_url(self.bot.http_session, request_url) - symbol_heading = soup.find(id=symbol_id) - search_html = str(soup) - - if symbol_heading is None: - return None - - if symbol_info.group == "module": - parsed_module = parse_module_symbol(symbol_heading) - if parsed_module is None: - return [], "" - else: - signatures, description = parsed_module - - else: - signatures, description = parse_symbol(symbol_heading, search_html) - - return signatures, description.replace('¶', '') - @async_cache(arg_offset=1) async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: """ @@ -228,32 +190,15 @@ class DocCog(commands.Cog): If the symbol is known, an Embed with documentation about it is returned. """ - scraped_html = await self.get_symbol_html(symbol) - if scraped_html is None: + symbol_info = self.doc_symbols.get(symbol) + if symbol_info is None: return None - - symbol_obj = self.doc_symbols[symbol] - self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") - signatures = scraped_html[0] - permalink = symbol_obj.url - description = truncate_markdown(markdownify(scraped_html[1], url=permalink), 1000) - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) - if signatures is None: - # If symbol is a module, don't show signature. - embed_description = description - - elif not signatures: - # It's some "meta-page", for example: - # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views - embed_description = "This appears to be a generic page not tied to a specific symbol." 
- - else: - embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) - embed_description += f"\n{description}" + self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info) embed = discord.Embed( title=discord.utils.escape_markdown(symbol), - url=permalink, + url=symbol_info.url, description=embed_description ) # Show all symbols with the same name that were renamed in the footer. -- cgit v1.2.3 From e875142a0f937ab190208523ef17068e5988dca3 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 14:25:47 +0200 Subject: Remove caching from get_symbol_embed. The web request is already cached, and parsing doesn't much more time, but without moving the logic around the cache prevents the stat increase when a symbol is requested. --- bot/cogs/doc/cog.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 6cd066f1b..05cedcaaf 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -183,7 +183,6 @@ class DocCog(commands.Cog): ] await asyncio.gather(*coros) - @async_cache(arg_offset=1) async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: """ Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. -- cgit v1.2.3 From 6731de62e3a3f5d188e73538a718d2b30cc2f442 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 14:28:07 +0200 Subject: Hold url parts in DocItem separately. This allows us to save up some memory by not creating unique strings with the base url repeated between them. 
--- bot/cogs/doc/cog.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 05cedcaaf..bd27dde01 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -55,10 +55,16 @@ NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay class DocItem(NamedTuple): """Holds inventory symbol information.""" + base_url: str + relative_url: str package: str - url: str group: str + @property + def url(self) -> str: + """Return the absolute url to the symbol.""" + return self.base_url + self.relative_url + class InventoryURL(commands.Converter): """ @@ -131,7 +137,6 @@ class DocCog(commands.Cog): for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): if "/" in symbol: continue # skip unreachable symbols with slashes - absolute_doc_url = base_url + relative_doc_url # Intern the group names since they're reused in all the DocItems # to remove unnecessary memory consumption from them being unique objects group_name = sys.intern(group.split(":")[1]) @@ -158,7 +163,7 @@ class DocCog(commands.Cog): symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - self.doc_symbols[symbol] = DocItem(api_package_name, absolute_doc_url, group_name) + self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name) log.trace(f"Fetched inventory for {api_package_name}.") -- cgit v1.2.3 From 6ca72a68a75a1e5f56cb6a6ebec5a5b533c77eff Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 14:52:04 +0200 Subject: Remove paragraph chars from descriptions --- bot/cogs/doc/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index a2c6564b3..79f3bbf69 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -262,4 +262,4 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem signature = 
_get_signatures(symbol_heading) description = _get_symbol_description(symbol_heading) - return _parse_into_markdown(signature, description, symbol_data.url) + return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) -- cgit v1.2.3 From 9f4d602bfa02fce088aaed28ee598c116b655683 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 22 Jul 2020 16:20:48 +0200 Subject: Change ValidPythonIdentifier tests to PackageName. --- tests/bot/test_converters.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index ca8cb6825..a3c071168 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -10,9 +10,9 @@ from bot.converters import ( Duration, HushDurationConverter, ISODateTime, + PackageName, TagContentConverter, TagNameConverter, - ValidPythonIdentifier, ) @@ -78,24 +78,23 @@ class ConverterTests(unittest.TestCase): with self.assertRaises(BadArgument, msg=exception_message): asyncio.run(TagNameConverter.convert(self.context, invalid_name)) - def test_valid_python_identifier_for_valid(self): - """ValidPythonIdentifier returns valid identifiers unchanged.""" - test_values = ('foo', 'lemon') + def test_package_name_for_valid(self): + """PackageName returns valid package names unchanged.""" + test_values = ('foo', 'le_mon') for name in test_values: with self.subTest(identifier=name): - conversion = asyncio.run(ValidPythonIdentifier.convert(self.context, name)) + conversion = asyncio.run(PackageName.convert(self.context, name)) self.assertEqual(name, conversion) - def test_valid_python_identifier_for_invalid(self): - """ValidPythonIdentifier raises the proper exception for invalid identifiers.""" - test_values = ('nested.stuff', '#####') + def test_package_name_for_invalid(self): + """PackageName raises the proper exception for invalid package names.""" + test_values = ('text_with_a_dot.', 
'UpperCaseName', "num83r") for name in test_values: with self.subTest(identifier=name): - exception_message = f'`{name}` is not a valid Python identifier' - with self.assertRaises(BadArgument, msg=exception_message): - asyncio.run(ValidPythonIdentifier.convert(self.context, name)) + with self.assertRaises(BadArgument): + asyncio.run(PackageName.convert(self.context, name)) def test_duration_converter_for_valid(self): """Duration returns the correct `datetime` for valid duration strings.""" -- cgit v1.2.3 From 7e367ce4a5df3fbd768c6dce1acc39e786a376ea Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 25 Jul 2020 03:13:20 +0200 Subject: Ensure all renamed symbols are kept After the restructure behaviour change in d790c404ca3dba3843f351d6f42e766956aa73a1, the add to renamed_symbols was not readded and symbols that only passed the first check were being missed. --- bot/cogs/doc/cog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index bd27dde01..e52ee95c1 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -148,6 +148,7 @@ class DocCog(commands.Cog): or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) ): symbol = f"{group_name}.{symbol}" + self.renamed_symbols.add(symbol) elif (overridden_symbol_group := self.doc_symbols[symbol].group) in NO_OVERRIDE_GROUPS: overridden_symbol = f"{overridden_symbol_group}.{symbol}" @@ -158,7 +159,7 @@ class DocCog(commands.Cog): self.renamed_symbols.add(overridden_symbol) # If renamed `symbol` already exists, add library name in front to differentiate between them. - if symbol in self.renamed_symbols: + elif symbol in self.renamed_symbols: # Split `package_name` because of packages like Pillow that have spaces in them. 
symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) -- cgit v1.2.3 From 2cc7ec9e26b013b2967841372898f1f8954d8f8f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 26 Jul 2020 15:06:35 +0200 Subject: Parse NavigableStrings in symbol descriptions. When a symbol, such as [term.numpy](https://matplotlib.org/3.1.1/glossary/index.html#term-numpy) had NavigableStrings as direct children, they were not included as bs4's SoupStrainer won't include both strings and tags in its filters. The implementation goes around the limitation by introducing a new optional flag, bypassing the default check which skips matching tags when the `text` argument is present. --- bot/cogs/doc/html.py | 33 +++++++++++++++++++++++++++++++++ bot/cogs/doc/parsing.py | 36 ++++++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 bot/cogs/doc/html.py diff --git a/bot/cogs/doc/html.py b/bot/cogs/doc/html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/cogs/doc/html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): + """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + + def __init__(self, *, include_strings: bool, **kwargs): + self.include_strings = include_strings + super().__init__(**kwargs) + + markup_hint = Union[PageElement, List["markup_hint"]] + + def search(self, markup: markup_hint) -> Union[PageElement, str]: + """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" + if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) and self.search(element): + return element + elif isinstance(markup, Tag): + # Also include tags while we're searching for 
strings and tags. + if self.include_strings or (not self.text or self.name or self.attrs): + return self.search_tag(markup) + + elif isinstance(markup, str): + # Let everything through the text filter if we're including strings and tags. + text_filter = None if not self.include_strings else True + if not self.name and not self.attrs and self._matches(markup, text_filter): + return markup + else: + raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 79f3bbf69..050c49447 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -8,10 +8,11 @@ from urllib.parse import urljoin from aiohttp import ClientSession from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag +from bs4.element import NavigableString, PageElement, Tag from markdownify import MarkdownConverter from .cache import async_cache +from .html import Strainer if TYPE_CHECKING: from .cog import DocItem @@ -96,25 +97,30 @@ def _find_elements_until_tag( tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], *, func: Callable, + include_strings: bool = False, limit: int = None, -) -> List[Tag]: +) -> List[Union[Tag, NavigableString]]: """ - Get all tags until a tag matching `tag_filter` is found. + Get all elements up to `limit` or until a tag matching `tag_filter` is found. `tag_filter` can be either a tuple of string names to check against, - or a filtering t.Callable that's applied to the tags. + or a filtering callable that's applied to tags. + + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. - That method is then iterated over and all tags until the matching tag are added to the return list as strings. 
+ The method is then iterated over and all elements until the matching tag or the limit are added to the return list. """ + use_tuple_filter = isinstance(tag_filter, tuple) elements = [] - for element in func(start_element, limit=limit): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_tuple_filter: + if element.name in tag_filter: + break + elif tag_filter(element): break - elif tag_filter(element): - break elements.append(element) return elements @@ -125,7 +131,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def get_module_description(start_element: PageElement) -> Optional[str]: +def _get_module_description(start_element: PageElement) -> Optional[str]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. 
@@ -134,7 +140,9 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
     """
     header = start_element.find("a", attrs={"class": "headerlink"})
     start_tag = header.parent if header is not None else start_element
-    description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag))
+    description = "".join(
+        str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
+    )
 
     return description
 
@@ -142,7 +150,7 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
 def _get_symbol_description(symbol: PageElement) -> str:
     """Get the string contents of the next dd tag, up to a dt or a dl tag."""
     description_tag = symbol.find_next("dd")
-    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"))
+    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
 
     return "".join(str(tag) for tag in description_contents)
 
@@ -253,7 +261,7 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem
     # or don't contain any useful info to be parsed.
     signature = None
     if symbol_data.group in {"module", "doc"}:
-        description = get_module_description(symbol_heading)
+        description = _get_module_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
         description = _get_symbol_description(symbol_heading)
-- 
cgit v1.2.3


From 6ea6f732e719f93f88588f1d6c435262261e2650 Mon Sep 17 00:00:00 2001
From: Numerlor <25886452+Numerlor@users.noreply.github.com>
Date: Sun, 26 Jul 2020 15:09:53 +0200
Subject: Fix markdownify's handling of h tags.

Discord only allows `**` for bolding while the markdown from the default
MarkdownConverter tries to use `#` repeated *n* times for h*n* tags
for different font weights.
--- bot/cogs/doc/parsing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 050c49447..ac8a94e3f 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -65,6 +65,10 @@ class _DocMarkdownConverter(MarkdownConverter): bullet = bullets[depth % len(bullets)] return '%s %s\n' % (bullet, text or '') + def convert_hn(self, _n: int, el: PageElement, text: str) -> str: + """Convert h tags to bold text with ** instead of adding #.""" + return f"**{text}**\n\n" + def convert_code(self, el: PageElement, text: str) -> str: """Undo `markdownify`s underscore escaping.""" return f"`{text}`".replace('\\', '') -- cgit v1.2.3 From 13030b8c54dd2ed37047349c5b09e4ded2c83391 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 26 Jul 2020 15:11:45 +0200 Subject: Move MarkdownConverter subclass to separate module --- bot/cogs/doc/markdown.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++ bot/cogs/doc/parsing.py | 59 ++---------------------------------------------- 2 files changed, 60 insertions(+), 57 deletions(-) create mode 100644 bot/cogs/doc/markdown.py diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py new file mode 100644 index 000000000..dca477d35 --- /dev/null +++ b/bot/cogs/doc/markdown.py @@ -0,0 +1,58 @@ +from urllib.parse import urljoin + +from bs4.element import PageElement +from markdownify import MarkdownConverter + + +class _DocMarkdownConverter(MarkdownConverter): + """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + + def convert_li(self, el: PageElement, text: str) -> str: + """Fix markdownify's erroneous indexing in ol tags.""" + parent = el.parent + if parent is not None and parent.name == 'ol': + li_tags = parent.find_all("li") + bullet = '%s.' 
% (li_tags.index(el)+1) + else: + depth = -1 + while el: + if el.name == 'ul': + depth += 1 + el = el.parent + bullets = self.options['bullets'] + bullet = bullets[depth % len(bullets)] + return '%s %s\n' % (bullet, text or '') + + def convert_hn(self, _n: int, el: PageElement, text: str) -> str: + """Convert h tags to bold text with ** instead of adding #.""" + return f"**{text}**\n\n" + + def convert_code(self, el: PageElement, text: str) -> str: + """Undo `markdownify`s underscore escaping.""" + return f"`{text}`".replace('\\', '') + + def convert_pre(self, el: PageElement, text: str) -> str: + """Wrap any codeblocks in `py` for syntax highlighting.""" + code = ''.join(el.strings) + return f"```py\n{code}```" + + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) + + +def markdownify(html: str, *, url: str = "") -> str: + """Create a DocMarkdownConverter object from the input html.""" + return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index ac8a94e3f..93daf3faf 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -4,15 +4,14 @@ import string import textwrap from functools import partial from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union -from urllib.parse import urljoin from aiohttp import ClientSession from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag -from markdownify import MarkdownConverter from .cache import async_cache from .html import Strainer +from .markdown import markdownify if TYPE_CHECKING: 
from .cog import DocItem @@ -42,60 +41,6 @@ _NO_SIGNATURE_GROUPS = { } -class _DocMarkdownConverter(MarkdownConverter): - """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - - def __init__(self, *, page_url: str, **options): - super().__init__(**options) - self.page_url = page_url - - def convert_li(self, el: PageElement, text: str) -> str: - """Fix markdownify's erroneous indexing in ol tags.""" - parent = el.parent - if parent is not None and parent.name == 'ol': - li_tags = parent.find_all("li") - bullet = '%s.' % (li_tags.index(el)+1) - else: - depth = -1 - while el: - if el.name == 'ul': - depth += 1 - el = el.parent - bullets = self.options['bullets'] - bullet = bullets[depth % len(bullets)] - return '%s %s\n' % (bullet, text or '') - - def convert_hn(self, _n: int, el: PageElement, text: str) -> str: - """Convert h tags to bold text with ** instead of adding #.""" - return f"**{text}**\n\n" - - def convert_code(self, el: PageElement, text: str) -> str: - """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') - - def convert_pre(self, el: PageElement, text: str) -> str: - """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) - return f"```py\n{code}```" - - def convert_a(self, el: PageElement, text: str) -> str: - """Resolve relative URLs to `self.page_url`.""" - el["href"] = urljoin(self.page_url, el["href"]) - return super().convert_a(el, text) - - def convert_p(self, el: PageElement, text: str) -> str: - """Include only one newline instead of two when the parent is a li tag.""" - parent = el.parent - if parent is not None and parent.name == "li": - return f"{text}\n" - return super().convert_p(el, text) - - -def _markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) - - def _find_elements_until_tag( start_element: PageElement, 
tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], @@ -215,7 +160,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. """ - description = _truncate_markdown(_markdownify(description, url=url), 1000) + description = _truncate_markdown(markdownify(description, url=url), 1000) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) -- cgit v1.2.3 From 994b828254cc8e40a52cf604910d5aa3eba2293d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 26 Jul 2020 15:21:40 +0200 Subject: Add more logging --- bot/cogs/doc/parsing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 93daf3faf..2ea21ed98 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -197,6 +197,7 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem A request through `http_session` is made to the url associated with `symbol_data` for the html contents; the contents are then parsed depending on what group the symbol belongs to. """ + log.trace(f"Parsing symbol from url {symbol_data.url}.") if "#" in symbol_data.url: request_url, symbol_id = symbol_data.url.rsplit('#') else: @@ -210,12 +211,15 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem # or don't contain any useful info to be parsed. 
signature = None if symbol_data.group in {"module", "doc"}: + log.trace("Symbol is a module or doc, parsing as module.") description = _get_module_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: + log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.") description = _get_symbol_description(symbol_heading) else: + log.trace("Parsing both signature and description of symbol.") signature = _get_signatures(symbol_heading) description = _get_symbol_description(symbol_heading) -- cgit v1.2.3 From 83989d28fb83801acdea4b6f51cf48e974e21891 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 26 Jul 2020 15:29:09 +0200 Subject: Rename description functions to be more general --- bot/cogs/doc/parsing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 2ea21ed98..96bb1dfb4 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -80,14 +80,14 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_module_description(start_element: PageElement) -> Optional[str]: +def _get_general_description(start_element: PageElement) -> Optional[str]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - A headerlink a tag is attempted to be found to skip repeating the module name in the description, - if it's found it's used as the tag to search from instead of the `start_element`. + A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, + if it's found it's used as the tag to start the search from instead of the `start_element`. 
""" - header = start_element.find("a", attrs={"class": "headerlink"}) + header = start_element.find_next("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element description = "".join( str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) @@ -96,7 +96,7 @@ def _get_module_description(start_element: PageElement) -> Optional[str]: return description -def _get_symbol_description(symbol: PageElement) -> str: +def _get_dd_description(symbol: PageElement) -> str: """Get the string contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) @@ -212,15 +212,15 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem signature = None if symbol_data.group in {"module", "doc"}: log.trace("Symbol is a module or doc, parsing as module.") - description = _get_module_description(symbol_heading) + description = _get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.") - description = _get_symbol_description(symbol_heading) + description = _get_dd_description(symbol_heading) else: log.trace("Parsing both signature and description of symbol.") signature = _get_signatures(symbol_heading) - description = _get_symbol_description(symbol_heading) + description = _get_dd_description(symbol_heading) return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) -- cgit v1.2.3 From 5290fcf0fff23e4979746c51b77be9a51fe82ae7 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 26 Jul 2020 15:51:34 +0200 Subject: Properly parse labels add fallback for non dt tags Labels point to tags that aren't in description lists, like modules or doc symbols 
which we already handle. If by chance we get a symbol that we don't have in the group for general parsing and which isn't a dt tag, log it and don't attempt to parse signature and use general description parsing instead of parsing a dd tag. --- bot/cogs/doc/parsing.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 96bb1dfb4..1271953d4 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -206,12 +206,20 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem soup = await _get_soup_from_url(http_session, request_url) symbol_heading = soup.find(id=symbol_id) - - # Handle doc symbols as modules, because they either link to the page of a module, - # or don't contain any useful info to be parsed. signature = None - if symbol_data.group in {"module", "doc"}: - log.trace("Symbol is a module or doc, parsing as module.") + # Modules, doc pages and labels don't point to description list tags but to tags like divs, + # no special parsing can be done so we only try to include what's under them. + if symbol_data.group in {"module", "doc", "label"}: + log.trace("Symbol is a module, doc or a label; using general description parsing.") + description = _get_general_description(symbol_heading) + + elif symbol_heading.name != "dt": + # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, + # log info the tag can be looked at. + log.info( + f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it," + f"handling as general description." 
+ ) description = _get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: -- cgit v1.2.3 From ddb3c230cc7e1b38dbb57be10b1684c4ecb2ac7b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 16 Sep 2020 00:14:58 +0200 Subject: Remove old comment --- bot/cogs/doc/cog.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index e52ee95c1..2f4c99252 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -160,7 +160,6 @@ class DocCog(commands.Cog): # If renamed `symbol` already exists, add library name in front to differentiate between them. elif symbol in self.renamed_symbols: - # Split `package_name` because of packages like Pillow that have spaces in them. symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) -- cgit v1.2.3 From cb89cbaa36102c111c0204eb7c8bc27cecc1d4cd Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 16 Sep 2020 00:18:51 +0200 Subject: Don't return fragment in DocItem url The fragment is only needed for the user and required sparingly returning only the url while keeping the fragment behind symbol_id simplifies the uses of the url without it. 
--- bot/cogs/doc/cog.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 2f4c99252..2e49fcd38 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -55,15 +55,16 @@ NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay class DocItem(NamedTuple): """Holds inventory symbol information.""" - base_url: str - relative_url: str package: str group: str + base_url: str + relative_url_path: str + symbol_id: str @property def url(self) -> str: """Return the absolute url to the symbol.""" - return self.base_url + self.relative_url + return "".join((self.base_url, self.relative_url_path)) class InventoryURL(commands.Converter): @@ -141,21 +142,20 @@ class DocCog(commands.Cog): # to remove unnecessary memory consumption from them being unique objects group_name = sys.intern(group.split(":")[1]) - if symbol in self.doc_symbols: - symbol_base_url = self.doc_symbols[symbol].url.split("/", 3)[2] + if (original_symbol := self.doc_symbols.get(symbol)) is not None: if ( group_name in NO_OVERRIDE_GROUPS - or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) + or any(package == original_symbol.package for package in NO_OVERRIDE_PACKAGES) ): symbol = f"{group_name}.{symbol}" self.renamed_symbols.add(symbol) - elif (overridden_symbol_group := self.doc_symbols[symbol].group) in NO_OVERRIDE_GROUPS: + elif (overridden_symbol_group := original_symbol.group) in NO_OVERRIDE_GROUPS: overridden_symbol = f"{overridden_symbol_group}.{symbol}" if overridden_symbol in self.renamed_symbols: overridden_symbol = f"{api_package_name}.{overridden_symbol}" - self.doc_symbols[overridden_symbol] = self.doc_symbols[symbol] + self.doc_symbols[overridden_symbol] = original_symbol self.renamed_symbols.add(overridden_symbol) # If renamed `symbol` already exists, add library name in front to differentiate between them. 
@@ -202,7 +202,7 @@ class DocCog(commands.Cog): embed = discord.Embed( title=discord.utils.escape_markdown(symbol), - url=symbol_info.url, + url=f"{symbol_info.url}#{symbol_info.symbol_id}", description=embed_description ) # Show all symbols with the same name that were renamed in the footer. -- cgit v1.2.3 From 75f95a110ce96734cb64f89321f9a6eeb0d79463 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 20 Sep 2020 03:06:59 +0200 Subject: Replace caching of soups with new class. Storing BeautifulSoup objects could lead to memory problems because of their large footprint, the new class replaces the long term storage by parsing all items on the first fetch of the page and only storing their markdown string. --- bot/cogs/doc/cog.py | 122 +++++++++++++++++++++++++++++++++++++++++++++--- bot/cogs/doc/parsing.py | 36 ++------------ 2 files changed, 119 insertions(+), 39 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 2e49fcd38..d57e76ebd 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -1,14 +1,18 @@ +from __future__ import annotations + import asyncio import functools import logging import re import sys -from collections import OrderedDict +from collections import defaultdict from contextlib import suppress from types import SimpleNamespace -from typing import Dict, NamedTuple, Optional +from typing import Dict, List, NamedTuple, Optional, Union import discord +from aiohttp import ClientSession +from bs4 import BeautifulSoup from discord.ext import commands from requests import ConnectTimeout, ConnectionError, HTTPError from sphinx.ext import intersphinx @@ -20,7 +24,6 @@ from bot.converters import PackageName, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion -from .cache import async_cache from .parsing import get_symbol_markdown log = logging.getLogger(__name__) @@ -67,6 +70,108 @@ class 
DocItem(NamedTuple): return "".join((self.base_url, self.relative_url_path)) +class QueueItem(NamedTuple): + """Contains a symbol and the BeautifulSoup object needed to parse it.""" + + symbol: DocItem + soup: BeautifulSoup + + def __eq__(self, other: Union[QueueItem, DocItem]): + if isinstance(other, DocItem): + return self.symbol == other + return NamedTuple.__eq__(self, other) + + +class CachedParser: + """ + Get symbol markdown from pages with smarter caching. + + DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. + `get_markdown` is used to fetch the markdown; when this is used for the first time on a page, + all of the symbols are queued to be parsed to avoid multiple web requests to the same page. + """ + + def __init__(self): + self._queue: List[QueueItem] = [] + self._results = {} + self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) + self._item_events: Dict[DocItem, asyncio.Event] = {} + self._parse_task = None + + async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str: + """ + Get result markdown of `doc_item`. + + If no symbols were fetched from `doc_item`s page before, + the HTML has to be fetched before parsing can be queued. 
+ """ + if (symbol := self._results.get(doc_item)) is not None: + return symbol + + if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: + async with client_session.get(doc_item.url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") + + self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) + del self._page_symbols[doc_item.url] + log.debug(f"Added symbols from {doc_item.url} to parse queue.") + + if self._parse_task is None: + self._parse_task = asyncio.create_task(self._parse_queue()) + + self._move_to_front(doc_item) + self._item_events[doc_item] = item_event = asyncio.Event() + await item_event.wait() + return self._results[doc_item] + + async def _parse_queue(self) -> None: + """ + Parse all item from the queue, setting associated events for symbols if present. + + The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. + """ + log.trace("Starting queue parsing.") + while self._queue: + item, soup = self._queue.pop() + self._results[item] = get_symbol_markdown(soup, item) + if (event := self._item_events.get(item)) is not None: + event.set() + await asyncio.sleep(0.1) + + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + """Move `item` to the front of the parse queue.""" + # The parse queue stores soups along with the doc symbols in QueueItem objects, + # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. + item_index = self._queue.index(item) + queue_item = self._queue[item_index] + + del self._queue[item_index] + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem) -> None: + """Add a DocItem to `_page_symbols`.""" + self._page_symbols[doc_item.url].append(doc_item) + + async def clear(self) -> None: + """ + Clear all internal symbol data. 
+ + All currently requested items are waited to be parsed before clearing. + """ + for event in self._item_events.values(): + await event.wait() + if self._parse_task is not None: + self._parse_task.cancel() + self._parse_task = None + self._queue.clear() + self._results.clear() + self._page_symbols.clear() + self._item_events.clear() + + class InventoryURL(commands.Converter): """ Represents an Intersphinx inventory URL. @@ -106,6 +211,7 @@ class DocCog(commands.Cog): self.base_urls = {} self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} + self.item_fetcher = CachedParser() self.renamed_symbols = set() self.bot.loop.create_task(self.init_refresh_inventory()) @@ -163,7 +269,10 @@ class DocCog(commands.Cog): symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name) + relative_url_path, _, symbol_id = relative_doc_url.partition("#") + symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + self.doc_symbols[symbol] = symbol_item + self.item_fetcher.add_item(symbol_item) log.trace(f"Fetched inventory for {api_package_name}.") @@ -177,7 +286,7 @@ class DocCog(commands.Cog): self.base_urls.clear() self.doc_symbols.clear() self.renamed_symbols.clear() - async_cache.cache = OrderedDict() + await self.item_fetcher.clear() # Run all coroutines concurrently - since each of them performs a HTTP # request, this speeds up fetching the inventory data heavily. 
@@ -198,12 +307,11 @@ class DocCog(commands.Cog): if symbol_info is None: return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") - embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info) embed = discord.Embed( title=discord.utils.escape_markdown(symbol), url=f"{symbol_info.url}#{symbol_info.symbol_id}", - description=embed_description + description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) ) # Show all symbols with the same name that were renamed in the footer. embed.set_footer( diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 1271953d4..9fbce7bed 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,11 +5,9 @@ import textwrap from functools import partial from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union -from aiohttp import ClientSession from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag -from .cache import async_cache from .html import Strainer from .markdown import markdownify if TYPE_CHECKING: @@ -171,16 +169,6 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: return formatted_markdown -@async_cache(arg_offset=1) -async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: - """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" - log.trace(f"Sending a request to {url}.") - async with http_session.get(url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml') - soup.find("head").decompose() # the head contains no useful data so we can remove it - return soup - - def _match_end_tag(tag: Tag) -> bool: """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" for attr in _SEARCH_END_TAG_ATTRS: @@ -190,44 +178,28 @@ def _match_end_tag(tag: Tag) -> bool: return tag.name == "table" -async def get_symbol_markdown(http_session: 
ClientSession, symbol_data: "DocItem") -> str: +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str: """ - Return parsed markdown of the passed symbol, truncated to 1000 characters. + Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. - A request through `http_session` is made to the url associated with `symbol_data` for the html contents; - the contents are then parsed depending on what group the symbol belongs to. + The method of parsing and what information gets included depends on the symbol's group. """ - log.trace(f"Parsing symbol from url {symbol_data.url}.") - if "#" in symbol_data.url: - request_url, symbol_id = symbol_data.url.rsplit('#') - else: - request_url = symbol_data.url - symbol_id = None - - soup = await _get_soup_from_url(http_session, request_url) - symbol_heading = soup.find(id=symbol_id) + symbol_heading = soup.find(id=symbol_data.symbol_id) signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. if symbol_data.group in {"module", "doc", "label"}: - log.trace("Symbol is a module, doc or a label; using general description parsing.") description = _get_general_description(symbol_heading) elif symbol_heading.name != "dt": # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, # log info the tag can be looked at. - log.info( - f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it," - f"handling as general description." 
- ) description = _get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: - log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.") description = _get_dd_description(symbol_heading) else: - log.trace("Parsing both signature and description of symbol.") signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) -- cgit v1.2.3 From 38753114c0d056ba330296c9fea7a8f2312459f9 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 20 Sep 2020 03:08:36 +0200 Subject: Replace forward ref with future annotations import --- bot/cogs/doc/parsing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 9fbce7bed..21a3065f4 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import re import string @@ -178,7 +180,7 @@ def _match_end_tag(tag: Tag) -> bool: return tag.name == "table" -def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str: +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: """ Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. 
-- cgit v1.2.3 From de440ce8c4539972ea0f0538042e6cb41a4395dc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 20 Sep 2020 03:09:24 +0200 Subject: Remove unused cache --- bot/cogs/doc/cache.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 bot/cogs/doc/cache.py diff --git a/bot/cogs/doc/cache.py b/bot/cogs/doc/cache.py deleted file mode 100644 index 9da2a1dab..000000000 --- a/bot/cogs/doc/cache.py +++ /dev/null @@ -1,32 +0,0 @@ -import functools -from collections import OrderedDict -from typing import Any, Callable - - -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: - """ - LRU cache implementation for coroutines. - - Once the cache exceeds the maximum size, keys are deleted in FIFO order. - - An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. - """ - # Assign the cache to the function itself so we can clear it from outside. 
- async_cache.cache = OrderedDict() - - def decorator(function: Callable) -> Callable: - """Define the async_cache decorator.""" - @functools.wraps(function) - async def wrapper(*args) -> Any: - """Decorator wrapper for the caching logic.""" - key = ':'.join(args[arg_offset:]) - - value = async_cache.cache.get(key) - if value is None: - if len(async_cache.cache) > max_size: - async_cache.cache.popitem(last=False) - - async_cache.cache[key] = await function(*args) - return async_cache.cache[key] - return wrapper - return decorator -- cgit v1.2.3 From 758dd3ef6ca5c1cd7615f0eb6688d7d2f19578ea Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 20 Sep 2020 23:46:54 +0200 Subject: Log exceptions from parsing task --- bot/cogs/doc/cog.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index fc01dfb20..7c1bf2a5f 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -133,9 +133,13 @@ class CachedParser: log.trace("Starting queue parsing.") while self._queue: item, soup = self._queue.pop() - self._results[item] = get_symbol_markdown(soup, item) - if (event := self._item_events.get(item)) is not None: - event.set() + try: + self._results[item] = get_symbol_markdown(soup, item) + except Exception: + log.exception(f"Unexpected error when handling {item}") + else: + if (event := self._item_events.get(item)) is not None: + event.set() await asyncio.sleep(0.1) self._parse_task = None -- cgit v1.2.3 From 7ab949e09a22d7547f74caa447d81299f7b52e47 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 21 Sep 2020 00:30:08 +0200 Subject: Properly truncate description markdown The previous truncating implementation used a naive method that disregarded the actual markdown formatting, possibly resulting in it getting cut out. 
With the introduction of proper href tags this became impossible to manage without writing an actual parser; so the process was moved to happen when the gathered bs4 elements are being converted into markdown --- bot/cogs/doc/markdown.py | 7 +--- bot/cogs/doc/parsing.py | 86 +++++++++++++++++++++++++++--------------------- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index dca477d35..a95e94991 100644 --- a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -4,7 +4,7 @@ from bs4.element import PageElement from markdownify import MarkdownConverter -class _DocMarkdownConverter(MarkdownConverter): +class DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" def __init__(self, *, page_url: str, **options): @@ -51,8 +51,3 @@ class _DocMarkdownConverter(MarkdownConverter): if parent is not None and parent.name == "li": return f"{text}\n" return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 21a3065f4..ed6343cd8 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,13 +5,13 @@ import re import string import textwrap from functools import partial -from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag from .html import Strainer -from .markdown import markdownify +from .markdown import DocMarkdownConverter if TYPE_CHECKING: from .cog import DocItem @@ -39,6 +39,8 @@ _NO_SIGNATURE_GROUPS = { "templatetag", "term", } +_MAX_DESCRIPTION_LENGTH = 1800 +_TRUNCATE_STRIP_CHARACTERS = 
"!?:;." + string.whitespace def _find_elements_until_tag( @@ -80,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> Optional[str]: +def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. @@ -89,18 +91,13 @@ def _get_general_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find_next("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join( - str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - ) + return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - return description - -def _get_dd_description(symbol: PageElement) -> str: - """Get the string contents of the next dd tag, up to a dt or a dl tag.""" +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") - description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - return "".join(str(tag) for tag in description_contents) + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) def _get_signatures(start_signature: PageElement) -> List[str]: @@ -124,43 +121,57 @@ def _get_signatures(start_signature: PageElement) -> List[str]: return signatures -def _truncate_markdown(markdown: str, max_length: int) -> str: +def _get_truncated_description( + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, +) -> str: """ - Truncate 
`markdown` to be at most `max_length` characters. + Truncate markdown from `elements` to be at most `max_length` characters visually. - The markdown string is searched for substrings to cut at, to keep its structure, - but if none are found the string is simply sliced. + `max_length` limits the length of the rendered characters in the string, + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits """ - if len(markdown) > max_length: - shortened = markdown[:max_length] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for cutoff_string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(cutoff_string) - if description_cutoff != -1: - break + visual_length = 0 + real_length = 0 + result = [] + shortened = False + + for element in elements: + is_tag = isinstance(element, Tag) + element_length = len(element.text) if is_tag else len(element) + if visual_length + element_length < max_length: + if is_tag: + element_markdown = markdown_converter.process_tag(element) + else: + element_markdown = markdown_converter.process_text(element) + + element_markdown_length = len(element_markdown) + if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: + result.append(element_markdown) else: - description_cutoff = max_length - markdown = markdown[:description_cutoff] + shortened = True + break + real_length += element_markdown_length + visual_length += element_length + else: + shortened = True + break - # If there is an incomplete code block, cut it out - if markdown.count("```") % 2: - codeblock_start = markdown.rfind('```py') - markdown = markdown[:codeblock_start].rstrip() - markdown = markdown.rstrip(string.punctuation) + "..." 
- return markdown + markdown_string = "".join(result) + if shortened: + markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." + return markdown_string -def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: """ Create a markdown string with the signatures at the top, and the converted html description below them. The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. """ - description = _truncate_markdown(markdownify(description, url=url), 1000) + description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) @@ -204,5 +215,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: else: signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) - - return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) + return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') -- cgit v1.2.3 From 3eed4af70fa24e5daef6c5e6d2d145094b9e672f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 21 Sep 2020 00:39:15 +0200 Subject: Use f strings instead of c style on copied code The code copied over from MarkdownConverter's implementation used c style string formatting, there is no reason to keep the style of strings in our code --- bot/cogs/doc/markdown.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index a95e94991..ba35a84c4 100644 --- 
a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -14,18 +14,18 @@ class DocMarkdownConverter(MarkdownConverter): def convert_li(self, el: PageElement, text: str) -> str: """Fix markdownify's erroneous indexing in ol tags.""" parent = el.parent - if parent is not None and parent.name == 'ol': + if parent is not None and parent.name == "ol": li_tags = parent.find_all("li") - bullet = '%s.' % (li_tags.index(el)+1) + bullet = f"{li_tags.index(el)+1}." else: depth = -1 while el: - if el.name == 'ul': + if el.name == "ul": depth += 1 el = el.parent - bullets = self.options['bullets'] + bullets = self.options["bullets"] bullet = bullets[depth % len(bullets)] - return '%s %s\n' % (bullet, text or '') + return f"{bullet} {text}\n" def convert_hn(self, _n: int, el: PageElement, text: str) -> str: """Convert h tags to bold text with ** instead of adding #.""" @@ -33,11 +33,11 @@ class DocMarkdownConverter(MarkdownConverter): def convert_code(self, el: PageElement, text: str) -> str: """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') + return f"`{text}`".replace("\\", "") def convert_pre(self, el: PageElement, text: str) -> str: """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) + code = "".join(el.strings) return f"```py\n{code}```" def convert_a(self, el: PageElement, text: str) -> str: -- cgit v1.2.3 From b6ef6b6bc30b02e0a6797dd9feae167da2cb6e5b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 21 Sep 2020 00:52:40 +0200 Subject: Handle cases with outdated bot inventories. 
--- bot/cogs/doc/parsing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index ed6343cd8..939f963f1 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -198,6 +198,9 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: The method of parsing and what information gets included depends on the symbol's group. """ symbol_heading = soup.find(id=symbol_data.symbol_id) + if symbol_heading is None: + log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") + return "Unable to parse the requested symbol." signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. -- cgit v1.2.3 From ba73313adaff363bef9e3a505bf66373ea915997 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 21 Sep 2020 22:36:18 +0200 Subject: Use List typehint that has a narrower scope --- bot/cogs/doc/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 939f963f1..9c82a1c13 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -82,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]: +def _get_general_description(start_element: PageElement) -> List[Union[Tag, NavigableString]]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. 
-- cgit v1.2.3 From 730f30197c43cc170aaecde664712f6f4aaea246 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 26 Sep 2020 17:49:43 +0200 Subject: Collapse signatures between args instead of spaces The signature length needed more logic and shorter limits to ensure messages would fit in a discord message in a nice way. --- bot/cogs/doc/parsing.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 9c82a1c13..7dddadf43 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,7 +5,7 @@ import re import string import textwrap from functools import partial -from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Collection, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag @@ -19,6 +19,7 @@ log = logging.getLogger(__name__) _UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") _WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +_PARAMETERS_RE = re.compile(r"\((.+)\)") _SEARCH_END_TAG_ATTRS = ( "data", @@ -39,8 +40,59 @@ _NO_SIGNATURE_GROUPS = { "templatetag", "term", } -_MAX_DESCRIPTION_LENGTH = 1800 +_EMBED_CODE_BLOCK_LENGTH = 61 +# Three code block wrapped lines with py syntax highlight +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * 3 +# Maximum discord message length - signatures on top +_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace +_BRACKET_PAIRS = { + "{": "}", + "(": ")", + "[": "]", +} + + +def _split_parameters(parameters_string: str) -> List[str]: + """ + Split parameters of a signature into individual parameter strings on commas. + + Long string literals are not accounted for. 
+ """ + parameters_list = [] + last_split = 0 + depth = 0 + expected_end = None + current_search = None + previous_character = "" + + for index, character in enumerate(parameters_string): + if character in _BRACKET_PAIRS: + if current_search is None: + current_search = character + expected_end = _BRACKET_PAIRS[character] + if character == current_search: + depth += 1 + + elif character in {"'", '"'}: + if depth == 0: + depth += 1 + elif not previous_character == "\\": + depth -= 1 + + elif character == expected_end: + depth -= 1 + if depth == 0: + current_search = None + expected_end = None + + elif depth == 0 and character == ",": + parameters_list.append(parameters_string[last_split:index]) + last_split = index + 1 + previous_character = character + + parameters_list.append(parameters_string[last_split:]) + return parameters_list def _find_elements_until_tag( @@ -121,6 +173,43 @@ def _get_signatures(start_signature: PageElement) -> List[str]: return signatures +def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: + """ + Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. + + If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. + Individual signatures can consist of max 1, 2 or 3 lines of text, inversely proportional to the amount of them. + A maximum of 3 signatures is assumed to be passed. 
+ """ + if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: + return signatures + + max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (4 - len(signatures)) + formatted_signatures = [] + for signature in signatures: + signature = signature.strip() + if len(signature) > max_signature_length: + if (parameters_match := _PARAMETERS_RE.search(signature)) is None: + formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) + continue + + truncated_signature = [] + parameters_string = parameters_match[1] + running_length = len(signature) - len(parameters_string) + for parameter in _split_parameters(parameters_string): + if (len(parameter) + running_length) <= max_signature_length - 4: # account for comma and placeholder + truncated_signature.append(parameter) + running_length += len(parameter) + 1 + else: + truncated_signature.append(" ...") + formatted_signatures.append(signature.replace(parameters_string, ",".join(truncated_signature))) + break + else: + formatted_signatures.append(signature) + + return formatted_signatures + + def _get_truncated_description( elements: Iterable[Union[Tag, NavigableString]], markdown_converter: DocMarkdownConverter, @@ -174,7 +263,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[ description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: - formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) + formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) else: formatted_markdown = "" formatted_markdown += f"\n{description}" -- cgit v1.2.3 From e10f91fce08f26f92776c3641ddd26f961a0c8b8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 26 Sep 2020 17:51:52 +0200 Subject: Make 
amount of included signatures configurable --- bot/cogs/doc/parsing.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 7dddadf43..cf1124936 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: log = logging.getLogger(__name__) +_MAX_SIGNATURE_AMOUNT = 3 + _UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") _WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") _PARAMETERS_RE = re.compile(r"\((.+)\)") @@ -41,8 +43,8 @@ _NO_SIGNATURE_GROUPS = { "term", } _EMBED_CODE_BLOCK_LENGTH = 61 -# Three code block wrapped lines with py syntax highlight -_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * 3 +# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT # Maximum discord message length - signatures on top _MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace @@ -154,7 +156,7 @@ def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString] def _get_signatures(start_signature: PageElement) -> List[str]: """ - Collect up to 3 signatures from dt tags around the `start_signature` dt tag. + Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. First the signatures under the `start_signature` are included; if less than 2 are found, tags above the start signature are added to the result if any are present. 
@@ -164,7 +166,7 @@ def _get_signatures(start_signature: PageElement) -> List[str]: *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), start_signature, *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), - )[-3:]: + )[-_MAX_SIGNATURE_AMOUNT:]: signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature: @@ -178,13 +180,14 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. - Individual signatures can consist of max 1, 2 or 3 lines of text, inversely proportional to the amount of them. - A maximum of 3 signatures is assumed to be passed. + Individual signatures can consist of max 1, 2, ..., `_MAX_SIGNATURE_AMOUNT` lines of text, + inversely proportional to the amount of signatures. + A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. 
""" if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: return signatures - max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (4 - len(signatures)) + max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) formatted_signatures = [] for signature in signatures: signature = signature.strip() -- cgit v1.2.3 From a2e7db718fbeb6fabb5e261ef4414038477abfb2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 28 Sep 2020 23:43:58 +0200 Subject: Add parentheses for clarity --- bot/cogs/doc/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index cf1124936..7cf4ec7ba 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -166,7 +166,7 @@ def _get_signatures(start_signature: PageElement) -> List[str]: *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), start_signature, *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), - )[-_MAX_SIGNATURE_AMOUNT:]: + )[-(_MAX_SIGNATURE_AMOUNT):]: signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature: -- cgit v1.2.3 From 2b97cfad08f7dac0ea1ce6119bab004b4c2452e7 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 29 Sep 2020 23:03:36 +0200 Subject: Add async implementation of sphinx fetch_inventory The sphinx version of the function does a lot of checks that are unnecessary for the bot because it's not working with anything else related to docs. The custom implementation means we can throw some of the code out and get rid of sphinx as a dependency. 
--- LICENSE-THIRD-PARTY | 30 ++++++++++++++ bot/cogs/doc/inventory_parser.py | 87 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 LICENSE-THIRD-PARTY create mode 100644 bot/cogs/doc/inventory_parser.py diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY new file mode 100644 index 000000000..f78491fc1 --- /dev/null +++ b/LICENSE-THIRD-PARTY @@ -0,0 +1,30 @@ +License for Sphinx +Applies to: + - bot/cogs/doc/inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__. +================== + +Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file). +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/bot/cogs/doc/inventory_parser.py b/bot/cogs/doc/inventory_parser.py new file mode 100644 index 000000000..6c2b63d5e --- /dev/null +++ b/bot/cogs/doc/inventory_parser.py @@ -0,0 +1,87 @@ +import re +import zlib +from collections import defaultdict +from typing import AsyncIterator, DefaultDict, List, Tuple + +import aiohttp + +_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') + + +class ZlibStreamReader: + """Class used for decoding zlib data of a stream line by line.""" + + READ_CHUNK_SIZE = 16 * 1024 + + def __init__(self, stream: aiohttp.StreamReader) -> None: + self.stream = stream + + async def _read_compressed_chunks(self) -> AsyncIterator[bytes]: + """Read zlib data in `READ_CHUNK_SIZE` sized chunks and decompress.""" + decompressor = zlib.decompressobj() + async for chunk in self.stream.iter_chunked(self.READ_CHUNK_SIZE): + yield decompressor.decompress(chunk) + + yield decompressor.flush() + + async def __aiter__(self) -> AsyncIterator[str]: + """Yield lines of decompressed text.""" + buf = b'' + async for chunk in self._read_compressed_chunks(): + buf += chunk + pos = buf.find(b'\n') + while pos != -1: + yield buf[:pos].decode() + buf = buf[pos + 1:] + pos = buf.find(b'\n') + + +async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in stream: + name, type_, location = line.decode().rstrip().split(maxsplit=2) + # version 1 did not add anchors to the location + if type_ == 'mod': + type_ = 'py:module' + location += '#module-' + name + else: + type_ = 'py:' + type_ + location += '#' + name + invdata[type_].append((name, location)) + return invdata + + +async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in ZlibStreamReader(stream): + m = _V2_LINE_RE.match(line.rstrip()) + name, type_, _prio, location, _dispname = m.groups() # ignore the parsed 
items we don't need + if location.endswith('$'): + location = location[:-1] + name + + invdata[type_].append((name, location)) + return invdata + + +async def fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: + """Fetch, parse and return an intersphinx inventory file from an url.""" + timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) + async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: + stream = response.content + + inventory_header = (await stream.readline()).decode().rstrip() + inventory_version = int(inventory_header[-1:]) + await stream.readline() # skip project name + await stream.readline() # skip project version + + if inventory_version == 1: + return await _load_v1(stream) + + elif inventory_version == 2: + if b"zlib" not in await stream.readline(): + raise ValueError(f"Invalid inventory file at url {url}.") + return await _load_v2(stream) + + raise ValueError(f"Invalid inventory file at url {url}.") -- cgit v1.2.3 From d8c36ac9f189ba9638ef91df7628f95845161f8e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 30 Sep 2020 00:19:39 +0200 Subject: Handle errors on inventory fetching --- bot/cogs/doc/inventory_parser.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/bot/cogs/doc/inventory_parser.py b/bot/cogs/doc/inventory_parser.py index 6c2b63d5e..23931869b 100644 --- a/bot/cogs/doc/inventory_parser.py +++ b/bot/cogs/doc/inventory_parser.py @@ -1,10 +1,14 @@ +import logging import re import zlib from collections import defaultdict -from typing import AsyncIterator, DefaultDict, List, Tuple +from typing import AsyncIterator, DefaultDict, List, Optional, Tuple import aiohttp +log = logging.getLogger(__name__) + +FAILED_REQUEST_ATTEMPTS = 3 _V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') @@ -65,7 +69,7 @@ async def 
_load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[ return invdata -async def fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: +async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: """Fetch, parse and return an intersphinx inventory file from an url.""" timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: @@ -85,3 +89,32 @@ async def fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> De return await _load_v2(stream) raise ValueError(f"Invalid inventory file at url {url}.") + + +async def fetch_inventory( + client_session: aiohttp.ClientSession, + url: str +) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: + """Get inventory from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" + for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): + try: + inventory = await _fetch_inventory(client_session, url) + except aiohttp.ClientConnectorError: + log.warning( + f"Failed to connect to inventory url at {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except aiohttp.ClientError: + log.error( + f"Failed to get inventory from {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except Exception: + log.exception( + f"An unexpected error has occurred during fetching of {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." 
+ ) + else: + return inventory + + return None -- cgit v1.2.3 From 3bf04d8a353056944ac335b1d387d71464a81aa1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 30 Sep 2020 00:38:24 +0200 Subject: Use new async inventory fetching --- bot/cogs/doc/cog.py | 71 ++++++----------------------------------------------- 1 file changed, 7 insertions(+), 64 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 7c1bf2a5f..2cb296d53 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -1,22 +1,17 @@ from __future__ import annotations import asyncio -import functools import logging import re import sys from collections import defaultdict from contextlib import suppress -from types import SimpleNamespace from typing import Dict, List, NamedTuple, Optional, Union import discord from aiohttp import ClientSession from bs4 import BeautifulSoup from discord.ext import commands -from requests import ConnectTimeout, ConnectionError, HTTPError -from sphinx.ext import intersphinx -from urllib3.exceptions import ProtocolError from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput @@ -24,20 +19,10 @@ from bot.converters import PackageName, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion +from .inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory from .parsing import get_symbol_markdown log = logging.getLogger(__name__) -logging.getLogger('urllib3').setLevel(logging.WARNING) - -# Since Intersphinx is intended to be used with Sphinx, -# we need to mock its configuration. 
-SPHINX_MOCK_APP = SimpleNamespace( - config=SimpleNamespace( - intersphinx_timeout=3, - tls_verify=True, - user_agent="python3:python-discord/bot:1.0.0" - ) -) NO_OVERRIDE_GROUPS = ( "2to3fixer", @@ -51,7 +36,6 @@ NO_OVERRIDE_PACKAGES = ( ) WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") -FAILED_REQUEST_RETRY_AMOUNT = 3 NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay @@ -190,21 +174,8 @@ class InventoryURL(commands.Converter): async def convert(ctx: commands.Context, url: str) -> str: """Convert url to Intersphinx inventory URL.""" await ctx.trigger_typing() - try: - intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) - except AttributeError: - raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") - except ConnectionError: - if url.startswith('https'): - raise commands.BadArgument( - f"Cannot establish a connection to `{url}`. Does it support HTTPS?" - ) - raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") - except ValueError: - raise commands.BadArgument( - f"Failed to read Intersphinx inventory from URL `{url}`. " - "Are you sure that it's a valid inventory file?" - ) + if await fetch_inventory(ctx.bot.http_session, url) is None: + raise commands.BadArgument(f"Failed to fetch inventory file after {FAILED_REQUEST_ATTEMPTS}.") return url @@ -235,17 +206,16 @@ class DocCog(commands.Cog): * `package_name` is the package name to use, appears in the log * `base_url` is the root documentation URL for the specified package, used to build absolute paths that link to specific symbols - * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running - `intersphinx.fetch_inventory` in an executor on the bot's event loop + * `inventory_url` is the absolute URL to the intersphinx inventory. 
""" self.base_urls[api_package_name] = base_url - package = await self._fetch_inventory(inventory_url) + package = await fetch_inventory(self.bot.http_session, inventory_url) if not package: return None - for group, value in package.items(): - for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): + for group, items in package.items(): + for symbol, relative_doc_url in items: if "/" in symbol: continue # skip unreachable symbols with slashes # Intern the group names since they're reused in all the DocItems @@ -455,30 +425,3 @@ class DocCog(commands.Cog): description=f"```diff\n{added}\n{removed}```" if added or removed else "" ) await ctx.send(embed=embed) - - async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: - """Get and return inventory from `inventory_url`. If fetching fails, return None.""" - fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) - for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): - try: - package = await self.bot.loop.run_in_executor(None, fetch_func) - except ConnectTimeout: - log.error( - f"Fetching of inventory {inventory_url} timed out," - f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except ProtocolError: - log.error( - f"Connection lost while fetching inventory {inventory_url}," - f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except HTTPError as e: - log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") - return None - except ConnectionError: - log.error(f"Couldn't establish connection to inventory {inventory_url}.") - return None - else: - return package - log.error(f"Fetching of inventory {inventory_url} failed.") - return None -- cgit v1.2.3 From 46ee70533328eed3790ebb93d1257b5d4e598802 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 30 Sep 2020 00:42:55 +0200 Subject: Remove sphinx and requests from Pipfile With our own implementation of sphinx's inventory fetching we no longer need the sphinx package, and requests which were used inside of it. --- Pipfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Pipfile b/Pipfile index 6fff2223e..1e54c9212 100644 --- a/Pipfile +++ b/Pipfile @@ -21,9 +21,7 @@ markdownify = "~=0.4" more_itertools = "~=8.2" python-dateutil = "~=2.8" pyyaml = "~=5.1" -requests = "~=2.22" sentry-sdk = "~=0.14" -sphinx = "~=2.2" statsd = "~=3.3" [dev-packages] -- cgit v1.2.3 From c5aa0c0bd7e8933648fbedc92a7cd1f5ae199772 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 1 Oct 2020 00:04:53 +0200 Subject: Reschedule failed inventory updates --- bot/cogs/doc/cog.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 2cb296d53..41fca4584 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -19,6 +19,7 @@ from bot.converters import PackageName, ValidURL from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion +from bot.utils.scheduling import Scheduler from .inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory from .parsing import get_symbol_markdown @@ -189,6 +190,9 @@ class 
DocCog(commands.Cog): self.item_fetcher = CachedParser() self.renamed_symbols = set() + self.inventory_scheduler = Scheduler(self.__class__.__name__) + self.scheduled_inventories = set() + self.bot.loop.create_task(self.init_refresh_inventory()) async def init_refresh_inventory(self) -> None: @@ -198,7 +202,7 @@ class DocCog(commands.Cog): async def update_single( self, api_package_name: str, base_url: str, inventory_url: str - ) -> None: + ) -> bool: """ Rebuild the inventory for a single package. @@ -207,12 +211,27 @@ class DocCog(commands.Cog): * `base_url` is the root documentation URL for the specified package, used to build absolute paths that link to specific symbols * `inventory_url` is the absolute URL to the intersphinx inventory. + + If the inventory file is currently unreachable, + the update is rescheduled to execute in 2 minutes on the first attempt, and 5 minutes on subsequent attempts. + + Return True on success; False if fetching failed and was rescheduled. """ self.base_urls[api_package_name] = base_url - package = await fetch_inventory(self.bot.http_session, inventory_url) + if not package: - return None + delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 + log.info(f"Failed to fetch inventory, attempting again in {delay//60} minutes.") + self.inventory_scheduler.schedule_later( + delay, + api_package_name, + fetch_inventory(self.bot.http_session, inventory_url) + ) + self.scheduled_inventories.add(api_package_name) + return False + with suppress(KeyError): + self.scheduled_inventories.discard(api_package_name) for group, items in package.items(): for symbol, relative_doc_url in items: @@ -249,6 +268,7 @@ class DocCog(commands.Cog): self.item_fetcher.add_item(symbol_item) log.trace(f"Fetched inventory for {api_package_name}.") + return True async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" @@ -260,6 +280,7 @@ class DocCog(commands.Cog): self.base_urls.clear() 
self.doc_symbols.clear() self.renamed_symbols.clear() + self.scheduled_inventories.clear() await self.item_fetcher.clear() # Run all coroutines concurrently - since each of them performs a HTTP @@ -385,7 +406,11 @@ class DocCog(commands.Cog): f"Inventory URL: {inventory_url}" ) - await self.update_single(package_name, base_url, inventory_url) + if await self.update_single(package_name, base_url, inventory_url) is None: + await ctx.send( + f"Added package `{package_name}` to database but failed to fetch inventory; rescheduled in 2 minutes." + ) + return await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @@ -399,6 +424,9 @@ class DocCog(commands.Cog): """ await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + if package_name in self.scheduled_inventories: + self.inventory_scheduler.cancel(package_name) + async with ctx.typing(): # Rebuild the inventory to ensure that everything # that was from this package is properly deleted. 
@@ -409,6 +437,9 @@ class DocCog(commands.Cog): @with_role(*MODERATION_ROLES) async def refresh_command(self, ctx: commands.Context) -> None: """Refresh inventories and send differences to channel.""" + for inventory in self.scheduled_inventories: + self.inventory_scheduler.cancel(inventory) + old_inventories = set(self.base_urls) with ctx.typing(): await self.refresh_inventory() -- cgit v1.2.3 From f4924f0e8c26e373ddae8cb29f1f3935aaf00f4a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 10 Oct 2020 21:47:34 +0200 Subject: Handle non dt fallback together with modules --- bot/exts/info/doc/_parsing.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 83e35e2b1..a79332716 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -296,12 +296,7 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. - if symbol_data.group in {"module", "doc", "label"}: - description = _get_general_description(symbol_heading) - - elif symbol_heading.name != "dt": - # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, - # log info the tag can be looked at. 
+ if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt": description = _get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: -- cgit v1.2.3 From 2744b10fae0f3b1d4ac198ba819c024e037e5660 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 10 Oct 2020 21:48:10 +0200 Subject: Use more descriptive name for end_tag_filter --- bot/exts/info/doc/_parsing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index a79332716..5f6c23c8d 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -99,7 +99,7 @@ def _split_parameters(parameters_string: str) -> List[str]: def _find_elements_until_tag( start_element: PageElement, - tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], + end_tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], *, func: Callable, include_strings: bool = False, @@ -108,7 +108,7 @@ def _find_elements_until_tag( """ Get all elements up to `limit` or until a tag matching `tag_filter` is found. - `tag_filter` can be either a tuple of string names to check against, + `end_tag_filter` can be either a tuple of string names to check against, or a filtering callable that's applied to tags. When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. @@ -116,15 +116,15 @@ def _find_elements_until_tag( `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. The method is then iterated over and all elements until the matching tag or the limit are added to the return list. 
""" - use_tuple_filter = isinstance(tag_filter, tuple) + use_tuple_filter = isinstance(end_tag_filter, tuple) elements = [] for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): if isinstance(element, Tag): if use_tuple_filter: - if element.name in tag_filter: + if element.name in end_tag_filter: break - elif tag_filter(element): + elif end_tag_filter(element): break elements.append(element) -- cgit v1.2.3 From 9e4832965957eec291a3ccde198252ab28ce13e2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 10 Oct 2020 21:50:37 +0200 Subject: Exclude headerlinks outside of current section --- bot/exts/info/doc/_parsing.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 5f6c23c8d..d31f26060 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -132,20 +132,22 @@ def _find_elements_until_tag( _find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> List[Union[Tag, NavigableString]]: +def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, if it's found it's used as the tag to start the search from instead of the `start_element`. 
""" - header = start_element.find_next("a", attrs={"class": "headerlink"}) + child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) + header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) start_tag = header.parent if header is not None else start_element - return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) + return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: @@ -274,13 +276,15 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[ return formatted_markdown -def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in _SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True +def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: + """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" + def match_tag(tag: Tag) -> bool: + for attr in class_names: + if attr in tag.get("class", ()): + return True + return tag.name == "table" - return tag.name == "table" + return match_tag def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: -- cgit v1.2.3 From 59f1fffb656447668f6e5a34fcc52697b152780a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 18 Oct 2020 03:04:29 +0200 Subject: Handle escaped backslashes in strings --- bot/exts/info/doc/_parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index d31f26060..0883b9f42 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -66,7 +66,6 @@ def _split_parameters(parameters_string: str) 
-> List[str]: depth = 0 expected_end = None current_search = None - previous_character = "" for index, character in enumerate(parameters_string): if character in _BRACKET_PAIRS: @@ -79,7 +78,9 @@ def _split_parameters(parameters_string: str) -> List[str]: elif character in {"'", '"'}: if depth == 0: depth += 1 - elif not previous_character == "\\": + elif parameters_string[index-1] != "\\": + depth -= 1 + elif parameters_string[index-2] == "\\": depth -= 1 elif character == expected_end: @@ -91,7 +92,6 @@ def _split_parameters(parameters_string: str) -> List[str]: elif depth == 0 and character == ",": parameters_list.append(parameters_string[last_split:index]) last_split = index + 1 - previous_character = character parameters_list.append(parameters_string[last_split:]) return parameters_list -- cgit v1.2.3 From c9fe7b1d6b98334c29f516b682b93b4c1c3946a1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 01:14:31 +0100 Subject: Cache user fetched symbols through redis. 
--- bot/exts/info/doc/_cog.py | 22 ++++++++++++++++++++-- bot/exts/info/doc/_redis_cache.py | 23 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 bot/exts/info/doc/_redis_cache.py diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 257435e95..ab3ad159a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,6 +4,7 @@ import asyncio import logging import re import sys +import urllib.parse from collections import defaultdict from contextlib import suppress from typing import Dict, List, NamedTuple, Optional, Union @@ -21,6 +22,7 @@ from bot.utils.messages import wait_for_deletion from bot.utils.scheduling import Scheduler from ._inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory from ._parsing import get_symbol_markdown +from ._redis_cache import DocRedisCache log = logging.getLogger(__name__) @@ -182,6 +184,8 @@ class InventoryURL(commands.Converter): class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" + doc_cache = DocRedisCache() + def __init__(self, bot: Bot): self.base_urls = {} self.bot = bot @@ -296,16 +300,30 @@ class DocCog(commands.Cog): Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. If the symbol is known, an Embed with documentation about it is returned. + + First check the DocRedisCache before querying the cog's `CachedParser`, + if not present also create a redis entry for the symbol. 
""" + log.trace(f"Building embed for symbol `{symbol}`") symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: + log.debug("Symbol does not exist.") return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + item_url = f"{symbol_info.url}#{symbol_info.symbol_id}" + redis_key = "".join(urllib.parse.urlparse(item_url)[1:]) # url without scheme + + markdown = await self.doc_cache.get(redis_key) + if markdown is None: + log.debug(f"Redis cache miss for symbol `{symbol}`.") + markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) + await self.doc_cache.set(redis_key, markdown) + embed = discord.Embed( title=discord.utils.escape_markdown(symbol), - url=f"{symbol_info.url}#{symbol_info.symbol_id}", - description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) + url=item_url, + description=markdown ) # Show all symbols with the same name that were renamed in the footer. embed.set_footer( diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py new file mode 100644 index 000000000..147394ba6 --- /dev/null +++ b/bot/exts/info/doc/_redis_cache.py @@ -0,0 +1,23 @@ +from typing import Optional + +from async_rediscache.types.base import RedisObject, namespace_lock + + +class DocRedisCache(RedisObject): + """Interface for redis functionality needed by the Doc cog.""" + + @namespace_lock + async def set(self, key: str, value: str) -> None: + """ + Set markdown `value` for `key`. + + Keys expire after a week to keep data up to date. 
+ """ + with await self._get_pool_connection() as connection: + await connection.setex(f"{self.namespace}:{key}", 7*24*60*60, value) + + @namespace_lock + async def get(self, key: str) -> Optional[str]: + """Get markdown contents for `key`.""" + with await self._get_pool_connection() as connection: + return await connection.get(f"{self.namespace}:{key}", encoding="utf8") -- cgit v1.2.3 From b8c12d08c9b8dc4e0bf39fcc242d67a3532d0fd0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 03:16:35 +0100 Subject: Add package in front of symbol as default fallback Previously weo nly added the package name for symbols that shared are named name with an another symbol, but in some edge cases we can get to this point with symbols that weren't renamed but have name conflicts, causing some to get overwritten completely without the capturing condition --- bot/exts/info/doc/_cog.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ab3ad159a..264d6e31e 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -260,8 +260,7 @@ class DocCog(commands.Cog): self.doc_symbols[overridden_symbol] = original_symbol self.renamed_symbols.add(overridden_symbol) - # If renamed `symbol` already exists, add library name in front to differentiate between them. 
- elif symbol in self.renamed_symbols: + else: symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) -- cgit v1.2.3 From 89169f5c0b203be1963cfe569c216e0094674c4f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 03:56:29 +0100 Subject: Simplify duplicate symbol name handling code With the catchall else condition and symbols from FORCE_PREFIX_GROUPS getting renamed even when being overwritten, we can ignore the package handling and let it go to the else which adds the package prefix instead of a group --- bot/exts/info/doc/_cog.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 264d6e31e..ee89f5384 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -26,17 +26,14 @@ from ._redis_cache import DocRedisCache log = logging.getLogger(__name__) -NO_OVERRIDE_GROUPS = ( +# symbols with a group contained here will get the group prefixed on duplicates +FORCE_PREFIX_GROUPS = ( "2to3fixer", "token", "label", "pdbcommand", "term", ) -NO_OVERRIDE_PACKAGES = ( - "python", -) - WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay @@ -245,14 +242,11 @@ class DocCog(commands.Cog): group_name = sys.intern(group.split(":")[1]) if (original_symbol := self.doc_symbols.get(symbol)) is not None: - if ( - group_name in NO_OVERRIDE_GROUPS - or any(package == original_symbol.package for package in NO_OVERRIDE_PACKAGES) - ): + if group_name in FORCE_PREFIX_GROUPS: symbol = f"{group_name}.{symbol}" self.renamed_symbols.add(symbol) - elif (overridden_symbol_group := original_symbol.group) in NO_OVERRIDE_GROUPS: + elif (overridden_symbol_group := original_symbol.group) in FORCE_PREFIX_GROUPS: overridden_symbol = f"{overridden_symbol_group}.{symbol}" if overridden_symbol in self.renamed_symbols: overridden_symbol = 
f"{api_package_name}.{overridden_symbol}" -- cgit v1.2.3 From faaa85d2d00a2bc7496965fad3f5f53f56718e9c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 04:03:23 +0100 Subject: Move InventoryURL converer to the converters file --- bot/converters.py | 20 ++++++++++++++++++++ bot/exts/info/doc/_cog.py | 23 ++--------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index 6c87a50fe..3066eaabb 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -15,6 +15,7 @@ from discord.utils import DISCORD_EPOCH, snowflake_time from bot.api import ResponseCodeError from bot.constants import URLs +from bot.exts.info.doc import _inventory_parser from bot.utils.regex import INVITE_RE log = logging.getLogger(__name__) @@ -175,6 +176,25 @@ class ValidURL(Converter): return url +class InventoryURL(Converter): + """ + Represents an Intersphinx inventory URL. + + This converter checks whether intersphinx accepts the given inventory URL, and raises + `BadArgument` if that is not the case. + + Otherwise, it simply passes through the given URL. + """ + + @staticmethod + async def convert(ctx: Context, url: str) -> str: + """Convert url to Intersphinx inventory URL.""" + await ctx.trigger_typing() + if await _inventory_parser.fetch_inventory(ctx.bot.http_session, url) is None: + raise BadArgument(f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS}.") + return url + + class Snowflake(IDConverter): """ Converts to an int if the argument is a valid Discord snowflake. 
diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ee89f5384..25477fe07 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -16,11 +16,11 @@ from discord.ext import commands from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import PackageName, ValidURL +from bot.converters import InventoryURL, PackageName, ValidURL from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion from bot.utils.scheduling import Scheduler -from ._inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory +from ._inventory_parser import fetch_inventory from ._parsing import get_symbol_markdown from ._redis_cache import DocRedisCache @@ -159,25 +159,6 @@ class CachedParser: self._item_events.clear() -class InventoryURL(commands.Converter): - """ - Represents an Intersphinx inventory URL. - - This converter checks whether intersphinx accepts the given inventory URL, and raises - `BadArgument` if that is not the case. - - Otherwise, it simply passes through the given URL. 
- """ - - @staticmethod - async def convert(ctx: commands.Context, url: str) -> str: - """Convert url to Intersphinx inventory URL.""" - await ctx.trigger_typing() - if await fetch_inventory(ctx.bot.http_session, url) is None: - raise commands.BadArgument(f"Failed to fetch inventory file after {FAILED_REQUEST_ATTEMPTS}.") - return url - - class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" -- cgit v1.2.3 From 2836ce6f24d66949376a1defbf3813ffae8b7f47 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 13:45:43 +0100 Subject: Relock Pipfile.lock --- Pipfile.lock | 434 +++++++++++++++++++---------------------------------------- 1 file changed, 136 insertions(+), 298 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index becd85c55..f622d9e01 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "073fd0c51749aafa188fdbe96c5b90dd157cb1d23bdd144801fb0d0a369ffa88" + "sha256": "35130d225126e341941fe36e4193fe53aa253e193a50505054a87f48ab7f7c8c" }, "pipfile-spec": 6, "requires": { @@ -34,21 +34,22 @@ }, "aiohttp": { "hashes": [ - "sha256:1e984191d1ec186881ffaed4581092ba04f7c61582a177b187d3a2f07ed9719e", - "sha256:259ab809ff0727d0e834ac5e8a283dc5e3e0ecc30c4d80b3cd17a4139ce1f326", - "sha256:2f4d1a4fdce595c947162333353d4a44952a724fba9ca3205a3df99a33d1307a", - "sha256:32e5f3b7e511aa850829fbe5aa32eb455e5534eaa4b1ce93231d00e2f76e5654", - "sha256:344c780466b73095a72c616fac5ea9c4665add7fc129f285fbdbca3cccf4612a", - "sha256:460bd4237d2dbecc3b5ed57e122992f60188afe46e7319116da5eb8a9dfedba4", - "sha256:4c6efd824d44ae697814a2a85604d8e992b875462c6655da161ff18fd4f29f17", - "sha256:50aaad128e6ac62e7bf7bd1f0c0a24bc968a0c0590a726d5a955af193544bcec", - "sha256:6206a135d072f88da3e71cc501c59d5abffa9d0bb43269a6dcd28d66bfafdbdd", - "sha256:65f31b622af739a802ca6fd1a3076fd0ae523f8485c52924a89561ba10c49b48", - 
"sha256:ae55bac364c405caa23a4f2d6cfecc6a0daada500274ffca4a9230e7129eac59", - "sha256:b778ce0c909a2653741cb4b1ac7015b5c130ab9c897611df43ae6a58523cb965" + "sha256:1a4160579ffbc1b69e88cb6ca8bb0fbd4947dfcbf9fb1e2a4fc4c7a4a986c1fe", + "sha256:206c0ccfcea46e1bddc91162449c20c72f308aebdcef4977420ef329c8fcc599", + "sha256:2ad493de47a8f926386fa6d256832de3095ba285f325db917c7deae0b54a9fc8", + "sha256:319b490a5e2beaf06891f6711856ea10591cfe84fe9f3e71a721aa8f20a0872a", + "sha256:470e4c90da36b601676fe50c49a60d34eb8c6593780930b1aa4eea6f508dfa37", + "sha256:60f4caa3b7f7a477f66ccdd158e06901e1d235d572283906276e3803f6b098f5", + "sha256:66d64486172b032db19ea8522328b19cfb78a3e1e5b62ab6a0567f93f073dea0", + "sha256:687461cd974722110d1763b45c5db4d2cdee8d50f57b00c43c7590d1dd77fc5c", + "sha256:698cd7bc3c7d1b82bb728bae835724a486a8c376647aec336aa21a60113c3645", + "sha256:797456399ffeef73172945708810f3277f794965eb6ec9bd3a0c007c0476be98", + "sha256:a885432d3cabc1287bcf88ea94e1826d3aec57fd5da4a586afae4591b061d40d", + "sha256:c506853ba52e516b264b106321c424d03f3ddef2813246432fa9d1cefd361c81", + "sha256:fb83326d8295e8840e4ba774edf346e87eca78ba8a89c55d2690352842c15ba5" ], "index": "pypi", - "version": "==3.6.2" + "version": "==3.6.3" }, "aioping": { "hashes": [ @@ -68,18 +69,11 @@ }, "aiormq": { "hashes": [ - "sha256:106695a836f19c1af6c46b58e8aac80e00f86c5b3287a3c6483a1ee369cc95c9", - "sha256:9f6dbf6155fe2b7a3d24bf68de97fb812db0fac0a54e96bc1af14ea95078ba7f" + "sha256:8218dd9f7198d6e7935855468326bbacf0089f926c70baa8dd92944cb2496573", + "sha256:e584dac13a242589aaf42470fd3006cb0dc5aed6506cbd20357c7ec8bbe4a89e" ], "markers": "python_version >= '3.6'", - "version": "==3.2.3" - }, - "alabaster": { - "hashes": [ - "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", - "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" - ], - "version": "==0.7.12" + "version": "==3.3.1" }, "async-rediscache": { "extras": [ @@ -103,35 +97,27 @@ }, "attrs": { "hashes": [ - 
"sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594", - "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc" + "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6", + "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.2.0" - }, - "babel": { - "hashes": [ - "sha256:1aac2ae2d0d8ea368fa90906567f5c08463d98ade155c0c4bfedd6a0f7160e38", - "sha256:d670ea0b10f8b723672d3a6abeb87b565b244da220d76b4dba1b66269ec152d4" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.8.0" + "version": "==20.3.0" }, "beautifulsoup4": { "hashes": [ - "sha256:1edf5e39f3a5bc6e38b235b369128416c7239b34f692acccececb040233032a1", - "sha256:5dfe44f8fddc89ac5453f02659d3ab1668f2c0d9684839f0785037e8c6d9ac8d", - "sha256:645d833a828722357038299b7f6879940c11dddd95b900fe5387c258b72bb883" + "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", + "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", + "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" ], "index": "pypi", - "version": "==4.9.2" + "version": "==4.9.3" }, "certifi": { "hashes": [ - "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3", - "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41" + "sha256:1f422849db327d534e3d0c5f02a263458c3955ec0aae4ff09b95f195c59f4edd", + "sha256:f05def092c44fbf25834a51509ef6e631dc19765ab8a57b4e7ab85531f0a9cf4" ], - "version": "==2020.6.20" + "version": "==2020.11.8" }, "cffi": { "hashes": [ @@ -183,11 +169,12 @@ }, "colorama": { "hashes": [ - "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff", - "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1" + "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", + 
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" ], + "index": "pypi", "markers": "sys_platform == 'win32'", - "version": "==0.4.3" + "version": "==0.4.4" }, "coloredlogs": { "hashes": [ @@ -207,26 +194,18 @@ }, "discord.py": { "hashes": [ - "sha256:3acb61fde0d862ed346a191d69c46021e6063673f63963bc984ae09a685ab211", - "sha256:e71089886aa157341644bdecad63a72ff56b44406b1a6467b66db31c8e5a5a15" + "sha256:2367359e31f6527f8a936751fc20b09d7495dd6a76b28c8fb13d4ca6c55b7563", + "sha256:def00dc50cf36d21346d71bc89f0cad8f18f9a3522978dc18c7796287d47de8b" ], "index": "pypi", - "version": "==1.5.0" - }, - "docutils": { - "hashes": [ - "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", - "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==0.16" + "version": "==1.5.1" }, "fakeredis": { "hashes": [ - "sha256:7ea0866ba5edb40fe2e9b1722535df0c7e6b91d518aa5f50d96c2fff3ea7f4c2", - "sha256:aad8836ffe0319ffbba66dcf872ac6e7e32d1f19790e31296ba58445efb0a5c7" + "sha256:8070b7fce16f828beaef2c757a4354af91698685d5232404f1aeeb233529c7a5", + "sha256:f8c8ea764d7b6fd801e7f5486e3edd32ca991d506186f1923a01fc072e33c271" ], - "version": "==1.4.3" + "version": "==1.4.4" }, "feedparser": { "hashes": [ @@ -313,58 +292,48 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.10" }, - "imagesize": { - "hashes": [ - "sha256:6965f19a6a2039c7d48bca7dba2473069ff854c36ae6f19d2cde309d998228a1", - "sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.2.0" - }, - "jinja2": { - "hashes": [ - "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0", - "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035" - ], - "markers": 
"python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==2.11.2" - }, "lxml": { "hashes": [ - "sha256:05a444b207901a68a6526948c7cc8f9fe6d6f24c70781488e32fd74ff5996e3f", - "sha256:08fc93257dcfe9542c0a6883a25ba4971d78297f63d7a5a26ffa34861ca78730", - "sha256:107781b213cf7201ec3806555657ccda67b1fccc4261fb889ef7fc56976db81f", - "sha256:121b665b04083a1e85ff1f5243d4a93aa1aaba281bc12ea334d5a187278ceaf1", - "sha256:1fa21263c3aba2b76fd7c45713d4428dbcc7644d73dcf0650e9d344e433741b3", - "sha256:2b30aa2bcff8e958cd85d907d5109820b01ac511eae5b460803430a7404e34d7", - "sha256:4b4a111bcf4b9c948e020fd207f915c24a6de3f1adc7682a2d92660eb4e84f1a", - "sha256:5591c4164755778e29e69b86e425880f852464a21c7bb53c7ea453bbe2633bbe", - "sha256:59daa84aef650b11bccd18f99f64bfe44b9f14a08a28259959d33676554065a1", - "sha256:5a9c8d11aa2c8f8b6043d845927a51eb9102eb558e3f936df494e96393f5fd3e", - "sha256:5dd20538a60c4cc9a077d3b715bb42307239fcd25ef1ca7286775f95e9e9a46d", - "sha256:74f48ec98430e06c1fa8949b49ebdd8d27ceb9df8d3d1c92e1fdc2773f003f20", - "sha256:786aad2aa20de3dbff21aab86b2fb6a7be68064cbbc0219bde414d3a30aa47ae", - "sha256:7ad7906e098ccd30d8f7068030a0b16668ab8aa5cda6fcd5146d8d20cbaa71b5", - "sha256:80a38b188d20c0524fe8959c8ce770a8fdf0e617c6912d23fc97c68301bb9aba", - "sha256:8f0ec6b9b3832e0bd1d57af41f9238ea7709bbd7271f639024f2fc9d3bb01293", - "sha256:92282c83547a9add85ad658143c76a64a8d339028926d7dc1998ca029c88ea6a", - "sha256:94150231f1e90c9595ccc80d7d2006c61f90a5995db82bccbca7944fd457f0f6", - "sha256:9dc9006dcc47e00a8a6a029eb035c8f696ad38e40a27d073a003d7d1443f5d88", - "sha256:a76979f728dd845655026ab991df25d26379a1a8fc1e9e68e25c7eda43004bed", - "sha256:aa8eba3db3d8761db161003e2d0586608092e217151d7458206e243be5a43843", - "sha256:bea760a63ce9bba566c23f726d72b3c0250e2fa2569909e2d83cda1534c79443", - "sha256:c3f511a3c58676147c277eff0224c061dd5a6a8e1373572ac817ac6324f1b1e0", - "sha256:c9d317efde4bafbc1561509bfa8a23c5cab66c44d49ab5b63ff690f5159b2304", - 
"sha256:cc411ad324a4486b142c41d9b2b6a722c534096963688d879ea6fa8a35028258", - "sha256:cdc13a1682b2a6241080745b1953719e7fe0850b40a5c71ca574f090a1391df6", - "sha256:cfd7c5dd3c35c19cec59c63df9571c67c6d6e5c92e0fe63517920e97f61106d1", - "sha256:e1cacf4796b20865789083252186ce9dc6cc59eca0c2e79cca332bdff24ac481", - "sha256:e70d4e467e243455492f5de463b72151cc400710ac03a0678206a5f27e79ddef", - "sha256:ecc930ae559ea8a43377e8b60ca6f8d61ac532fc57efb915d899de4a67928efd", - "sha256:f161af26f596131b63b236372e4ce40f3167c1b5b5d459b29d2514bd8c9dc9ee" - ], - "index": "pypi", - "version": "==4.5.2" + "sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab", + "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b", + "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5", + "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301", + "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b", + "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d", + "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b", + "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9", + "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b", + "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311", + "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891", + "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a", + "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1", + "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856", + "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810", + "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51", + "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360", + "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4", + 
"sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f", + "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230", + "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a", + "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f", + "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174", + "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf", + "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd", + "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3", + "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a", + "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5", + "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367", + "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c", + "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1", + "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8", + "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f", + "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc", + "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d", + "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9", + "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f" + ], + "index": "pypi", + "version": "==4.6.1" }, "markdownify": { "hashes": [ @@ -374,52 +343,13 @@ "index": "pypi", "version": "==0.5.3" }, - "markupsafe": { - "hashes": [ - "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", - "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", - "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", - "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", - 
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", - "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", - "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", - "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", - "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", - "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", - "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", - "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", - "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", - "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", - "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", - "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", - "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", - "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", - "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", - "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", - "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", - "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", - "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", - "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", - "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", - "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", - "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", - "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", - "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", - "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", 
- "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", - "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.1.1" - }, "more-itertools": { "hashes": [ - "sha256:6f83822ae94818eae2612063a5101a7311e68ae8002005b5e05f03fd74a86a20", - "sha256:9b30f12df9393f0d28af9210ff8efe48d10c94f73e5daf886f10c4b0b0b4f03c" + "sha256:8e1a2a43b2f2727425f2b5839587ae37093f19153dc26c0927d1048ff6557330", + "sha256:b3a9005928e5bed54076e6e549c792b306fddfe72b2d1d22dd63d42d5d3899cf" ], "index": "pypi", - "version": "==8.5.0" + "version": "==8.6.0" }, "multidict": { "hashes": [ @@ -451,14 +381,6 @@ "markers": "python_version >= '3.5'", "version": "==4.0.2" }, - "packaging": { - "hashes": [ - "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8", - "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.4" - }, "pamqp": { "hashes": [ "sha256:2f81b5c186f668a67f165193925b6bfd83db4363a6222f599517f29ecee60b02", @@ -508,21 +430,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.20" }, - "pygments": { - "hashes": [ - "sha256:307543fe65c0947b126e83dd5a61bd8acbd84abec11f43caebaf5534cbc17998", - "sha256:926c3f319eda178d1bd90851e4317e6d8cdb5e292a3386aac9bd75eca29cf9c7" - ], - "markers": "python_version >= '3.5'", - "version": "==2.7.1" - }, - "pyparsing": { + "pyreadline": { "hashes": [ - "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", - "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" + "sha256:4530592fc2e85b25b1a9f79664433da09237c1a270e4d78ea5aa3a2c7229e2d1", + 
"sha256:65540c21bfe14405a3a77e4c085ecfce88724743a4ead47c66b84defcf82c32e", + "sha256:9ce5fa65b8992dfa373bddc5b6e0864ead8f291c94fbfec05fbd5c836162e67b" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.4.7" + "markers": "sys_platform == 'win32'", + "version": "==2.1" }, "python-dateutil": { "hashes": [ @@ -532,13 +447,6 @@ "index": "pypi", "version": "==2.8.1" }, - "pytz": { - "hashes": [ - "sha256:a494d53b6d39c3c6e44c3bec237336e14305e4f29bbf800b599253057fbb79ed", - "sha256:c35965d010ce31b23eeb663ed3cc8c906275d6be1a34393a1d73a41febf4a048" - ], - "version": "==2020.1" - }, "pyyaml": { "hashes": [ "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97", @@ -564,21 +472,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==3.5.3" }, - "requests": { - "hashes": [ - "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b", - "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898" - ], - "index": "pypi", - "version": "==2.24.0" - }, "sentry-sdk": { "hashes": [ - "sha256:c9c0fa1412bad87104c4eee8dd36c7bbf60b0d92ae917ab519094779b22e6d9a", - "sha256:e159f7c919d19ae86e5a4ff370fccc45149fab461fbeb93fb5a735a0b33a9cb1" + "sha256:17b725df2258354ccb39618ae4ead29651aa92c01a92acf72f98efe06ee2e45a", + "sha256:9040539485226708b5cad0401d76628fba4eed9154bf301c50579767afe344fd" ], "index": "pypi", - "version": "==0.17.8" + "version": "==0.19.2" }, "six": { "hashes": [ @@ -588,19 +488,12 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, - "snowballstemmer": { - "hashes": [ - "sha256:209f257d7533fdb3cb73bdbd24f436239ca3b2fa67d56f6ff88e86be08cc5ef0", - "sha256:df3bac3df4c2c01363f3dd2cfa78cce2840a79b9f1c2d2de9ce8d31683992f52" - ], - "version": "==2.0.0" - }, "sortedcontainers": { "hashes": [ - "sha256:4e73a757831fc3ca4de2859c422564239a31d8213d09a2a666e375807034d2ba", 
- "sha256:c633ebde8580f241f274c1f8994a665c0e54a17724fecd0cae2f079e09c36d3f" + "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", + "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" ], - "version": "==2.2.2" + "version": "==2.3.0" }, "soupsieve": { "hashes": [ @@ -610,62 +503,6 @@ "markers": "python_version >= '3.0'", "version": "==2.0.1" }, - "sphinx": { - "hashes": [ - "sha256:b4c750d546ab6d7e05bdff6ac24db8ae3e8b8253a3569b754e445110a0a12b66", - "sha256:fc312670b56cb54920d6cc2ced455a22a547910de10b3142276495ced49231cb" - ], - "index": "pypi", - "version": "==2.4.4" - }, - "sphinxcontrib-applehelp": { - "hashes": [ - "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", - "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-devhelp": { - "hashes": [ - "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", - "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-htmlhelp": { - "hashes": [ - "sha256:3c0bc24a2c41e340ac37c85ced6dafc879ab485c095b1d65d2461ac2f7cca86f", - "sha256:e8f5bb7e31b2dbb25b9cc435c8ab7a79787ebf7f906155729338f3156d93659b" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.3" - }, - "sphinxcontrib-jsmath": { - "hashes": [ - "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", - "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.1" - }, - "sphinxcontrib-qthelp": { - "hashes": [ - "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", - "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.3" - }, - 
"sphinxcontrib-serializinghtml": { - "hashes": [ - "sha256:eaa0eccc86e982a9b939b2b82d12cc5d013385ba5eadcc7e4fed23f4405f77bc", - "sha256:f242a81d423f59617a8e5cf16f5d4d74e28ee9a66f9e5b637a18082991db5a9a" - ], - "markers": "python_version >= '3.5'", - "version": "==1.1.4" - }, "statsd": { "hashes": [ "sha256:c610fb80347fca0ef62666d241bce64184bd7cc1efe582f9690e045c25535eaa", @@ -676,34 +513,34 @@ }, "urllib3": { "hashes": [ - "sha256:91056c15fa70756691db97756772bb1eb9678fa585d9184f24534b100dc60f4a", - "sha256:e7983572181f5e1522d9c98453462384ee92a0be7fac5f1413a1e35c56cc0461" + "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2", + "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.25.10" + "version": "==1.25.11" }, "yarl": { "hashes": [ - "sha256:04a54f126a0732af75e5edc9addeaa2113e2ca7c6fce8974a63549a70a25e50e", - "sha256:3cc860d72ed989f3b1f3abbd6ecf38e412de722fb38b8f1b1a086315cf0d69c5", - "sha256:5d84cc36981eb5a8533be79d6c43454c8e6a39ee3118ceaadbd3c029ab2ee580", - "sha256:5e447e7f3780f44f890360ea973418025e8c0cdcd7d6a1b221d952600fd945dc", - "sha256:61d3ea3c175fe45f1498af868879c6ffeb989d4143ac542163c45538ba5ec21b", - "sha256:67c5ea0970da882eaf9efcf65b66792557c526f8e55f752194eff8ec722c75c2", - "sha256:6f6898429ec3c4cfbef12907047136fd7b9e81a6ee9f105b45505e633427330a", - "sha256:7ce35944e8e61927a8f4eb78f5bc5d1e6da6d40eadd77e3f79d4e9399e263921", - "sha256:b7c199d2cbaf892ba0f91ed36d12ff41ecd0dde46cbf64ff4bfe997a3ebc925e", - "sha256:c15d71a640fb1f8e98a1423f9c64d7f1f6a3a168f803042eaf3a5b5022fde0c1", - "sha256:c22607421f49c0cb6ff3ed593a49b6a99c6ffdeaaa6c944cdda83c2393c8864d", - "sha256:c604998ab8115db802cc55cb1b91619b2831a6128a62ca7eea577fc8ea4d3131", - "sha256:d088ea9319e49273f25b1c96a3763bf19a882cff774d1792ae6fba34bd40550a", - 
"sha256:db9eb8307219d7e09b33bcb43287222ef35cbcf1586ba9472b0a4b833666ada1", - "sha256:e31fef4e7b68184545c3d68baec7074532e077bd1906b040ecfba659737df188", - "sha256:e32f0fb443afcfe7f01f95172b66f279938fbc6bdaebe294b0ff6747fb6db020", - "sha256:fcbe419805c9b20db9a51d33b942feddbf6e7fb468cb20686fd7089d4164c12a" + "sha256:040b237f58ff7d800e6e0fd89c8439b841f777dd99b4a9cca04d6935564b9409", + "sha256:17668ec6722b1b7a3a05cc0167659f6c95b436d25a36c2d52db0eca7d3f72593", + "sha256:3a584b28086bc93c888a6c2aa5c92ed1ae20932f078c46509a66dce9ea5533f2", + "sha256:4439be27e4eee76c7632c2427ca5e73703151b22cae23e64adb243a9c2f565d8", + "sha256:48e918b05850fffb070a496d2b5f97fc31d15d94ca33d3d08a4f86e26d4e7c5d", + "sha256:9102b59e8337f9874638fcfc9ac3734a0cfadb100e47d55c20d0dc6087fb4692", + "sha256:9b930776c0ae0c691776f4d2891ebc5362af86f152dd0da463a6614074cb1b02", + "sha256:b3b9ad80f8b68519cc3372a6ca85ae02cc5a8807723ac366b53c0f089db19e4a", + "sha256:bc2f976c0e918659f723401c4f834deb8a8e7798a71be4382e024bcc3f7e23a8", + "sha256:c22c75b5f394f3d47105045ea551e08a3e804dc7e01b37800ca35b58f856c3d6", + "sha256:c52ce2883dc193824989a9b97a76ca86ecd1fa7955b14f87bf367a61b6232511", + "sha256:ce584af5de8830d8701b8979b18fcf450cef9a382b1a3c8ef189bedc408faf1e", + "sha256:da456eeec17fa8aa4594d9a9f27c0b1060b6a75f2419fe0c00609587b2695f4a", + "sha256:db6db0f45d2c63ddb1a9d18d1b9b22f308e52c83638c26b422d520a815c4b3fb", + "sha256:df89642981b94e7db5596818499c4b2219028f2a528c9c37cc1de45bf2fd3a3f", + "sha256:f18d68f2be6bf0e89f1521af2b1bb46e66ab0018faafa81d70f358153170a317", + "sha256:f379b7f83f23fe12823085cd6b906edc49df969eb99757f58ff382349a3303c6" ], "markers": "python_version >= '3.5'", - "version": "==1.6.0" + "version": "==1.5.1" } }, "develop": { @@ -716,11 +553,11 @@ }, "attrs": { "hashes": [ - "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594", - "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc" + "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6", + 
"sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.2.0" + "version": "==20.3.0" }, "cfgv": { "hashes": [ @@ -786,19 +623,19 @@ }, "flake8": { "hashes": [ - "sha256:15e351d19611c887e482fb960eae4d44845013cc142d42896e9862f775d8cf5c", - "sha256:f04b9fcbac03b0a3e58c0ab3a0ecc462e023a9faf046d57794184028123aa208" + "sha256:749dbbd6bfd0cf1318af27bf97a14e28e5ff548ef8e5b1566ccfb25a11e7c839", + "sha256:aadae8761ec651813c24be05c6f7b4680857ef6afaae4651a4eccaef97ce6c3b" ], "index": "pypi", - "version": "==3.8.3" + "version": "==3.8.4" }, "flake8-annotations": { "hashes": [ - "sha256:09fe1aa3f40cb8fef632a0ab3614050a7584bb884b6134e70cf1fc9eeee642fa", - "sha256:5bda552f074fd6e34276c7761756fa07d824ffac91ce9c0a8555eb2bc5b92d7a" + "sha256:0bcebb0792f1f96d617ded674dca7bf64181870bfe5dace353a1483551f8e5f1", + "sha256:bebd11a850f6987a943ce8cdff4159767e0f5f89b3c88aca64680c2175ee02df" ], "index": "pypi", - "version": "==2.4.0" + "version": "==2.4.1" }, "flake8-bugbear": { "hashes": [ @@ -856,11 +693,11 @@ }, "identify": { "hashes": [ - "sha256:7c22c384a2c9b32c5cc891d13f923f6b2653aa83e2d75d8f79be240d6c86c4f4", - "sha256:da683bfb7669fa749fc7731f378229e2dbf29a1d1337cbde04106f02236eb29d" + "sha256:5dd84ac64a9a115b8e0b27d1756b244b882ad264c3c423f42af8235a6e71ca12", + "sha256:c9504ba6a043ee2db0a9d69e43246bc138034895f6338d5aed1b41e4a73b1513" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.5.5" + "version": "==1.5.9" }, "mccabe": { "hashes": [ @@ -886,11 +723,11 @@ }, "pre-commit": { "hashes": [ - "sha256:810aef2a2ba4f31eed1941fc270e72696a1ad5590b9751839c90807d0fff6b9a", - "sha256:c54fd3e574565fe128ecc5e7d2f91279772ddb03f8729645fa812fe809084a70" + "sha256:22e6aa3bd571debb01eb7d34483f11c01b65237be4eebbf30c3d4fb65762d315", + "sha256:905ebc9b534b991baec87e934431f2d0606ba27f2b90f7f652985f5a5b8b6ae6" ], "index": "pypi", 
- "version": "==2.7.1" + "version": "==2.8.2" }, "pycodestyle": { "hashes": [ @@ -950,10 +787,11 @@ }, "toml": { "hashes": [ - "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f", - "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88" + "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", + "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" ], - "version": "==0.10.1" + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.2" }, "unittest-xml-reporting": { "hashes": [ @@ -965,11 +803,11 @@ }, "virtualenv": { "hashes": [ - "sha256:43add625c53c596d38f971a465553f6318decc39d98512bc100fa1b1e839c8dc", - "sha256:e0305af10299a7fb0d69393d8f04cb2965dda9351140d11ac8db4e5e3970451b" + "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2", + "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.0.31" + "version": "==20.1.0" } } } -- cgit v1.2.3 From 70ee01b8726921e8389abd4f69ffb0e2ceee0773 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 18:22:11 +0100 Subject: Generalise tag filter hint to accept all containers --- bot/exts/info/doc/_parsing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 0883b9f42..93b6f0def 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -5,7 +5,7 @@ import re import string import textwrap from functools import partial -from typing import Callable, Collection, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Collection, Container, Iterable, List, Optional, TYPE_CHECKING, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, 
PageElement, Tag @@ -99,7 +99,7 @@ def _split_parameters(parameters_string: str) -> List[str]: def _find_elements_until_tag( start_element: PageElement, - end_tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], + end_tag_filter: Union[Container[str], Callable[[Tag], bool]], *, func: Callable, include_strings: bool = False, @@ -108,7 +108,7 @@ def _find_elements_until_tag( """ Get all elements up to `limit` or until a tag matching `tag_filter` is found. - `end_tag_filter` can be either a tuple of string names to check against, + `end_tag_filter` can be either a container of string names to check against, or a filtering callable that's applied to tags. When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. @@ -116,12 +116,12 @@ def _find_elements_until_tag( `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. The method is then iterated over and all elements until the matching tag or the limit are added to the return list. 
""" - use_tuple_filter = isinstance(end_tag_filter, tuple) + use_container_filter = not callable(end_tag_filter) elements = [] for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): if isinstance(element, Tag): - if use_tuple_filter: + if use_container_filter: if element.name in end_tag_filter: break elif end_tag_filter(element): -- cgit v1.2.3 From beebeac45cf487e59ca4d76a84472c898bc23b06 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 19:20:44 +0100 Subject: Rename variables for clarity --- bot/exts/info/doc/_cog.py | 4 ++-- bot/exts/info/doc/_parsing.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 25477fe07..4e48e81e5 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -227,8 +227,8 @@ class DocCog(commands.Cog): symbol = f"{group_name}.{symbol}" self.renamed_symbols.add(symbol) - elif (overridden_symbol_group := original_symbol.group) in FORCE_PREFIX_GROUPS: - overridden_symbol = f"{overridden_symbol_group}.{symbol}" + elif (original_symbol_group := original_symbol.group) in FORCE_PREFIX_GROUPS: + overridden_symbol = f"{original_symbol_group}.{symbol}" if overridden_symbol in self.renamed_symbols: overridden_symbol = f"{api_package_name}.{overridden_symbol}" diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 93b6f0def..9140f635a 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -42,9 +42,9 @@ _NO_SIGNATURE_GROUPS = { "templatetag", "term", } -_EMBED_CODE_BLOCK_LENGTH = 61 +_EMBED_CODE_BLOCK_LINE_LENGTH = 61 # _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight -_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT # Maximum discord message length - 
signatures on top _MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace @@ -189,7 +189,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: return signatures - max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) + max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) formatted_signatures = [] for signature in signatures: signature = signature.strip() @@ -221,12 +221,12 @@ def _get_truncated_description( max_length: int, ) -> str: """ - Truncate markdown from `elements` to be at most `max_length` characters visually. + Truncate markdown from `elements` to be at most `max_length` characters when rendered. `max_length` limits the length of the rendered characters in the string, with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits """ - visual_length = 0 + rendered_length = 0 real_length = 0 result = [] shortened = False @@ -234,7 +234,7 @@ def _get_truncated_description( for element in elements: is_tag = isinstance(element, Tag) element_length = len(element.text) if is_tag else len(element) - if visual_length + element_length < max_length: + if rendered_length + element_length < max_length: if is_tag: element_markdown = markdown_converter.process_tag(element) else: @@ -247,7 +247,7 @@ def _get_truncated_description( shortened = True break real_length += element_markdown_length - visual_length += element_length + rendered_length += element_length else: shortened = True break @@ -258,7 +258,7 @@ def _get_truncated_description( return markdown_string -def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: +def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: """ Create 
a markdown string with the signatures at the top, and the converted html description below them. @@ -309,4 +309,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: else: signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) - return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') + return _create_markdown(signature, description, symbol_data.url).replace('¶', '') -- cgit v1.2.3 From 7348b86bfedfc24c67d97a08d839a18956a6bff6 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 22:17:15 +0100 Subject: Update outdated docstring --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 9140f635a..82b2ca808 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -263,7 +263,7 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] Create a markdown string with the signatures at the top, and the converted html description below them. The signatures are wrapped in python codeblocks, separated from the description by a newline. - The result string is truncated to be max 1000 symbols long. + The result markdown string is max 750 rendered characters for the description with signatures at the start. 
""" description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) -- cgit v1.2.3 From ddb6b11575c05c8417f5607aec98fb1c09e351af Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 10 Nov 2020 22:22:27 +0100 Subject: Adjust unparseable symbol behaviour With redis we need to make sure we don't send the "error" string into the cache, returning None instead of the string and then setting it manually in the caller makes this nicer compared to checking against a string --- bot/exts/info/doc/_cog.py | 5 ++++- bot/exts/info/doc/_parsing.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 4e48e81e5..fa59bcc42 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -292,7 +292,10 @@ class DocCog(commands.Cog): if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) - await self.doc_cache.set(redis_key, markdown) + if markdown is not None: + await self.doc_cache.set(redis_key, markdown) + else: + markdown = "Unable to parse the requested symbol." embed = discord.Embed( title=discord.utils.escape_markdown(symbol), diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 82b2ca808..72e81982a 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -287,7 +287,7 @@ def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: return match_tag -def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. 
@@ -296,7 +296,7 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: symbol_heading = soup.find(id=symbol_data.symbol_id) if symbol_heading is None: log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") - return "Unable to parse the requested symbol." + return None signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. -- cgit v1.2.3 From d936e5bc049e2e93beca3c62430d048d9f9cf47b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 11 Nov 2020 18:23:01 +0100 Subject: Cancel scheduled inventory updates on all refreshes --- bot/exts/info/doc/_cog.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index fa59bcc42..822f682bf 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -250,6 +250,8 @@ class DocCog(commands.Cog): async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" log.debug("Refreshing documentation inventory...") + for inventory in self.scheduled_inventories: + self.inventory_scheduler.cancel(inventory) # Clear the old base URLS and doc symbols to ensure # that we start from a fresh local dataset. @@ -418,9 +420,6 @@ class DocCog(commands.Cog): """ await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') - if package_name in self.scheduled_inventories: - self.inventory_scheduler.cancel(package_name) - async with ctx.typing(): # Rebuild the inventory to ensure that everything # that was from this package is properly deleted. 
@@ -431,9 +430,6 @@ class DocCog(commands.Cog): @commands.has_any_role(*MODERATION_ROLES) async def refresh_command(self, ctx: commands.Context) -> None: """Refresh inventories and send differences to channel.""" - for inventory in self.scheduled_inventories: - self.inventory_scheduler.cancel(inventory) - old_inventories = set(self.base_urls) with ctx.typing(): await self.refresh_inventory() -- cgit v1.2.3 From 2bae8eeed0eae75d782da097e78826650e1ac498 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 12 Nov 2020 19:44:26 +0100 Subject: Intern relative url paths Group name interning was also moved to the DocItem creation to group the behaviour --- bot/exts/info/doc/_cog.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 822f682bf..ecc648d89 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -218,10 +218,8 @@ class DocCog(commands.Cog): for symbol, relative_doc_url in items: if "/" in symbol: continue # skip unreachable symbols with slashes - # Intern the group names since they're reused in all the DocItems - # to remove unnecessary memory consumption from them being unique objects - group_name = sys.intern(group.split(":")[1]) + group_name = group.split(":")[1] if (original_symbol := self.doc_symbols.get(symbol)) is not None: if group_name in FORCE_PREFIX_GROUPS: symbol = f"{group_name}.{symbol}" @@ -240,7 +238,14 @@ class DocCog(commands.Cog): self.renamed_symbols.add(symbol) relative_url_path, _, symbol_id = relative_doc_url.partition("#") - symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + # Intern fields that have shared content so we're not storing unique strings for every object + symbol_item = DocItem( + api_package_name, + sys.intern(group_name), + base_url, + sys.intern(relative_url_path), + symbol_id + ) self.doc_symbols[symbol] = symbol_item 
self.item_fetcher.add_item(symbol_item) -- cgit v1.2.3 From aeac77a08cdafadcc180a400c32ce21732d7d20d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 14 Nov 2020 02:39:07 +0100 Subject: Limit newlines in doc descriptions --- bot/exts/info/doc/_parsing.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 72e81982a..418405ca9 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -10,6 +10,7 @@ from typing import Callable, Collection, Container, Iterable, List, Optional, TY from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag +from bot.utils.helpers import find_nth_occurrence from ._html import Strainer from ._markdown import DocMarkdownConverter if TYPE_CHECKING: @@ -219,21 +220,23 @@ def _get_truncated_description( elements: Iterable[Union[Tag, NavigableString]], markdown_converter: DocMarkdownConverter, max_length: int, + max_lines: int, ) -> str: """ - Truncate markdown from `elements` to be at most `max_length` characters when rendered. + Truncate markdown from `elements` to be at most `max_length` characters when rendered or `max_lines` newlines. 
`max_length` limits the length of the rendered characters in the string, with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits """ + result = "" + markdown_element_ends = [] rendered_length = 0 - real_length = 0 - result = [] - shortened = False + tag_end_index = 0 for element in elements: is_tag = isinstance(element, Tag) element_length = len(element.text) if is_tag else len(element) + if rendered_length + element_length < max_length: if is_tag: element_markdown = markdown_converter.process_tag(element) @@ -241,21 +244,29 @@ def _get_truncated_description( element_markdown = markdown_converter.process_text(element) element_markdown_length = len(element_markdown) - if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: - result.append(element_markdown) - else: - shortened = True - break - real_length += element_markdown_length rendered_length += element_length + tag_end_index += element_markdown_length + + if not element_markdown.isspace(): + markdown_element_ends.append(tag_end_index) + result += element_markdown else: - shortened = True break - markdown_string = "".join(result) - if shortened: - markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." - return markdown_string + if not markdown_element_ends: + return "" + + newline_truncate_index = find_nth_occurrence(result, "\n", max_lines) + if newline_truncate_index is not None and newline_truncate_index < _MAX_DESCRIPTION_LENGTH: + truncate_index = newline_truncate_index + else: + truncate_index = _MAX_DESCRIPTION_LENGTH + + if truncate_index >= markdown_element_ends[-1]: + return result + + markdown_truncate_index = max(cut for cut in markdown_element_ends if cut < truncate_index) + return result[:markdown_truncate_index].strip(_TRUNCATE_STRIP_CHARACTERS) + "..." 
def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: @@ -265,7 +276,12 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] The signatures are wrapped in python codeblocks, separated from the description by a newline. The result markdown string is max 750 rendered characters for the description with signatures at the start. """ - description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) + description = _get_truncated_description( + description, + markdown_converter=DocMarkdownConverter(bullets="•", page_url=url), + max_length=750, + max_lines=13 + ) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) -- cgit v1.2.3 From b118f4cf38bdf99cf66e822c5b2280aff879123d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 14 Nov 2020 22:59:50 +0100 Subject: Rework the doc redis cache to work with hashes This rework requires us to delete packages caches easily with deleting the package hash instead of having to pattern match all keys and delete those. 
The interface was also updated to accept DocItems instead of requiring callers to construct the keys --- bot/exts/info/doc/_cog.py | 11 +++----- bot/exts/info/doc/_redis_cache.py | 57 +++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ecc648d89..67a21ed72 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,7 +4,6 @@ import asyncio import logging import re import sys -import urllib.parse from collections import defaultdict from contextlib import suppress from typing import Dict, List, NamedTuple, Optional, Union @@ -175,6 +174,7 @@ class DocCog(commands.Cog): self.scheduled_inventories = set() self.bot.loop.create_task(self.init_refresh_inventory()) + self.bot.loop.create_task(self.doc_cache.delete_expired()) async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" @@ -292,21 +292,18 @@ class DocCog(commands.Cog): return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") - item_url = f"{symbol_info.url}#{symbol_info.symbol_id}" - redis_key = "".join(urllib.parse.urlparse(item_url)[1:]) # url without scheme - - markdown = await self.doc_cache.get(redis_key) + markdown = await self.doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) if markdown is not None: - await self.doc_cache.set(redis_key, markdown) + await self.doc_cache.set(symbol_info, markdown) else: markdown = "Unable to parse the requested symbol." embed = discord.Embed( title=discord.utils.escape_markdown(symbol), - url=item_url, + url=f"{symbol_info.url}#{symbol_info.symbol_id}", description=markdown ) # Show all symbols with the same name that were renamed in the footer. 
diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index 147394ba6..c617eba49 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -1,23 +1,70 @@ -from typing import Optional +from __future__ import annotations + +import datetime +import pickle +from typing import Optional, TYPE_CHECKING from async_rediscache.types.base import RedisObject, namespace_lock +if TYPE_CHECKING: + from ._cog import DocItem class DocRedisCache(RedisObject): """Interface for redis functionality needed by the Doc cog.""" @namespace_lock - async def set(self, key: str, value: str) -> None: + async def set(self, item: DocItem, value: str) -> None: """ Set markdown `value` for `key`. Keys expire after a week to keep data up to date. """ + expiry_timestamp = datetime.datetime.now().timestamp() + 7 * 24 * 60 * 60 with await self._get_pool_connection() as connection: - await connection.setex(f"{self.namespace}:{key}", 7*24*60*60, value) + await connection.hset( + f"{self.namespace}:{item.package}", + self.get_item_key(item), + pickle.dumps((value, expiry_timestamp)) + ) @namespace_lock - async def get(self, key: str) -> Optional[str]: + async def get(self, item: DocItem) -> Optional[str]: """Get markdown contents for `key`.""" with await self._get_pool_connection() as connection: - return await connection.get(f"{self.namespace}:{key}", encoding="utf8") + cached_value = await connection.hget(f"{self.namespace}:{item.package}", self.get_item_key(item)) + if cached_value is None: + return None + + value, expire = pickle.loads(cached_value) + if expire <= datetime.datetime.now().timestamp(): + await connection.hdel(f"{self.namespace}:{item.package}", self.get_item_key(item)) + return None + + return value + + @namespace_lock + async def delete(self, package: str) -> None: + """Remove all values for `package`.""" + with await self._get_pool_connection() as connection: + await connection.delete(f"{self.namespace}:{package}") + + 
@namespace_lock + async def delete_expired(self) -> None: + """Delete all expired keys.""" + current_timestamp = datetime.datetime.now().timestamp() + with await self._get_pool_connection() as connection: + async for package_key in connection.iscan(match=f"{self.namespace}*"): + expired_fields = [] + + for field, cached_value in (await connection.hgetall(package_key)).items(): + _, expire = pickle.loads(cached_value) + if expire <= current_timestamp: + expired_fields.append(field) + + if expired_fields: + await connection.hdel(package_key, *expired_fields) + + @staticmethod + def get_item_key(item: DocItem) -> str: + """Create redis key for `item`.""" + return item.relative_url_path + item.symbol_id -- cgit v1.2.3 From 07a5d5fc58a402f930505c7b29a7a275e743a84d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 14 Nov 2020 23:07:13 +0100 Subject: Update existing redis values when parsing pages If we're parsing a page for a symbol that's out of the cache and encounter a symbol that was already cached we can update that symbol to keep it up to date without additional requests --- bot/exts/info/doc/_cog.py | 14 ++++++++------ bot/exts/info/doc/_redis_cache.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 67a21ed72..678134f3c 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -36,6 +36,8 @@ FORCE_PREFIX_GROUPS = ( WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay +doc_cache = DocRedisCache(namespace="Docs") + class DocItem(NamedTuple): """Holds inventory symbol information.""" @@ -116,7 +118,9 @@ class CachedParser: while self._queue: item, soup = self._queue.pop() try: - self._results[item] = get_symbol_markdown(soup, item) + markdown = get_symbol_markdown(soup, item) + await doc_cache.set_if_exists(item, markdown) + self._results[item] = 
markdown except Exception: log.exception(f"Unexpected error when handling {item}") else: @@ -161,8 +165,6 @@ class CachedParser: class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" - doc_cache = DocRedisCache() - def __init__(self, bot: Bot): self.base_urls = {} self.bot = bot @@ -174,7 +176,7 @@ class DocCog(commands.Cog): self.scheduled_inventories = set() self.bot.loop.create_task(self.init_refresh_inventory()) - self.bot.loop.create_task(self.doc_cache.delete_expired()) + self.bot.loop.create_task(doc_cache.delete_expired()) async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" @@ -292,12 +294,12 @@ class DocCog(commands.Cog): return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") - markdown = await self.doc_cache.get(symbol_info) + markdown = await doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) if markdown is not None: - await self.doc_cache.set(symbol_info, markdown) + await doc_cache.set(symbol_info, markdown) else: markdown = "Unable to parse the requested symbol." diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index c617eba49..2230884c9 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -27,6 +27,23 @@ class DocRedisCache(RedisObject): pickle.dumps((value, expiry_timestamp)) ) + @namespace_lock + async def set_if_exists(self, item: DocItem, value: str) -> None: + """ + Set markdown `value` for `key` if `key` exists. + + Keys expire after a week to keep data up to date. 
+ """ + expiry_timestamp = datetime.datetime.now().timestamp() + 7 * 24 * 60 * 60 + + with await self._get_pool_connection() as connection: + if await connection.hexists(f"{self.namespace}:{item.package}", self.get_item_key(item)): + await connection.hset( + f"{self.namespace}:{item.package}", + self.get_item_key(item), + pickle.dumps((value, expiry_timestamp)) + ) + @namespace_lock async def get(self, item: DocItem) -> Optional[str]: """Get markdown contents for `key`.""" -- cgit v1.2.3 From 15e73b7d4148ff16d2d408eaf201ebd5a6fd1251 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 14 Nov 2020 23:34:39 +0100 Subject: Add command for clearing the cache of packages We also clear the cache when removing a package --- bot/exts/info/doc/_cog.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 678134f3c..b2d015b89 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -428,6 +428,7 @@ class DocCog(commands.Cog): # Rebuild the inventory to ensure that everything # that was from this package is properly deleted. 
await self.refresh_inventory() + await doc_cache.delete(package_name) await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @@ -450,3 +451,10 @@ class DocCog(commands.Cog): description=f"```diff\n{added}\n{removed}```" if added or removed else "" ) await ctx.send(embed=embed) + + @docs_group.command(name="cleardoccache") + @commands.has_any_role(*MODERATION_ROLES) + async def clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: + """Clear persistent redis cache for `package`.""" + await doc_cache.delete(package_name) + await ctx.send(f"Succesfully cleared cache for {package_name}") -- cgit v1.2.3 From 531ee4aad5432860afa784d0c067019662b3a0fe Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 15 Nov 2020 02:35:37 +0100 Subject: Ensure packages from PRIORITY_PACKAGES are directly accessible Some packages (currently only python) should be prioritised to others, the previous cleanup didn't account for other packages loading before it which resulted in duplicate symbols getting the python prefix and the original symbols linking to most probably undesired pages --- bot/exts/info/doc/_cog.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index b2d015b89..9e4bb54ea 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -33,6 +33,9 @@ FORCE_PREFIX_GROUPS = ( "pdbcommand", "term", ) +PRIORITY_PACKAGES = ( + "python", +) WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay @@ -235,6 +238,10 @@ class DocCog(commands.Cog): self.doc_symbols[overridden_symbol] = original_symbol self.renamed_symbols.add(overridden_symbol) + elif api_package_name in PRIORITY_PACKAGES: + self.doc_symbols[f"{original_symbol.package}.{symbol}"] = original_symbol + self.renamed_symbols.add(symbol) 
+ else: symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) -- cgit v1.2.3 From 0d3d2bd632e2ed2e14eaacb7db9b49de4cd4baa5 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 29 Nov 2020 04:12:04 +0100 Subject: Use timedelta instead of constructing duration manually A newline was also added to set to keep it consistent with set_if_exists --- bot/exts/info/doc/_redis_cache.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index 2230884c9..e8577aa64 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -19,7 +19,8 @@ class DocRedisCache(RedisObject): Keys expire after a week to keep data up to date. """ - expiry_timestamp = datetime.datetime.now().timestamp() + 7 * 24 * 60 * 60 + expiry_timestamp = (datetime.datetime.now() + datetime.timedelta(weeks=1)).timestamp() + with await self._get_pool_connection() as connection: await connection.hset( f"{self.namespace}:{item.package}", @@ -34,7 +35,7 @@ class DocRedisCache(RedisObject): Keys expire after a week to keep data up to date. 
""" - expiry_timestamp = datetime.datetime.now().timestamp() + 7 * 24 * 60 * 60 + expiry_timestamp = (datetime.datetime.now() + datetime.timedelta(weeks=1)).timestamp() with await self._get_pool_connection() as connection: if await connection.hexists(f"{self.namespace}:{item.package}", self.get_item_key(item)): -- cgit v1.2.3 From e22deb55de286c4186da2f0d2f2d562b9e333630 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 29 Nov 2020 04:34:41 +0100 Subject: Use pop instead of getitem and del Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 9e4bb54ea..e29e3b717 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -139,9 +139,8 @@ class CachedParser: # The parse queue stores soups along with the doc symbols in QueueItem objects, # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. 
item_index = self._queue.index(item) - queue_item = self._queue[item_index] + queue_item = self._queue.pop(item_index) - del self._queue[item_index] self._queue.append(queue_item) def add_item(self, doc_item: DocItem) -> None: -- cgit v1.2.3 From ad90978fd7c038429b715f30519c01d546441afc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 29 Nov 2020 04:35:43 +0100 Subject: Clear up docstring so it doesn't rely on private attribute Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index e29e3b717..bd9b589ce 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -135,7 +135,7 @@ class CachedParser: log.trace("Finished parsing queue.") def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: - """Move `item` to the front of the parse queue.""" + """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" # The parse queue stores soups along with the doc symbols in QueueItem objects, # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. 
item_index = self._queue.index(item) -- cgit v1.2.3 From b094a6fa0dc9d9c2fde75cd79c95c87582f5e23d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 29 Nov 2020 04:44:17 +0100 Subject: Various grammar and sentence structure changes Co-authored-by: MarkKoz --- bot/converters.py | 2 +- bot/exts/info/doc/_cog.py | 19 ++++++++++--------- bot/exts/info/doc/_inventory_parser.py | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index 3066eaabb..901ba1cca 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -140,7 +140,7 @@ class PackageName(Converter): async def convert(cls, ctx: Context, argument: str) -> str: """Checks whether the given string is a valid package name.""" if cls.PACKAGE_NAME_RE.search(argument): - raise BadArgument("The provided package name is not valid, please only use the _ and a-z characters.") + raise BadArgument("The provided package name is not valid; please only use the _ and a-z characters.") return argument diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index bd9b589ce..ea91b2353 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -207,7 +207,7 @@ class DocCog(commands.Cog): if not package: delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 - log.info(f"Failed to fetch inventory, attempting again in {delay//60} minutes.") + log.info(f"Failed to fetch inventory; attempting again in {delay//60} minutes.") self.inventory_scheduler.schedule_later( delay, api_package_name, @@ -275,7 +275,7 @@ class DocCog(commands.Cog): self.scheduled_inventories.clear() await self.item_fetcher.clear() - # Run all coroutines concurrently - since each of them performs a HTTP + # Run all coroutines concurrently - since each of them performs an HTTP # request, this speeds up fetching the inventory data heavily. 
coros = [ self.update_single( @@ -322,7 +322,7 @@ class DocCog(commands.Cog): @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: - """Lookup documentation for Python symbols.""" + """Look up documentation for Python symbols.""" await ctx.invoke(self.get_command, symbol=symbol) @docs_group.command(name='getdoc', aliases=('g',)) @@ -414,7 +414,8 @@ class DocCog(commands.Cog): if await self.update_single(package_name, base_url, inventory_url) is None: await ctx.send( - f"Added package `{package_name}` to database but failed to fetch inventory; rescheduled in 2 minutes." + f"Added the package `{package_name}` to the database but failed to fetch inventory; " + f"trying again in 2 minutes." ) return await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") @@ -425,7 +426,7 @@ class DocCog(commands.Cog): """ Removes the specified package from the database. - Examples: + Example: !docs deletedoc aiohttp """ await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') @@ -435,12 +436,12 @@ class DocCog(commands.Cog): # that was from this package is properly deleted. 
await self.refresh_inventory() await doc_cache.delete(package_name) - await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") + await ctx.send(f"Successfully deleted `{package_name}` and refreshed the inventory.") @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @commands.has_any_role(*MODERATION_ROLES) async def refresh_command(self, ctx: commands.Context) -> None: - """Refresh inventories and send differences to channel.""" + """Refresh inventories and show the difference.""" old_inventories = set(self.base_urls) with ctx.typing(): await self.refresh_inventory() @@ -461,6 +462,6 @@ class DocCog(commands.Cog): @docs_group.command(name="cleardoccache") @commands.has_any_role(*MODERATION_ROLES) async def clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: - """Clear persistent redis cache for `package`.""" + """Clear the persistent redis cache for `package`.""" await doc_cache.delete(package_name) - await ctx.send(f"Succesfully cleared cache for {package_name}") + await ctx.send(f"Successfully cleared the cache for `{package_name}`.") diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index 23931869b..96df08786 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -101,17 +101,17 @@ async def fetch_inventory( inventory = await _fetch_inventory(client_session, url) except aiohttp.ClientConnectorError: log.warning( - f"Failed to connect to inventory url at {url}, " + f"Failed to connect to inventory url at {url}; " f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." ) except aiohttp.ClientError: log.error( - f"Failed to get inventory from {url}, " + f"Failed to get inventory from {url}; " f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." 
) except Exception: log.exception( - f"An unexpected error has occurred during fetching of {url}, " + f"An unexpected error has occurred during fetching of {url}; " f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." ) else: -- cgit v1.2.3 From 210f7d9b096b373935ab2a3f5f41989f4a081e35 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 29 Nov 2020 23:42:26 +0100 Subject: Remove redundant suppress --- bot/exts/info/doc/_cog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ea91b2353..7d57f65ad 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -215,8 +215,8 @@ class DocCog(commands.Cog): ) self.scheduled_inventories.add(api_package_name) return False - with suppress(KeyError): - self.scheduled_inventories.discard(api_package_name) + + self.scheduled_inventories.discard(api_package_name) for group, items in package.items(): for symbol, relative_doc_url in items: -- cgit v1.2.3 From 0e48ae679abc0937b4aad583b1b29ee0b3e3eb15 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 9 Dec 2020 13:40:01 +0100 Subject: Improve handling of strings Previously the code assumed ' and " can be used interchangeably, and strings that were inside of brackets were ignored for depth but their contents weren't causing strings like "ab[cd" to increase the depth --- bot/exts/info/doc/_parsing.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 418405ca9..e6103dde2 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -56,6 +56,15 @@ _BRACKET_PAIRS = { } +def _is_closing_quote(search_string: str, index: int) -> bool: + """Check whether the quote at `index` inside `search_string` can be a closing quote.""" + if search_string[index - 1] != "\\": + return 
True + elif search_string[index - 2] == "\\": + return True + return False + + def _split_parameters(parameters_string: str) -> List[str]: """ Split parameters of a signature into individual parameter strings on commas. @@ -67,9 +76,11 @@ def _split_parameters(parameters_string: str) -> List[str]: depth = 0 expected_end = None current_search = None + quote_character = None - for index, character in enumerate(parameters_string): - if character in _BRACKET_PAIRS: + enumerated_string = enumerate(parameters_string) + for index, character in enumerated_string: + if quote_character is None and character in _BRACKET_PAIRS: if current_search is None: current_search = character expected_end = _BRACKET_PAIRS[character] @@ -77,12 +88,22 @@ def _split_parameters(parameters_string: str) -> List[str]: depth += 1 elif character in {"'", '"'}: - if depth == 0: + if current_search is not None: + # We're currently searching for a bracket, skip all characters that belong to the string + # to avoid false positives of closing brackets + quote_character = character + for index, character in enumerated_string: + if character == quote_character and _is_closing_quote(parameters_string, index): + break + + elif depth == 0: depth += 1 - elif parameters_string[index-1] != "\\": - depth -= 1 - elif parameters_string[index-2] == "\\": - depth -= 1 + quote_character = character + elif character == quote_character: + if _is_closing_quote(parameters_string, index): + depth -= 1 + if depth == 0: + quote_character = None elif character == expected_end: depth -= 1 -- cgit v1.2.3 From 04aa50bc3ac3baca788392fb6a56a4ba43e678d4 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 9 Dec 2020 15:25:53 +0100 Subject: Merge current_search and expected_end in The two variables were initialized and cleared together and contained related information --- bot/exts/info/doc/_parsing.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git 
a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index e6103dde2..a8b38f400 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -4,6 +4,7 @@ import logging import re import string import textwrap +from collections import namedtuple from functools import partial from typing import Callable, Collection, Container, Iterable, List, Optional, TYPE_CHECKING, Union @@ -49,10 +50,12 @@ _MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AM # Maximum discord message length - signatures on top _MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace + +BracketPair = namedtuple("BracketPair", ["opening_bracket", "closing_bracket"]) _BRACKET_PAIRS = { - "{": "}", - "(": ")", - "[": "]", + "{": BracketPair("{", "}"), + "(": BracketPair("(", ")"), + "[": BracketPair("[", "]"), } @@ -74,17 +77,16 @@ def _split_parameters(parameters_string: str) -> List[str]: parameters_list = [] last_split = 0 depth = 0 - expected_end = None - current_search = None + current_search: Optional[BracketPair] = None quote_character = None enumerated_string = enumerate(parameters_string) for index, character in enumerated_string: if quote_character is None and character in _BRACKET_PAIRS: if current_search is None: - current_search = character - expected_end = _BRACKET_PAIRS[character] - if character == current_search: + current_search = _BRACKET_PAIRS[character] + depth = 1 + elif character == current_search.opening_bracket: depth += 1 elif character in {"'", '"'}: @@ -105,11 +107,10 @@ def _split_parameters(parameters_string: str) -> List[str]: if depth == 0: quote_character = None - elif character == expected_end: + elif current_search is not None and character == current_search.closing_bracket: depth -= 1 if depth == 0: current_search = None - expected_end = None elif depth == 0 and character == ",": parameters_list.append(parameters_string[last_split:index]) -- cgit v1.2.3 
From 50cbfbda930aab5492411863aaaf8f8cd5ef57fd Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 9 Dec 2020 15:26:53 +0100 Subject: Create a generator instead of returning a list The result of _split_parameters is only iterated over, so a list is not needed. Making it lazy may also save some time in cases where we don't use all parameters --- bot/exts/info/doc/_parsing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index a8b38f400..567786204 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -6,7 +6,7 @@ import string import textwrap from collections import namedtuple from functools import partial -from typing import Callable, Collection, Container, Iterable, List, Optional, TYPE_CHECKING, Union +from typing import Callable, Collection, Container, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag @@ -68,13 +68,12 @@ def _is_closing_quote(search_string: str, index: int) -> bool: return False -def _split_parameters(parameters_string: str) -> List[str]: +def _split_parameters(parameters_string: str) -> Iterator[str]: """ Split parameters of a signature into individual parameter strings on commas. Long string literals are not accounted for. 
""" - parameters_list = [] last_split = 0 depth = 0 current_search: Optional[BracketPair] = None @@ -113,11 +112,10 @@ def _split_parameters(parameters_string: str) -> List[str]: current_search = None elif depth == 0 and character == ",": - parameters_list.append(parameters_string[last_split:index]) + yield parameters_string[last_split:index] last_split = index + 1 - parameters_list.append(parameters_string[last_split:]) - return parameters_list + yield parameters_string[last_split:] def _find_elements_until_tag( -- cgit v1.2.3 From ea9b3e0e9ac74ea541f436f8021178f76f19af39 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 09:44:11 +0100 Subject: Restructure doc cache to handle caches of whole pages Previously we used packages as the top level keys and fields contained the url and the symbol id, however if we want to store all symbols from fetched pages instead of only the ones that were fetched by the users this comes worse off than using the page url in the field and setting EXPIREs for them instead of doing it manually in python. The new implementation uses package:url as the redis key and only the symbol id for field names, with the expire being set to a week on the key, this means we have to pattern match the keys when deleting the cache for a package but that's being done far less than the expire checking done previously. 
--- bot/exts/info/doc/_cog.py | 3 +- bot/exts/info/doc/_redis_cache.py | 95 +++++++++++++++------------------------ 2 files changed, 37 insertions(+), 61 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7d57f65ad..d1518f69d 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -122,7 +122,7 @@ class CachedParser: item, soup = self._queue.pop() try: markdown = get_symbol_markdown(soup, item) - await doc_cache.set_if_exists(item, markdown) + await doc_cache.set(item, markdown) self._results[item] = markdown except Exception: log.exception(f"Unexpected error when handling {item}") @@ -178,7 +178,6 @@ class DocCog(commands.Cog): self.scheduled_inventories = set() self.bot.loop.create_task(self.init_refresh_inventory()) - self.bot.loop.create_task(doc_cache.delete_expired()) async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index e8577aa64..52cb2bc94 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -1,7 +1,6 @@ from __future__ import annotations import datetime -import pickle from typing import Optional, TYPE_CHECKING from async_rediscache.types.base import RedisObject, namespace_lock @@ -12,77 +11,55 @@ if TYPE_CHECKING: class DocRedisCache(RedisObject): """Interface for redis functionality needed by the Doc cog.""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._set_expires = set() + @namespace_lock async def set(self, item: DocItem, value: str) -> None: """ - Set markdown `value` for `key`. + Set the Markdown `value` for the symbol `item`. - Keys expire after a week to keep data up to date. + All keys from a single page are stored together, expiring a week after the first set. 
""" - expiry_timestamp = (datetime.datetime.now() + datetime.timedelta(weeks=1)).timestamp() + url_key = remove_suffix(item.relative_url_path, ".html") + redis_key = f"{self.namespace}:{item.package}:{url_key}" + needs_expire = False with await self._get_pool_connection() as connection: - await connection.hset( - f"{self.namespace}:{item.package}", - self.get_item_key(item), - pickle.dumps((value, expiry_timestamp)) - ) - - @namespace_lock - async def set_if_exists(self, item: DocItem, value: str) -> None: - """ - Set markdown `value` for `key` if `key` exists. + if item.package+url_key not in self._set_expires: + self._set_expires.add(item.package+url_key) + needs_expire = not await connection.exists(redis_key) - Keys expire after a week to keep data up to date. - """ - expiry_timestamp = (datetime.datetime.now() + datetime.timedelta(weeks=1)).timestamp() - - with await self._get_pool_connection() as connection: - if await connection.hexists(f"{self.namespace}:{item.package}", self.get_item_key(item)): - await connection.hset( - f"{self.namespace}:{item.package}", - self.get_item_key(item), - pickle.dumps((value, expiry_timestamp)) - ) + await connection.hset(redis_key, item.symbol_id, value) + if needs_expire: + await connection.expire(redis_key, datetime.timedelta(weeks=1).total_seconds()) @namespace_lock async def get(self, item: DocItem) -> Optional[str]: - """Get markdown contents for `key`.""" - with await self._get_pool_connection() as connection: - cached_value = await connection.hget(f"{self.namespace}:{item.package}", self.get_item_key(item)) - if cached_value is None: - return None - - value, expire = pickle.loads(cached_value) - if expire <= datetime.datetime.now().timestamp(): - await connection.hdel(f"{self.namespace}:{item.package}", self.get_item_key(item)) - return None + """Return the Markdown content of the symbol `item` if it exists.""" + url_key = remove_suffix(item.relative_url_path, ".html") - return value - - @namespace_lock - async def 
delete(self, package: str) -> None: - """Remove all values for `package`.""" with await self._get_pool_connection() as connection: - await connection.delete(f"{self.namespace}:{package}") + return await connection.hget(f"{self.namespace}:{item.package}:{url_key}", item.symbol_id, encoding="utf8") @namespace_lock - async def delete_expired(self) -> None: - """Delete all expired keys.""" - current_timestamp = datetime.datetime.now().timestamp() + async def delete(self, package: str) -> bool: + """Remove all values for `package`; return True if at least one key was deleted, False otherwise.""" with await self._get_pool_connection() as connection: - async for package_key in connection.iscan(match=f"{self.namespace}*"): - expired_fields = [] - - for field, cached_value in (await connection.hgetall(package_key)).items(): - _, expire = pickle.loads(cached_value) - if expire <= current_timestamp: - expired_fields.append(field) - - if expired_fields: - await connection.hdel(package_key, *expired_fields) - - @staticmethod - def get_item_key(item: DocItem) -> str: - """Create redis key for `item`.""" - return item.relative_url_path + item.symbol_id + package_keys = [ + package_key async for package_key in connection.iscan(match=f"{self.namespace}:{package}:*") + ] + if package_keys: + await connection.delete(*package_keys) + return True + return False + + +def remove_suffix(string: str, suffix: str) -> str: + """Remove `suffix` from end of `string`.""" + # TODO replace usages with str.removesuffix on 3.9 + if string.endswith(suffix): + return string[:-len(suffix)] + else: + return string -- cgit v1.2.3 From c42bf69a8b170772710c2184a3d0d3d57f597c30 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 11:05:42 +0100 Subject: Use global bot http_session instead of parameter --- bot/converters.py | 2 +- bot/exts/info/doc/_cog.py | 12 ++++++------ bot/exts/info/doc/_inventory_parser.py | 13 ++++++------- 3 files changed, 13 
insertions(+), 14 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index d44b675a7..d558fa3df 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -190,7 +190,7 @@ class InventoryURL(Converter): async def convert(ctx: Context, url: str) -> str: """Convert url to Intersphinx inventory URL.""" await ctx.trigger_typing() - if await _inventory_parser.fetch_inventory(ctx.bot.http_session, url) is None: + if await _inventory_parser.fetch_inventory(url) is None: raise BadArgument(f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS}.") return url diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 524dcc829..e1be956cd 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -9,10 +9,10 @@ from contextlib import suppress from typing import Dict, List, NamedTuple, Optional, Union import discord -from aiohttp import ClientSession from bs4 import BeautifulSoup from discord.ext import commands +from bot import instance as bot_instance from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import InventoryURL, PackageName, ValidURL @@ -85,7 +85,7 @@ class CachedParser: self._item_events: Dict[DocItem, asyncio.Event] = {} self._parse_task = None - async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str: + async def get_markdown(self, doc_item: DocItem) -> str: """ Get result markdown of `doc_item`. @@ -96,7 +96,7 @@ class CachedParser: return symbol if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: - async with client_session.get(doc_item.url) as response: + async with bot_instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) @@ -202,7 +202,7 @@ class DocCog(commands.Cog): Return True on success; False if fetching failed and was rescheduled. 
""" self.base_urls[api_package_name] = base_url - package = await fetch_inventory(self.bot.http_session, inventory_url) + package = await fetch_inventory(inventory_url) if not package: delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 @@ -210,7 +210,7 @@ class DocCog(commands.Cog): self.inventory_scheduler.schedule_later( delay, api_package_name, - fetch_inventory(self.bot.http_session, inventory_url) + fetch_inventory(inventory_url) ) self.scheduled_inventories.add(api_package_name) return False @@ -302,7 +302,7 @@ class DocCog(commands.Cog): markdown = await doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") - markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) + markdown = await self.item_fetcher.get_markdown(symbol_info) if markdown is not None: await doc_cache.set(symbol_info, markdown) else: diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index 96df08786..0d9bd726a 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -6,6 +6,8 @@ from typing import AsyncIterator, DefaultDict, List, Optional, Tuple import aiohttp +import bot + log = logging.getLogger(__name__) FAILED_REQUEST_ATTEMPTS = 3 @@ -69,10 +71,10 @@ async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[ return invdata -async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: +async def _fetch_inventory(url: str) -> DefaultDict[str, List[Tuple[str, str]]]: """Fetch, parse and return an intersphinx inventory file from an url.""" timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) - async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: + async with bot.instance.http_session.get(url, timeout=timeout, raise_for_status=True) as response: stream = response.content inventory_header = 
(await stream.readline()).decode().rstrip() @@ -91,14 +93,11 @@ async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> D raise ValueError(f"Invalid inventory file at url {url}.") -async def fetch_inventory( - client_session: aiohttp.ClientSession, - url: str -) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: +async def fetch_inventory(url: str) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: """Get inventory from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): try: - inventory = await _fetch_inventory(client_session, url) + inventory = await _fetch_inventory(url) except aiohttp.ClientConnectorError: log.warning( f"Failed to connect to inventory url at {url}; " -- cgit v1.2.3 From fdff2491fc48bac0c55e0a506e7f7c395be13c0d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 23:41:38 +0100 Subject: Remove internal CachedParser result cache We no longer need to keep the items around since everything is in redis and the costs of always going through redis is fairly small --- bot/exts/info/doc/_cog.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index e1be956cd..d2bbf8c57 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -80,9 +80,8 @@ class CachedParser: def __init__(self): self._queue: List[QueueItem] = [] - self._results = {} self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_events: Dict[DocItem, asyncio.Event] = {} + self._item_futures: Dict[DocItem, asyncio.Future] = {} self._parse_task = None async def get_markdown(self, doc_item: DocItem) -> str: @@ -107,9 +106,8 @@ class CachedParser: self._parse_task = asyncio.create_task(self._parse_queue()) self._move_to_front(doc_item) - self._item_events[doc_item] = item_event = asyncio.Event() - await item_event.wait() - 
return self._results[doc_item] + self._item_futures[doc_item] = item_future = asyncio.Future() + return await item_future async def _parse_queue(self) -> None: """ @@ -123,12 +121,11 @@ class CachedParser: try: markdown = get_symbol_markdown(soup, item) await doc_cache.set(item, markdown) - self._results[item] = markdown except Exception: log.exception(f"Unexpected error when handling {item}") else: - if (event := self._item_events.get(item)) is not None: - event.set() + if (future := self._item_futures.get(item)) is not None: + future.set_result(markdown) await asyncio.sleep(0.1) self._parse_task = None @@ -153,15 +150,14 @@ class CachedParser: All currently requested items are waited to be parsed before clearing. """ - for event in self._item_events.values(): - await event.wait() + for future in self._item_futures.values(): + await future if self._parse_task is not None: self._parse_task.cancel() self._parse_task = None self._queue.clear() - self._results.clear() self._page_symbols.clear() - self._item_events.clear() + self._item_futures.clear() class DocCog(commands.Cog): -- cgit v1.2.3 From f6805c397c47d7dbfc2f38998c7de3556de69b42 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 23:42:36 +0100 Subject: Ensure only one future is created for each doc_item Previously in case get_markdown for an item ran twice, the one that ran second would overwrite the future created by the first one, potentially causing the coro to wait for it infinitely as _parse_queue would only be able to set the last future --- bot/exts/info/doc/_cog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index d2bbf8c57..78d9c6b9b 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -106,8 +106,9 @@ class CachedParser: self._parse_task = asyncio.create_task(self._parse_queue()) self._move_to_front(doc_item) - self._item_futures[doc_item] = 
item_future = asyncio.Future() - return await item_future + if doc_item not in self._item_futures: + self._item_futures[doc_item] = bot_instance.loop.create_future() + return await self._item_futures[doc_item] async def _parse_queue(self) -> None: """ -- cgit v1.2.3 From 121bdd16e8ee53d83822e9320232a65ea2ab540a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 23:44:59 +0100 Subject: Move parse_queue cleanup into finally block The finally will make sure we reset the task and log it no matter what happens, additionally the clearing of the variable is now only done in one place as the finally also executes when the coro is cancelled --- bot/exts/info/doc/_cog.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 78d9c6b9b..603d7df97 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -117,20 +117,21 @@ class CachedParser: The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. 
""" log.trace("Starting queue parsing.") - while self._queue: - item, soup = self._queue.pop() - try: - markdown = get_symbol_markdown(soup, item) - await doc_cache.set(item, markdown) - except Exception: - log.exception(f"Unexpected error when handling {item}") - else: - if (future := self._item_futures.get(item)) is not None: - future.set_result(markdown) - await asyncio.sleep(0.1) - - self._parse_task = None - log.trace("Finished parsing queue.") + try: + while self._queue: + item, soup = self._queue.pop() + try: + markdown = get_symbol_markdown(soup, item) + await doc_cache.set(item, markdown) + except Exception: + log.exception(f"Unexpected error when handling {item}") + else: + if (future := self._item_futures.get(item)) is not None: + future.set_result(markdown) + await asyncio.sleep(0.1) + finally: + self._parse_task = None + log.trace("Finished parsing queue.") def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" @@ -155,7 +156,6 @@ class CachedParser: await future if self._parse_task is not None: self._parse_task.cancel() - self._parse_task = None self._queue.clear() self._page_symbols.clear() self._item_futures.clear() -- cgit v1.2.3 From 97d0625823171a873393c8baf14212104b1ee955 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 11 Dec 2020 23:46:24 +0100 Subject: Provide feedback to user when no cache to clear was found While technically correct, always sending success could be misleading in case of a typo on the package --- bot/exts/info/doc/_cog.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 603d7df97..933f4500e 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -454,5 +454,7 @@ class DocCog(commands.Cog): @commands.has_any_role(*MODERATION_ROLES) async def clear_cache_command(self, ctx: 
commands.Context, package_name: PackageName) -> None: """Clear the persistent redis cache for `package`.""" - await doc_cache.delete(package_name) - await ctx.send(f"Successfully cleared the cache for `{package_name}`.") + if await doc_cache.delete(package_name): + await ctx.send(f"Successfully cleared the cache for `{package_name}`.") + else: + await ctx.send("No keys matching the package found.") -- cgit v1.2.3 From 30a3ce49fd346e4a2f4b3c9c12806a2aba8e9e16 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 02:37:03 +0100 Subject: Create function for merging function and decorator wrapper globals discord.py uses the globals of functions to resolve forward refs in commands, previously decorators applied before commands broke the bot with forwardrefs to names that weren't in the namespace of the module where they were defined, the new function takes care of merging the globals in a new function to mitigate this issue. closes: #1323 --- bot/decorators.py | 6 ++---- bot/utils/function.py | 27 +++++++++++++++++++++++++++ bot/utils/lock.py | 3 +-- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/bot/decorators.py b/bot/decorators.py index 063c8f878..3892e350f 100644 --- a/bot/decorators.py +++ b/bot/decorators.py @@ -71,7 +71,6 @@ def redirect_output(destination_channel: int, bypass_roles: t.Container[int] = N This decorator must go before (below) the `command` decorator. 
""" def wrap(func: t.Callable) -> t.Callable: - @wraps(func) async def inner(self: Cog, ctx: Context, *args, **kwargs) -> None: if ctx.channel.id == destination_channel: log.trace(f"Command {ctx.command.name} was invoked in destination_channel, not redirecting") @@ -106,7 +105,7 @@ def redirect_output(destination_channel: int, bypass_roles: t.Container[int] = N await ctx.message.delete() log.trace("Redirect output: Deleted invocation message") - return inner + return wraps(func)(function.update_wrapper_globals(inner, func)) return wrap @@ -123,7 +122,6 @@ def respect_role_hierarchy(member_arg: function.Argument) -> t.Callable: This decorator must go before (below) the `command` decorator. """ def decorator(func: t.Callable) -> t.Callable: - @wraps(func) async def wrapper(*args, **kwargs) -> None: log.trace(f"{func.__name__}: respect role hierarchy decorator called") @@ -151,5 +149,5 @@ def respect_role_hierarchy(member_arg: function.Argument) -> t.Callable: else: log.trace(f"{func.__name__}: {target.top_role=} < {actor.top_role=}; calling func") await func(*args, **kwargs) - return wrapper + return wraps(func)(function.update_wrapper_globals(wrapper, func)) return decorator diff --git a/bot/utils/function.py b/bot/utils/function.py index 3ab32fe3c..8b8c7ba5c 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -1,6 +1,7 @@ """Utilities for interaction with functions.""" import inspect +import types import typing as t Argument = t.Union[int, str] @@ -73,3 +74,29 @@ def get_bound_args(func: t.Callable, args: t.Tuple, kwargs: t.Dict[str, t.Any]) bound_args.apply_defaults() return bound_args.arguments + + +def update_wrapper_globals(wrapper: types.FunctionType, func: types.FunctionType) -> types.FunctionType: + """ + Update globals of `wrapper` with the globals from `func`. 
+ + For forwardrefs in command annotations discord.py uses the __globals__ attribute of the function + to resolve their values, with decorators that replace the function this breaks because they have + their own globals. + + This function creates a new function functionally identical to `wrapper`, which has the globals replaced with + a merge of `func`s globals and the `wrapper`s globals. + + In case a global name from `func` conflicts with a name from `wrapper`'s globals, `wrapper` will win + to keep it functional, but this may cause problems if the name is used as an annotation and + discord.py uses it as a converter on a parameter from `func`. + """ + new_globals = wrapper.__globals__.copy() + new_globals.update((k, v) for k, v in func.__globals__.items() if k not in wrapper.__code__.co_names) + return types.FunctionType( + code=wrapper.__code__, + globals=new_globals, + name=wrapper.__name__, + argdefs=wrapper.__defaults__, + closure=wrapper.__closure__, + ) diff --git a/bot/utils/lock.py b/bot/utils/lock.py index 7aaafbc88..cf87321c5 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -61,7 +61,6 @@ def lock(namespace: Hashable, resource_id: ResourceId, *, raise_error: bool = Fa def decorator(func: Callable) -> Callable: name = func.__name__ - @wraps(func) async def wrapper(*args, **kwargs) -> Any: log.trace(f"{name}: mutually exclusive decorator called")
the unlikely scenario that they got triggered together, the fetching part of the get command now also has to wait for the running inventory refresh to finish before proceeding to fetch and parse the html --- bot/exts/info/doc/_cog.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 933f4500e..11d17222d 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -17,6 +17,7 @@ from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import InventoryURL, PackageName, ValidURL from bot.pagination import LinePaginator +from bot.utils.lock import lock from bot.utils.messages import wait_for_deletion from bot.utils.scheduling import Scheduler from ._inventory_parser import fetch_inventory @@ -39,6 +40,10 @@ PRIORITY_PACKAGES = ( WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay +REFRESH_EVENT = asyncio.Event() +REFRESH_EVENT.set() +COMMAND_LOCK_SINGLETON = "inventory refresh" + doc_cache = DocRedisCache(namespace="Docs") @@ -91,9 +96,6 @@ class CachedParser: If no symbols were fetched from `doc_item`s page before, the HTML has to be fetched before parsing can be queued. 
""" - if (symbol := self._results.get(doc_item)) is not None: - return symbol - if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: async with bot_instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") @@ -176,6 +178,7 @@ class DocCog(commands.Cog): self.bot.loop.create_task(self.init_refresh_inventory()) + @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" await self.bot.wait_until_guild_available() @@ -258,6 +261,7 @@ class DocCog(commands.Cog): async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" + REFRESH_EVENT.clear() log.debug("Refreshing documentation inventory...") for inventory in self.scheduled_inventories: self.inventory_scheduler.cancel(inventory) @@ -279,6 +283,7 @@ class DocCog(commands.Cog): ) for package in await self.bot.api_client.get('bot/documentation-links') ] await asyncio.gather(*coros) + REFRESH_EVENT.set() async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: """ @@ -299,6 +304,9 @@ class DocCog(commands.Cog): markdown = await doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") + if not REFRESH_EVENT.is_set(): + log.debug("Waiting for inventories to be refreshed before processing item.") + await REFRESH_EVENT.wait() markdown = await self.item_fetcher.get_markdown(symbol_info) if markdown is not None: await doc_cache.set(symbol_info, markdown) @@ -374,6 +382,7 @@ class DocCog(commands.Cog): @docs_group.command(name='setdoc', aliases=('s',)) @commands.has_any_role(*MODERATION_ROLES) + @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def set_command( self, ctx: commands.Context, package_name: PackageName, base_url: ValidURL, inventory_url: InventoryURL @@ -413,6 +422,7 @@ class DocCog(commands.Cog): 
@docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) + @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: """ Removes the specified package from the database. @@ -431,6 +441,7 @@ class DocCog(commands.Cog): @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @commands.has_any_role(*MODERATION_ROLES) + @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def refresh_command(self, ctx: commands.Context) -> None: """Refresh inventories and show the difference.""" old_inventories = set(self.base_urls) -- cgit v1.2.3 From 9f11b453930b5abbab0b891e8b1ca0a2f9d013d0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 03:59:40 +0100 Subject: Simplify flow The else is a bit clearer than the early return --- bot/exts/info/doc/_cog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 11d17222d..5e7399afb 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -417,8 +417,8 @@ class DocCog(commands.Cog): f"Added the package `{package_name}` to the database but failed to fetch inventory; " f"trying again in 2 minutes." 
) - return - await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") + else: + await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) -- cgit v1.2.3 From d21540d56853bc33625b0e1b8e2227294706eedb Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:02:49 +0100 Subject: Clear up grammar Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 5e7399afb..d828e6b4a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -76,10 +76,10 @@ class QueueItem(NamedTuple): class CachedParser: """ - Get symbol markdown from pages with smarter caching. + Get the symbol Markdown from pages with smarter caching. DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. - `get_markdown` is used to fetch the markdown; when this is used for the first time on a page, + `get_markdown` is used to fetch the Markdown; when this is used for the first time on a page, all of the symbols are queued to be parsed to avoid multiple web requests to the same page. """ @@ -91,7 +91,7 @@ class CachedParser: async def get_markdown(self, doc_item: DocItem) -> str: """ - Get result markdown of `doc_item`. + Get the result Markdown of `doc_item`. If no symbols were fetched from `doc_item`s page before, the HTML has to be fetched before parsing can be queued. @@ -418,7 +418,7 @@ class DocCog(commands.Cog): f"trying again in 2 minutes." 
) else: - await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") + await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) -- cgit v1.2.3 From b3f9cc10b7fe50575fee74424ba26636007cbcdc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:16:19 +0100 Subject: Reuse form body to construct log message Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index d828e6b4a..61f770c0a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -407,9 +407,7 @@ class DocCog(commands.Cog): log.info( f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" - f"Package name: {package_name}\n" - f"Base url: {base_url}\n" - f"Inventory URL: {inventory_url}" + + "\n".join(f"{key}: {value}" for key, value in body.items()) ) if await self.update_single(package_name, base_url, inventory_url) is None: -- cgit v1.2.3 From 7aea86dd22572e9685ed8353428f14e90a9db321 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:22:20 +0100 Subject: Make reschedule delays a module constant --- bot/exts/info/doc/_cog.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 61f770c0a..30579894c 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -6,6 +6,7 @@ import re import sys from collections import defaultdict from contextlib import suppress +from types import SimpleNamespace from typing import Dict, List, NamedTuple, Optional, Union import discord @@ -39,6 +40,8 @@ PRIORITY_PACKAGES = ( ) WHITESPACE_AFTER_NEWLINES_RE = 
re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay +# Delay to wait before trying to reach a rescheduled inventory again, in minutes +FETCH_RESCHEDULE_DELAY = SimpleNamespace(first=2, repeated=5) REFRESH_EVENT = asyncio.Event() REFRESH_EVENT.set() @@ -197,7 +200,8 @@ class DocCog(commands.Cog): * `inventory_url` is the absolute URL to the intersphinx inventory. If the inventory file is currently unreachable, - the update is rescheduled to execute in 2 minutes on the first attempt, and 5 minutes on subsequent attempts. + the update is rescheduled to execute in FETCH_RESCHEDULE_DELAY.first minutes on the first attempt, + and FETCH_RESCHEDULE_DELAY.repeated minutes on the subsequent attempts. Return True on success; False if fetching failed and was rescheduled. """ @@ -205,7 +209,10 @@ class DocCog(commands.Cog): package = await fetch_inventory(inventory_url) if not package: - delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 + if inventory_url not in self.scheduled_inventories: + delay = FETCH_RESCHEDULE_DELAY.first * 60 + else: + delay = FETCH_RESCHEDULE_DELAY.repeated * 60 log.info(f"Failed to fetch inventory; attempting again in {delay//60} minutes.") self.inventory_scheduler.schedule_later( delay, @@ -413,7 +420,7 @@ class DocCog(commands.Cog): if await self.update_single(package_name, base_url, inventory_url) is None: await ctx.send( f"Added the package `{package_name}` to the database but failed to fetch inventory; " - f"trying again in 2 minutes." + f"trying again in {FETCH_RESCHEDULE_DELAY.first} minutes." 
) else: await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") -- cgit v1.2.3 From 73502611d1420a62f1e8c0a6ca51c02dc2c8f896 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:25:26 +0100 Subject: Call command method directly Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 30579894c..4cd28e29a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -334,7 +334,7 @@ class DocCog(commands.Cog): @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: """Look up documentation for Python symbols.""" - await ctx.invoke(self.get_command, symbol=symbol) + await self.get_command(ctx, symbol=symbol) @docs_group.command(name='getdoc', aliases=('g',)) async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: -- cgit v1.2.3 From 677f2ad91dbc16ef3a33c102e4932d99a65437da Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:26:22 +0100 Subject: Change param styling to be consistent with the repo --- bot/exts/info/doc/_cog.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 4cd28e29a..60e86353b 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -187,9 +187,7 @@ class DocCog(commands.Cog): await self.bot.wait_until_guild_available() await self.refresh_inventory() - async def update_single( - self, api_package_name: str, base_url: str, inventory_url: str - ) -> bool: + async def update_single(self, api_package_name: str, base_url: str, inventory_url: str) -> bool: """ Rebuild the inventory for a single package. 
@@ -391,8 +389,11 @@ class DocCog(commands.Cog): @commands.has_any_role(*MODERATION_ROLES) @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def set_command( - self, ctx: commands.Context, package_name: PackageName, - base_url: ValidURL, inventory_url: InventoryURL + self, + ctx: commands.Context, + package_name: PackageName, + base_url: ValidURL, + inventory_url: InventoryURL, ) -> None: """ Adds a new documentation metadata object to the site's database. -- cgit v1.2.3 From f988d3ec07c4ca814fa5ddb47a6e064c4bb32461 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:27:29 +0100 Subject: Use string addition instead of join With only two strings, the addition is a bit clearer than constructing and joining a tuple Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 60e86353b..1b5eaa6d5 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -62,7 +62,7 @@ class DocItem(NamedTuple): @property def url(self) -> str: """Return the absolute url to the symbol.""" - return "".join((self.base_url, self.relative_url_path)) + return self.base_url + self.relative_url_path class QueueItem(NamedTuple): -- cgit v1.2.3 From 9cfdeacb807442c27de08e2b66c49d998dfae5ce Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:29:36 +0100 Subject: Move copyright outside of license text Co-authored-by: MarkKoz --- LICENSE-THIRD-PARTY | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY index d454070c2..ab715630d 100644 --- a/LICENSE-THIRD-PARTY +++ b/LICENSE-THIRD-PARTY @@ -37,12 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--------------------------------------------------------------------------------------------------- BSD 2-Clause License Applies to: - - bot/cogs/doc/inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__. + - Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file). All rights reserved. + - bot/cogs/doc/inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__. --------------------------------------------------------------------------------------------------- -Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file). -All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -- cgit v1.2.3 From f416e42efce74082d155d9159114f698a97305cb Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:34:05 +0100 Subject: Return the sent message This allows the caller to work with the message further --- bot/utils/messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/utils/messages.py b/bot/utils/messages.py index 42bde358d..c42e4bacc 100644 --- a/bot/utils/messages.py +++ b/bot/utils/messages.py @@ -135,14 +135,14 @@ def sub_clyde(username: Optional[str]) -> Optional[str]: return username # Empty string or None -async def send_denial(ctx: Context, reason: str) -> None: +async def send_denial(ctx: Context, reason: str) -> discord.Message: """Send an embed denying the user with the given reason.""" embed = discord.Embed() embed.colour = discord.Colour.red() embed.title = random.choice(NEGATIVE_REPLIES) embed.description = reason - await ctx.send(embed=embed) + return await ctx.send(embed=embed) def format_user(user: discord.abc.User) -> str: -- cgit v1.2.3 From 9c6f3acac1334e885cc6b9d176a4b816bb68710a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:35:37 +0100 Subject: Use send_denial 
util instead of creating embed manually The symbol is also no longer sent back to the user, as it is not necessary and we can skip the cleanup on it --- bot/exts/info/doc/_cog.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 1b5eaa6d5..8c52b04cf 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -19,7 +19,7 @@ from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import InventoryURL, PackageName, ValidURL from bot.pagination import LinePaginator from bot.utils.lock import lock -from bot.utils.messages import wait_for_deletion +from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler from ._inventory_parser import fetch_inventory from ._parsing import get_symbol_markdown @@ -370,12 +370,7 @@ class DocCog(commands.Cog): doc_embed = await self.get_symbol_embed(symbol) if doc_embed is None: - symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) - error_embed = discord.Embed( - description=f"Sorry, I could not find any documentation for `{(symbol)}`.", - colour=discord.Colour.red() - ) - error_message = await ctx.send(embed=error_embed) + error_message = await send_denial(ctx, "No documentation found for the requested symbol.") await wait_for_deletion(error_message, (ctx.author.id,), timeout=NOT_FOUND_DELETE_DELAY) with suppress(discord.NotFound): await ctx.message.delete() -- cgit v1.2.3 From 2a855de33c79bfebee4c85757d26b5463c1fccce Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:42:46 +0100 Subject: Use cancel_all instead of manually calling cancel repeatedly --- bot/exts/info/doc/_cog.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 8c52b04cf..07a287572 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py 
@@ -268,8 +268,7 @@ class DocCog(commands.Cog): """Refresh internal documentation inventory.""" REFRESH_EVENT.clear() log.debug("Refreshing documentation inventory...") - for inventory in self.scheduled_inventories: - self.inventory_scheduler.cancel(inventory) + self.inventory_scheduler.cancel_all() # Clear the old base URLS and doc symbols to ensure # that we start from a fresh local dataset. -- cgit v1.2.3 From fdc24cf48fcd34b14098befc36bb3d4ce768dccd Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 12 Dec 2020 04:44:37 +0100 Subject: Strip whitespace from symbol Markdown before returning it The html we parse frequently ends up with trailing and sometimes leading newlines which get stripped out by discord anyway, we have no reason to keep those around when sending the Markdown over to redis --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 567786204..521034006 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -345,4 +345,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s else: signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) - return _create_markdown(signature, description, symbol_data.url).replace('¶', '') + return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip() -- cgit v1.2.3 From b827d9bc8b66b2b7cc3702056b473ebbaf601031 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 13 Dec 2020 05:48:27 +0100 Subject: Simplify the implementation of the custom strainer The strainer now forces the text attribute to be None, simplifying the check on strings and falls back to the superclass' method on non string elements --- bot/exts/info/doc/_html.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 
deletions(-) diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index bc705130d..88fbc8825 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -1,7 +1,9 @@ -from collections.abc import Iterable +import logging from typing import List, Union -from bs4.element import NavigableString, PageElement, SoupStrainer, Tag +from bs4.element import PageElement, SoupStrainer + +log = logging.getLogger(__name__) class Strainer(SoupStrainer): @@ -9,25 +11,18 @@ class Strainer(SoupStrainer): def __init__(self, *, include_strings: bool, **kwargs): self.include_strings = include_strings + passed_text = kwargs.pop("text", None) + if passed_text is not None: + log.warning("`text` is not a supported kwarg in the custom strainer.") super().__init__(**kwargs) markup_hint = Union[PageElement, List["markup_hint"]] def search(self, markup: markup_hint) -> Union[PageElement, str]: """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" - if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): - for element in markup: - if isinstance(element, NavigableString) and self.search(element): - return element - elif isinstance(markup, Tag): - # Also include tags while we're searching for strings and tags. - if self.include_strings or (not self.text or self.name or self.attrs): - return self.search_tag(markup) - - elif isinstance(markup, str): + if isinstance(markup, str): # Let everything through the text filter if we're including strings and tags. 
- text_filter = None if not self.include_strings else True - if not self.name and not self.attrs and self._matches(markup, text_filter): + if not self.name and not self.attrs and self.include_strings: return markup else: - raise Exception(f"I don't know how to match against a {markup.__class__}") + return super().search(markup) -- cgit v1.2.3 From 73d7d748a550e644980d2604542d279472eb1b0c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 14 Dec 2020 05:49:58 +0100 Subject: Run html parsing in an executor The parsing may take up to a few hundred ms depending on the amount of work it has to do --- bot/exts/info/doc/_cog.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 07a287572..093e5cdb7 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -6,6 +6,7 @@ import re import sys from collections import defaultdict from contextlib import suppress +from functools import partial from types import SimpleNamespace from typing import Dict, List, NamedTuple, Optional, Union @@ -126,7 +127,10 @@ class CachedParser: while self._queue: item, soup = self._queue.pop() try: - markdown = get_symbol_markdown(soup, item) + markdown = await bot_instance.loop.run_in_executor( + None, + partial(get_symbol_markdown, soup, item), + ) await doc_cache.set(item, markdown) except Exception: log.exception(f"Unexpected error when handling {item}") -- cgit v1.2.3 From a9dfeb195e53aba9b444959da8b16addea3574d2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 14 Dec 2020 05:50:45 +0100 Subject: Revert "Clear up docstring so it doesn't rely on private attribute" This reverts commit ad90978f --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 093e5cdb7..92190bc55 100644 --- a/bot/exts/info/doc/_cog.py +++ 
b/bot/exts/info/doc/_cog.py @@ -143,7 +143,7 @@ class CachedParser: log.trace("Finished parsing queue.") def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: - """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" + """Move `item` to the front of the parse queue.""" # The parse queue stores soups along with the doc symbols in QueueItem objects, # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. item_index = self._queue.index(item) -- cgit v1.2.3 From 2da9d443598bcf91c9eb6ab22963806a201fce01 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 14 Dec 2020 05:51:13 +0100 Subject: Clear up docstring so it doesn't rely on private attribute Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 92190bc55..6c51ab738 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -152,7 +152,7 @@ class CachedParser: self._queue.append(queue_item) def add_item(self, doc_item: DocItem) -> None: - """Add a DocItem to `_page_symbols`.""" + """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" self._page_symbols[doc_item.url].append(doc_item) async def clear(self) -> None: -- cgit v1.2.3 From cf00aff24d20a57c2c9178d6d9e30f5d33d9a426 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 15 Dec 2020 00:30:17 +0100 Subject: Create futures for all items in the queue Creating futures for everything and then awaiting at the end takes care of all the potential race conditions that may pop up from items that are parsed and sent to redis while the get_markdown method is in the middle of fetching a page. 
In case it happens with the implementation we'll just need to move the item to the front and the future will get a result set soon afterwards. --- bot/exts/info/doc/_cog.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 6c51ab738..0d344c363 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -7,6 +7,7 @@ import sys from collections import defaultdict from contextlib import suppress from functools import partial +from operator import attrgetter from types import SimpleNamespace from typing import Dict, List, NamedTuple, Optional, Union @@ -78,6 +79,14 @@ class QueueItem(NamedTuple): return NamedTuple.__eq__(self, other) +class ParseResultFuture(asyncio.Future): + """Future with the user_requested attribute to know which futures need to be waited for before clearing.""" + + def __init__(self): + super().__init__() + self.user_requested = False + + class CachedParser: """ Get the symbol Markdown from pages with smarter caching. @@ -90,7 +99,7 @@ class CachedParser: def __init__(self): self._queue: List[QueueItem] = [] self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_futures: Dict[DocItem, asyncio.Future] = {} + self._item_futures: Dict[DocItem, ParseResultFuture] = {} self._parse_task = None async def get_markdown(self, doc_item: DocItem) -> str: @@ -99,21 +108,25 @@ class CachedParser: If no symbols were fetched from `doc_item`s page before, the HTML has to be fetched before parsing can be queued. + + Not safe to run while `self.clear` is running. 
""" if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: async with bot_instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) + self._item_futures.update((symbol, ParseResultFuture()) for symbol in symbols_to_queue) del self._page_symbols[doc_item.url] log.debug(f"Added symbols from {doc_item.url} to parse queue.") if self._parse_task is None: self._parse_task = asyncio.create_task(self._parse_queue()) - self._move_to_front(doc_item) - if doc_item not in self._item_futures: - self._item_futures[doc_item] = bot_instance.loop.create_future() + with suppress(ValueError): + # If the item is not in the list then the item is already parsed or is being parsed + self._move_to_front(doc_item) + self._item_futures[doc_item].user_requested = True return await self._item_futures[doc_item] async def _parse_queue(self) -> None: @@ -161,7 +174,7 @@ class CachedParser: All currently requested items are waited to be parsed before clearing. """ - for future in self._item_futures.values(): + for future in filter(attrgetter("user_requested"), self._item_futures.values()): await future if self._parse_task is not None: self._parse_task.cancel() -- cgit v1.2.3 From a430f1aefdb092bc7ca2fd41bff20aedaa949f5e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 15 Dec 2020 00:35:12 +0100 Subject: Wait for the inventory to be refreshed before attempting any fetching Previously the bot returned an error if a symbol was not found while inventories were refreshing, but we can just wait for the to finish refreshing and then the symbol may be filled in. A logging call to notify of the refresh being done was also added. 
--- bot/exts/info/doc/_cog.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0d344c363..a8642be3e 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -304,6 +304,7 @@ class DocCog(commands.Cog): ) for package in await self.bot.api_client.get('bot/documentation-links') ] await asyncio.gather(*coros) + log.debug("Finished inventory refresh.") REFRESH_EVENT.set() async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: @@ -316,6 +317,10 @@ class DocCog(commands.Cog): if not present also create a redis entry for the symbol. """ log.trace(f"Building embed for symbol `{symbol}`") + if not REFRESH_EVENT.is_set(): + log.debug("Waiting for inventories to be refreshed before processing item.") + await REFRESH_EVENT.wait() + symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: log.debug("Symbol does not exist.") @@ -325,9 +330,6 @@ class DocCog(commands.Cog): markdown = await doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") - if not REFRESH_EVENT.is_set(): - log.debug("Waiting for inventories to be refreshed before processing item.") - await REFRESH_EVENT.wait() markdown = await self.item_fetcher.get_markdown(symbol_info) if markdown is not None: await doc_cache.set(symbol_info, markdown) -- cgit v1.2.3 From 7e5fb88a9976570590a4e946722fd60ada1aad95 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 15 Dec 2020 05:00:23 +0100 Subject: Return the fetched inventory in the Inventory converter Instead of fetching it again in the cog, the converter now returns the inventory for later use. 
The set command now no longer attempts to reschedule the inventory, and a bug that caused the inventory rescheduling to do nothing in `update_single` was fixed after moving it to its own method --- bot/converters.py | 12 +++--- bot/exts/info/doc/_cog.py | 75 ++++++++++++++++++---------------- bot/exts/info/doc/_inventory_parser.py | 9 ++-- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index d558fa3df..6bbc22c3a 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -176,23 +176,23 @@ class ValidURL(Converter): return url -class InventoryURL(Converter): +class Inventory(Converter): """ Represents an Intersphinx inventory URL. This converter checks whether intersphinx accepts the given inventory URL, and raises - `BadArgument` if that is not the case. + `BadArgument` if that is not the case or if the url is unreachable. - Otherwise, it simply passes through the given URL. + Otherwise, it returns the url and the fetched inventory dict in a tuple. 
""" @staticmethod - async def convert(ctx: Context, url: str) -> str: + async def convert(ctx: Context, url: str) -> t.Tuple[str, _inventory_parser.INVENTORY_DICT]: """Convert url to Intersphinx inventory URL.""" await ctx.trigger_typing() - if await _inventory_parser.fetch_inventory(url) is None: + if (inventory := await _inventory_parser.fetch_inventory(url)) is None: raise BadArgument(f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS}.") - return url + return url, inventory class Snowflake(IDConverter): diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index a8642be3e..11d1dc9ad 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -18,12 +18,12 @@ from discord.ext import commands from bot import instance as bot_instance from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import InventoryURL, PackageName, ValidURL +from bot.converters import Inventory, PackageName, ValidURL from bot.pagination import LinePaginator from bot.utils.lock import lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler -from ._inventory_parser import fetch_inventory +from ._inventory_parser import INVENTORY_DICT, fetch_inventory from ._parsing import get_symbol_markdown from ._redis_cache import DocRedisCache @@ -204,7 +204,7 @@ class DocCog(commands.Cog): await self.bot.wait_until_guild_available() await self.refresh_inventory() - async def update_single(self, api_package_name: str, base_url: str, inventory_url: str) -> bool: + async def update_single(self, api_package_name: str, base_url: str, package: INVENTORY_DICT) -> None: """ Rebuild the inventory for a single package. 
@@ -213,31 +213,8 @@ class DocCog(commands.Cog): * `base_url` is the root documentation URL for the specified package, used to build absolute paths that link to specific symbols * `inventory_url` is the absolute URL to the intersphinx inventory. - - If the inventory file is currently unreachable, - the update is rescheduled to execute in FETCH_RESCHEDULE_DELAY.first minutes on the first attempt, - and FETCH_RESCHEDULE_DELAY.repeated minutes on the subsequent attempts. - - Return True on success; False if fetching failed and was rescheduled. """ self.base_urls[api_package_name] = base_url - package = await fetch_inventory(inventory_url) - - if not package: - if inventory_url not in self.scheduled_inventories: - delay = FETCH_RESCHEDULE_DELAY.first * 60 - else: - delay = FETCH_RESCHEDULE_DELAY.repeated * 60 - log.info(f"Failed to fetch inventory; attempting again in {delay//60} minutes.") - self.inventory_scheduler.schedule_later( - delay, - api_package_name, - fetch_inventory(inventory_url) - ) - self.scheduled_inventories.add(api_package_name) - return False - - self.scheduled_inventories.discard(api_package_name) for group, items in package.items(): for symbol, relative_doc_url in items: @@ -279,7 +256,37 @@ class DocCog(commands.Cog): self.item_fetcher.add_item(symbol_item) log.trace(f"Fetched inventory for {api_package_name}.") - return True + + async def update_or_reschedule_inventory( + self, + api_package_name: str, + base_url: str, + inventory_url: str + ) -> Optional[INVENTORY_DICT]: + """ + Update the cog's inventory, or reschedule this method to execute again if the remote inventory unreachable. + + The first attempt is rescheduled to execute in `FETCH_RESCHEDULE_DELAY.first` minutes, the subsequent attempts + in `FETCH_RESCHEDULE_DELAY.repeated` minutes. 
+ """ + package = await fetch_inventory(inventory_url) + + if not package: + if inventory_url not in self.scheduled_inventories: + delay = FETCH_RESCHEDULE_DELAY.first + else: + delay = FETCH_RESCHEDULE_DELAY.repeated + log.info(f"Failed to fetch inventory; attempting again in {delay} minutes.") + self.inventory_scheduler.schedule_later( + delay*60, + api_package_name, + self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url) + ) + self.scheduled_inventories.add(api_package_name) + return + + self.scheduled_inventories.discard(api_package_name) + await self.update_single(api_package_name, base_url, package) async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" @@ -299,7 +306,7 @@ class DocCog(commands.Cog): # Run all coroutines concurrently - since each of them performs an HTTP # request, this speeds up fetching the inventory data heavily. coros = [ - self.update_single( + self.update_or_reschedule_inventory( package["package"], package["base_url"], package["inventory_url"] ) for package in await self.bot.api_client.get('bot/documentation-links') ] @@ -406,7 +413,7 @@ class DocCog(commands.Cog): ctx: commands.Context, package_name: PackageName, base_url: ValidURL, - inventory_url: InventoryURL, + inventory: Inventory, ) -> None: """ Adds a new documentation metadata object to the site's database. @@ -419,6 +426,7 @@ class DocCog(commands.Cog): https://docs.python.org/3/ \ https://docs.python.org/3/objects.inv """ + inventory_url, inventory_dict = inventory body = { 'package': package_name, 'base_url': base_url, @@ -431,13 +439,8 @@ class DocCog(commands.Cog): + "\n".join(f"{key}: {value}" for key, value in body.items()) ) - if await self.update_single(package_name, base_url, inventory_url) is None: - await ctx.send( - f"Added the package `{package_name}` to the database but failed to fetch inventory; " - f"trying again in {FETCH_RESCHEDULE_DELAY.first} minutes." 
- ) - else: - await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") + await self.update_single(package_name, base_url, inventory_dict) + await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index 0d9bd726a..b38c3b2a8 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -11,6 +11,7 @@ import bot log = logging.getLogger(__name__) FAILED_REQUEST_ATTEMPTS = 3 +INVENTORY_DICT = DefaultDict[str, List[Tuple[str, str]]] _V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') @@ -42,7 +43,7 @@ class ZlibStreamReader: pos = buf.find(b'\n') -async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: +async def _load_v1(stream: aiohttp.StreamReader) -> INVENTORY_DICT: invdata = defaultdict(list) async for line in stream: @@ -58,7 +59,7 @@ async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[ return invdata -async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: +async def _load_v2(stream: aiohttp.StreamReader) -> INVENTORY_DICT: invdata = defaultdict(list) async for line in ZlibStreamReader(stream): @@ -71,7 +72,7 @@ async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[ return invdata -async def _fetch_inventory(url: str) -> DefaultDict[str, List[Tuple[str, str]]]: +async def _fetch_inventory(url: str) -> INVENTORY_DICT: """Fetch, parse and return an intersphinx inventory file from an url.""" timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) async with bot.instance.http_session.get(url, timeout=timeout, raise_for_status=True) as response: @@ -93,7 +94,7 @@ async def 
_fetch_inventory(url: str) -> DefaultDict[str, List[Tuple[str, str]]]: raise ValueError(f"Invalid inventory file at url {url}.") -async def fetch_inventory(url: str) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: +async def fetch_inventory(url: str) -> Optional[INVENTORY_DICT]: """Get inventory from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): try: -- cgit v1.2.3 From 7134c10485d2b4215213c1ffb670fa9a06d5de1e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 18 Dec 2020 21:41:30 +0100 Subject: Use update_wrapper instead of wraps We're not using it as a decorator so using wraps only complicates the call syntax --- bot/decorators.py | 6 +++--- bot/utils/lock.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/bot/decorators.py b/bot/decorators.py index 3892e350f..a37996e80 100644 --- a/bot/decorators.py +++ b/bot/decorators.py @@ -2,7 +2,7 @@ import asyncio import logging import typing as t from contextlib import suppress -from functools import wraps +from functools import update_wrapper from discord import Member, NotFound from discord.ext import commands @@ -105,7 +105,7 @@ def redirect_output(destination_channel: int, bypass_roles: t.Container[int] = N await ctx.message.delete() log.trace("Redirect output: Deleted invocation message") - return wraps(func)(function.update_wrapper_globals(inner, func)) + return update_wrapper(function.update_wrapper_globals(inner, func), func) return wrap @@ -149,5 +149,5 @@ def respect_role_hierarchy(member_arg: function.Argument) -> t.Callable: else: log.trace(f"{func.__name__}: {target.top_role=} < {actor.top_role=}; calling func") await func(*args, **kwargs) - return wraps(func)(function.update_wrapper_globals(wrapper, func)) + return update_wrapper(function.update_wrapper_globals(wrapper, func), func) return decorator diff --git a/bot/utils/lock.py b/bot/utils/lock.py index 
cf87321c5..02188c827 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -1,7 +1,7 @@ import inspect import logging from collections import defaultdict -from functools import partial, wraps +from functools import partial, update_wrapper from typing import Any, Awaitable, Callable, Hashable, Union from weakref import WeakValueDictionary @@ -91,8 +91,7 @@ def lock(namespace: Hashable, resource_id: ResourceId, *, raise_error: bool = Fa log.info(f"{name}: aborted because resource {namespace!r}:{id_!r} is locked") if raise_error: raise LockedResourceError(str(namespace), id_) - - return wraps(func)(function.update_wrapper_globals(wrapper, func)) + return update_wrapper(function.update_wrapper_globals(wrapper, func), func) return decorator -- cgit v1.2.3 From 003613ff0f89871c8477e996c708873e1387e514 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 6 Jan 2021 06:56:17 +0100 Subject: Add comments to truncation handling code Co-authored-by: MarkKoz --- bot/exts/info/doc/_parsing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 521034006..f51ab4ea1 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -276,15 +276,21 @@ def _get_truncated_description( if not markdown_element_ends: return "" + # Determine the "hard" truncation index. newline_truncate_index = find_nth_occurrence(result, "\n", max_lines) if newline_truncate_index is not None and newline_truncate_index < _MAX_DESCRIPTION_LENGTH: + # Truncate based on maximum lines if there are more than the maximum number of lines. truncate_index = newline_truncate_index else: + # There are less than the maximum number of lines; truncate based on the max char length. truncate_index = _MAX_DESCRIPTION_LENGTH + # Nothing needs to be truncated if the last element ends before the truncation index. 
if truncate_index >= markdown_element_ends[-1]: return result + # Determine the actual truncation index. + # Truncate at the last Markdown element that comes before the truncation index. markdown_truncate_index = max(cut for cut in markdown_element_ends if cut < truncate_index) return result[:markdown_truncate_index].strip(_TRUNCATE_STRIP_CHARACTERS) + "..." -- cgit v1.2.3 From fef6c50f0c8a9c54e6e0519c0feae5c8c32152c1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 6 Jan 2021 06:57:54 +0100 Subject: Remove redundant variable Co-authored-by: MarkKoz --- bot/exts/info/doc/_parsing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index f51ab4ea1..032fe3404 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -263,9 +263,8 @@ def _get_truncated_description( else: element_markdown = markdown_converter.process_text(element) - element_markdown_length = len(element_markdown) rendered_length += element_length - tag_end_index += element_markdown_length + tag_end_index += len(element_markdown) if not element_markdown.isspace(): markdown_element_ends.append(tag_end_index) -- cgit v1.2.3 From cbd84558ef4e5e89ce032c8b5d47f1bb94b89ba0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 6 Jan 2021 18:27:10 +0100 Subject: Do not attempt to set cache values for symbols that were not found --- bot/exts/info/doc/_cog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 11d1dc9ad..df5d417d7 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -144,7 +144,8 @@ class CachedParser: None, partial(get_symbol_markdown, soup, item), ) - await doc_cache.set(item, markdown) + if markdown is not None: + await doc_cache.set(item, markdown) except Exception: log.exception(f"Unexpected error when 
handling {item}") else: -- cgit v1.2.3 From 3439badedb65f7d37ba9733bc4e8268f2efe316e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 9 Jan 2021 06:37:31 +0100 Subject: Ensure no symbols get overwritten while generating symbol mappings The code handling this was moved to a function to achieve this cleanly. Includes fixes for bugs where incorrect package was added to the symbol name in the second branch and an incorrect symbol being added in the third branch Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 74 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index df5d417d7..ed9432ed2 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -222,27 +222,19 @@ class DocCog(commands.Cog): if "/" in symbol: continue # skip unreachable symbols with slashes + # e.g. get 'class' from 'py:class' group_name = group.split(":")[1] - if (original_symbol := self.doc_symbols.get(symbol)) is not None: - if group_name in FORCE_PREFIX_GROUPS: - symbol = f"{group_name}.{symbol}" - self.renamed_symbols.add(symbol) - - elif (original_symbol_group := original_symbol.group) in FORCE_PREFIX_GROUPS: - overridden_symbol = f"{original_symbol_group}.{symbol}" - if overridden_symbol in self.renamed_symbols: - overridden_symbol = f"{api_package_name}.{overridden_symbol}" - - self.doc_symbols[overridden_symbol] = original_symbol - self.renamed_symbols.add(overridden_symbol) - - elif api_package_name in PRIORITY_PACKAGES: - self.doc_symbols[f"{original_symbol.package}.{symbol}"] = original_symbol - self.renamed_symbols.add(symbol) - + while (original_symbol := self.doc_symbols.get(symbol)) is not None: + replaced_symbol_name = self.ensure_unique_symbol_name( + api_package_name, + group_name, + original_symbol, + symbol, + ) + if replaced_symbol_name is None: + break else: - symbol = f"{api_package_name}.{symbol}" 
- self.renamed_symbols.add(symbol) + symbol = replaced_symbol_name relative_url_path, _, symbol_id = relative_doc_url.partition("#") # Intern fields that have shared content so we're not storing unique strings for every object @@ -289,6 +281,50 @@ class DocCog(commands.Cog): self.scheduled_inventories.discard(api_package_name) await self.update_single(api_package_name, base_url, package) + def ensure_unique_symbol_name( + self, + package_name: str, + group_name: str, + original_item: DocItem, + symbol_name: str + ) -> Optional[str]: + """ + Ensure `symbol_name` doesn't overwrite an another symbol in `doc_symbols`. + + Should only be called with symbol names that already have a conflict in `doc_symbols`. + + If None is returned, space was created for `symbol_name` in `doc_symbols` instead of + the symbol name being changed. + """ + # Certain groups are added as prefixes to disambiguate the symbols. + if group_name in FORCE_PREFIX_GROUPS: + self.renamed_symbols.add(symbol_name) + return f"{group_name}.{symbol_name}" + + # The existing symbol with which the current symbol conflicts should have a group prefix. + # It currently doesn't have the group prefix because it's only added once there's a conflict. + elif (original_symbol_group := original_item.group) in FORCE_PREFIX_GROUPS: + overridden_symbol = f"{original_symbol_group}.{symbol_name}" + if overridden_symbol in self.doc_symbols: + # If there's still a conflict, prefix with package name. + overridden_symbol = f"{original_item.package}.{overridden_symbol}" + + self.doc_symbols[overridden_symbol] = original_item + self.renamed_symbols.add(overridden_symbol) + + elif package_name in PRIORITY_PACKAGES: + overridden_symbol = f"{original_item.package}.{symbol_name}" + if overridden_symbol in self.doc_symbols: + # If there's still a conflict, add the symbol's group in the middle. 
+ overridden_symbol = f"{original_item.package}.{original_item.group}.{symbol_name}" + + self.doc_symbols[overridden_symbol] = original_item + self.renamed_symbols.add(overridden_symbol) + + else: + self.renamed_symbols.add(symbol_name) + return f"{package_name}.{symbol_name}" + async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" REFRESH_EVENT.clear() -- cgit v1.2.3 From fcfb604bc9123254622b763dba46d3f25ed4d93c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 9 Jan 2021 06:38:43 +0100 Subject: Do not ignore symbols with slashes. In some cases these are actual symbols that we can look up --- bot/exts/info/doc/_cog.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ed9432ed2..7aa6d0428 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -219,8 +219,6 @@ class DocCog(commands.Cog): for group, items in package.items(): for symbol, relative_doc_url in items: - if "/" in symbol: - continue # skip unreachable symbols with slashes # e.g. get 'class' from 'py:class' group_name = group.split(":")[1] -- cgit v1.2.3 From 33c861b4e1fb88c52585647a958ac27810399704 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 9 Jan 2021 19:27:21 +0100 Subject: Do not add package name to the front of the symbol if it's already there --- bot/exts/info/doc/_cog.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7aa6d0428..feb08e1cb 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -319,9 +319,18 @@ class DocCog(commands.Cog): self.doc_symbols[overridden_symbol] = original_item self.renamed_symbols.add(overridden_symbol) + # If we can't specially handle the symbol through its group or package, + # fall back to prepending its package name to the front. 
else: - self.renamed_symbols.add(symbol_name) - return f"{package_name}.{symbol_name}" + if symbol_name.startswith(package_name): + # If the symbol already starts with the package name, insert the group name after it. + split_symbol_name = symbol_name.split(".", maxsplit=1) + split_symbol_name.insert(1, group_name) + overridden_symbol = ".".join(split_symbol_name) + else: + overridden_symbol = f"{package_name}.{symbol_name}" + self.renamed_symbols.add(overridden_symbol) + return overridden_symbol async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" -- cgit v1.2.3 From 70609baca94dc7c7ad7598f707ac479efe348e88 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 9 Jan 2021 21:48:51 +0100 Subject: Periodically clear unnecessary futures from the _item_futures dict The code has no way of reaching futures through new requests after their result has been set as that also includes setting its value in redis. --- bot/exts/info/doc/_cog.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index feb08e1cb..364d99182 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,6 +4,7 @@ import asyncio import logging import re import sys +import time from collections import defaultdict from contextlib import suppress from functools import partial @@ -80,11 +81,25 @@ class QueueItem(NamedTuple): class ParseResultFuture(asyncio.Future): - """Future with the user_requested attribute to know which futures need to be waited for before clearing.""" + """ + Future with metadata for the parser class. + + `user_requested` is set by the parser when a Future is requested by an user and moved to the front, + allowing the futures to only be waited for when clearing if they were user requested. 
+ + `result_set_time` provides the time at which the future's result has been set, + or -inf if the result hasn't been set yet + """ def __init__(self): super().__init__() self.user_requested = False + self.result_set_time = float("inf") + + def set_result(self, result: str, /) -> None: + """Set `self.result_set_time` to current time when the result is set.""" + self.result_set_time = time.time() + super().set_result(result) class CachedParser: @@ -102,6 +117,8 @@ class CachedParser: self._item_futures: Dict[DocItem, ParseResultFuture] = {} self._parse_task = None + self.cleanup_futures_task = bot_instance.loop.create_task(self._cleanup_futures()) + async def get_markdown(self, doc_item: DocItem) -> str: """ Get the result Markdown of `doc_item`. @@ -183,6 +200,21 @@ class CachedParser: self._page_symbols.clear() self._item_futures.clear() + async def _cleanup_futures(self) -> None: + """ + Clear old futures from internal results. + + After a future is set, we only need to wait for old requests to its associated DocItem to finish + as all new requests will get the value from the redis cache in the cog first. + Keeping them around for longer than a second is unnecessary and keeps the parsed Markdown strings alive. 
+ """ + while True: + current_time = time.time() + for key, future in self._item_futures.copy().items(): + if current_time - future.result_set_time > 5: + del self._item_futures[key] + await asyncio.sleep(5) + class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" -- cgit v1.2.3 From 5ad2afbc0160a7d9b0ab9c50b73044e7169db7cb Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 9 Jan 2021 21:59:03 +0100 Subject: Stop scheduled and long running tasks on cog unload --- bot/exts/info/doc/_cog.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 364d99182..61ac35b6f 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -567,3 +567,9 @@ class DocCog(commands.Cog): await ctx.send(f"Successfully cleared the cache for `{package_name}`.") else: await ctx.send("No keys matching the package found.") + + def cog_unload(self) -> None: + """Clear scheduled inventories, queued symbols and cleanup task on cog unload.""" + self.inventory_scheduler.cancel_all() + self.item_fetcher.cleanup_futures_task.cancel() + asyncio.create_task(self.item_fetcher.clear()) -- cgit v1.2.3 From 50bb3439824277991124b888d0b46c5936c2efce Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 00:11:16 +0100 Subject: Handle equal DocItems in the queue This could be handled by using sets to hold the items in _page_symbols, but ultimately the check has a much smaller cost than having thousands of sets for the urls. 
Because we create futures for every item that ends up in the queue we can also skip the .get is None check and instead fetch the future directly from the dict --- bot/exts/info/doc/_cog.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 61ac35b6f..cee482c30 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -157,6 +157,11 @@ class CachedParser: while self._queue: item, soup = self._queue.pop() try: + if (future := self._item_futures[item]).done(): + # Some items are present in the inventories multiple times under different symbols, + # if we already parsed an equal item, we can just skip it. + continue + markdown = await bot_instance.loop.run_in_executor( None, partial(get_symbol_markdown, soup, item), @@ -166,8 +171,7 @@ class CachedParser: except Exception: log.exception(f"Unexpected error when handling {item}") else: - if (future := self._item_futures.get(item)) is not None: - future.set_result(markdown) + future.set_result(markdown) await asyncio.sleep(0.1) finally: self._parse_task = None -- cgit v1.2.3 From 298ad2f8e8f31d9f06a9e01a91a4d08f5b5d6347 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 01:48:26 +0100 Subject: Refresh inventories when the redis cache is cleared Because the futures are cleaned up and Markdown only exists in the cache after a short time, items that were requested previously and had the cache cleared would be missing from the CachedParser --- bot/exts/info/doc/_cog.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index cee482c30..a78916d4a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -565,9 +565,11 @@ class DocCog(commands.Cog): @docs_group.command(name="cleardoccache") @commands.has_any_role(*MODERATION_ROLES) + @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def 
clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: """Clear the persistent redis cache for `package`.""" if await doc_cache.delete(package_name): + await self.refresh_inventory() await ctx.send(f"Successfully cleared the cache for `{package_name}`.") else: await ctx.send("No keys matching the package found.") -- cgit v1.2.3 From 383e4e993c1bc9d31562748cc55ab4c468bcdd8d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 03:25:50 +0100 Subject: Set exception on future Without the exception set, to the user the bot would fail silently if an exception was handled here --- bot/exts/info/doc/_cog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index a78916d4a..3f7604072 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -168,8 +168,9 @@ class CachedParser: ) if markdown is not None: await doc_cache.set(item, markdown) - except Exception: + except Exception as e: log.exception(f"Unexpected error when handling {item}") + future.set_exception(e) else: future.set_result(markdown) await asyncio.sleep(0.1) -- cgit v1.2.3 From 5df60dd2ad10aec1c0368ed357562338e89a1250 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 03:32:54 +0100 Subject: Bump markdownify to 0.6.1-0.6.* The 0.6 release brought a new parameter that has to be included in all tag handling methods --- Pipfile | 2 +- bot/exts/info/doc/_markdown.py | 21 +++++++++++++-------- bot/exts/info/doc/_parsing.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Pipfile b/Pipfile index 4ca651c92..a92f64f59 100644 --- a/Pipfile +++ b/Pipfile @@ -18,7 +18,7 @@ deepdiff = "~=4.0" feedparser = "~=5.2" fuzzywuzzy = "~=0.17" lxml = "~=4.4" -markdownify = "~=0.4" +markdownify = "~=0.6.1" more_itertools = "~=8.2" python-dateutil = "~=2.8" pyyaml = "~=5.1" diff --git 
a/bot/exts/info/doc/_markdown.py b/bot/exts/info/doc/_markdown.py index ba35a84c4..1b7d8232b 100644 --- a/bot/exts/info/doc/_markdown.py +++ b/bot/exts/info/doc/_markdown.py @@ -11,7 +11,7 @@ class DocMarkdownConverter(MarkdownConverter): super().__init__(**options) self.page_url = page_url - def convert_li(self, el: PageElement, text: str) -> str: + def convert_li(self, el: PageElement, text: str, convert_as_inline: bool) -> str: """Fix markdownify's erroneous indexing in ol tags.""" parent = el.parent if parent is not None and parent.name == "ol": @@ -27,27 +27,32 @@ class DocMarkdownConverter(MarkdownConverter): bullet = bullets[depth % len(bullets)] return f"{bullet} {text}\n" - def convert_hn(self, _n: int, el: PageElement, text: str) -> str: + def convert_hn(self, _n: int, el: PageElement, text: str, convert_as_inline: bool) -> str: """Convert h tags to bold text with ** instead of adding #.""" + if convert_as_inline: + return text return f"**{text}**\n\n" - def convert_code(self, el: PageElement, text: str) -> str: + def convert_code(self, el: PageElement, text: str, convert_as_inline: bool) -> str: """Undo `markdownify`s underscore escaping.""" return f"`{text}`".replace("\\", "") - def convert_pre(self, el: PageElement, text: str) -> str: + def convert_pre(self, el: PageElement, text: str, convert_as_inline: bool) -> str: """Wrap any codeblocks in `py` for syntax highlighting.""" code = "".join(el.strings) return f"```py\n{code}```" - def convert_a(self, el: PageElement, text: str) -> str: + def convert_a(self, el: PageElement, text: str, convert_as_inline: bool) -> str: """Resolve relative URLs to `self.page_url`.""" el["href"] = urljoin(self.page_url, el["href"]) - return super().convert_a(el, text) + return super().convert_a(el, text, convert_as_inline) - def convert_p(self, el: PageElement, text: str) -> str: + def convert_p(self, el: PageElement, text: str, convert_as_inline: bool) -> str: """Include only one newline instead of two when the parent is 
a li tag.""" + if convert_as_inline: + return text + parent = el.parent if parent is not None and parent.name == "li": return f"{text}\n" - return super().convert_p(el, text) + return super().convert_p(el, text, convert_as_inline) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 032fe3404..46ae33b92 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -259,7 +259,7 @@ def _get_truncated_description( if rendered_length + element_length < max_length: if is_tag: - element_markdown = markdown_converter.process_tag(element) + element_markdown = markdown_converter.process_tag(element, convert_as_inline=False) else: element_markdown = markdown_converter.process_text(element) -- cgit v1.2.3 From 58154398d0ed905e0418451cfa7d3e8b66508bc6 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 03:39:06 +0100 Subject: Expand docstring --- bot/exts/info/doc/_inventory_parser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index b38c3b2a8..886708867 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -95,7 +95,12 @@ async def _fetch_inventory(url: str) -> INVENTORY_DICT: async def fetch_inventory(url: str) -> Optional[INVENTORY_DICT]: - """Get inventory from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" + """ + Get an inventory dict from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors. 
+ + `url` should point at a valid sphinx objects.inv inventory file, which will be parsed into the + inventory dict in the format of {"domain:role": [("symbol_name", "relative_url_to_symbol"), ...], ...} + """ for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): try: inventory = await _fetch_inventory(url) -- cgit v1.2.3 From 695044167756eb2b6b4d953ef17f0359ba688246 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 03:58:43 +0100 Subject: Move functions strictly related to parsing html to the _html module Some constants need to be shared between html and parsing, because they may also be wanted to be edited by the cog user to change the behaviour, they were moved into the package's init. --- bot/exts/info/doc/__init__.py | 5 ++ bot/exts/info/doc/_cog.py | 4 +- bot/exts/info/doc/_html.py | 112 ++++++++++++++++++++++++++++++++++++- bot/exts/info/doc/_parsing.py | 125 ++++-------------------------------------- 4 files changed, 126 insertions(+), 120 deletions(-) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py index e9eb9428c..af0bbff2d 100644 --- a/bot/exts/info/doc/__init__.py +++ b/bot/exts/info/doc/__init__.py @@ -1,6 +1,11 @@ from bot.bot import Bot from ._cog import DocCog +MAX_SIGNATURE_AMOUNT = 3 +PRIORITY_PACKAGES = ( + "python", +) + def setup(bot: Bot) -> None: """Load the Doc cog.""" diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 3f7604072..fd211d9f1 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -24,6 +24,7 @@ from bot.pagination import LinePaginator from bot.utils.lock import lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler +from . 
import PRIORITY_PACKAGES from ._inventory_parser import INVENTORY_DICT, fetch_inventory from ._parsing import get_symbol_markdown from ._redis_cache import DocRedisCache @@ -38,9 +39,6 @@ FORCE_PREFIX_GROUPS = ( "pdbcommand", "term", ) -PRIORITY_PACKAGES = ( - "python", -) WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay # Delay to wait before trying to reach a rescheduled inventory again, in minutes diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index 88fbc8825..f9fe542ce 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -1,10 +1,27 @@ import logging -from typing import List, Union +import re +from functools import partial +from typing import Callable, Container, Iterable, List, Union -from bs4.element import PageElement, SoupStrainer +from bs4 import BeautifulSoup +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + +from . import MAX_SIGNATURE_AMOUNT log = logging.getLogger(__name__) +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_SEARCH_END_TAG_ATTRS = ( + "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + class Strainer(SoupStrainer): """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" @@ -26,3 +43,94 @@ class Strainer(SoupStrainer): return markup else: return super().search(markup) + + +def _find_elements_until_tag( + start_element: PageElement, + end_tag_filter: Union[Container[str], Callable[[Tag], bool]], + *, + func: Callable, + include_strings: bool = False, + limit: int = None, +) -> List[Union[Tag, NavigableString]]: + """ + Get all elements up to `limit` or until a tag matching `tag_filter` is found. + + `end_tag_filter` can be either a container of string names to check against, + or a filtering callable that's applied to tags. 
+ + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + + `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. + The method is then iterated over and all elements until the matching tag or the limit are added to the return list. + """ + use_container_filter = not callable(end_tag_filter) + elements = [] + + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_container_filter: + if element.name in end_tag_filter: + break + elif end_tag_filter(element): + break + elements.append(element) + + return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: + """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" + def match_tag(tag: Tag) -> bool: + for attr in class_names: + if attr in tag.get("class", ()): + return True + return tag.name == "table" + + return match_tag + + +def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: + """ + Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. + + A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, + if it's found it's used as the tag to start the search from instead of the `start_element`. 
+ """ + child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) + header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) + start_tag = header.parent if header is not None else start_element + return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) + + +def get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) + + +def get_signatures(start_signature: PageElement) -> List[str]: + """ + Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. + + First the signatures under the `start_signature` are included; + if less than 2 are found, tags above the start signature are added to the result if any are present. 
+ """ + signatures = [] + for element in ( + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + start_signature, + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + )[-MAX_SIGNATURE_AMOUNT:]: + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature: + signatures.append(signature) + + return signatures diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 46ae33b92..d68f7c8d7 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -5,37 +5,23 @@ import re import string import textwrap from collections import namedtuple -from functools import partial -from typing import Callable, Collection, Container, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union +from typing import Collection, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union from bs4 import BeautifulSoup -from bs4.element import NavigableString, PageElement, Tag +from bs4.element import NavigableString, Tag from bot.utils.helpers import find_nth_occurrence -from ._html import Strainer +from . 
import MAX_SIGNATURE_AMOUNT +from ._html import get_dd_description, get_general_description, get_signatures from ._markdown import DocMarkdownConverter if TYPE_CHECKING: from ._cog import DocItem log = logging.getLogger(__name__) -_MAX_SIGNATURE_AMOUNT = 3 - -_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") _WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") _PARAMETERS_RE = re.compile(r"\((.+)\)") -_SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) - _NO_SIGNATURE_GROUPS = { "attribute", "envvar", @@ -46,7 +32,7 @@ _NO_SIGNATURE_GROUPS = { } _EMBED_CODE_BLOCK_LINE_LENGTH = 61 # _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight -_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT # Maximum discord message length - signatures on top _MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace @@ -118,86 +104,6 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: yield parameters_string[last_split:] -def _find_elements_until_tag( - start_element: PageElement, - end_tag_filter: Union[Container[str], Callable[[Tag], bool]], - *, - func: Callable, - include_strings: bool = False, - limit: int = None, -) -> List[Union[Tag, NavigableString]]: - """ - Get all elements up to `limit` or until a tag matching `tag_filter` is found. - - `end_tag_filter` can be either a container of string names to check against, - or a filtering callable that's applied to tags. - - When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. - - `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. 
- The method is then iterated over and all elements until the matching tag or the limit are added to the return list. - """ - use_container_filter = not callable(end_tag_filter) - elements = [] - - for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): - if isinstance(element, Tag): - if use_container_filter: - if element.name in end_tag_filter: - break - elif end_tag_filter(element): - break - elements.append(element) - - return elements - - -_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) -_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) -_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) -_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) - - -def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: - """ - Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - - A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, - if it's found it's used as the tag to start the search from instead of the `start_element`. 
- """ - child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) - header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) - start_tag = header.parent if header is not None else start_element - return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) - - -def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: - """Get the contents of the next dd tag, up to a dt or a dl tag.""" - description_tag = symbol.find_next("dd") - return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - - -def _get_signatures(start_signature: PageElement) -> List[str]: - """ - Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. - - First the signatures under the `start_signature` are included; - if less than 2 are found, tags above the start signature are added to the result if any are present. - """ - signatures = [] - for element in ( - *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), - start_signature, - *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), - )[-(_MAX_SIGNATURE_AMOUNT):]: - signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - - if signature: - signatures.append(signature) - - return signatures - - def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: """ Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. 
@@ -210,7 +116,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: return signatures - max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) + max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) formatted_signatures = [] for signature in signatures: signature = signature.strip() @@ -317,17 +223,6 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] return formatted_markdown -def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: - """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" - def match_tag(tag: Tag) -> bool: - for attr in class_names: - if attr in tag.get("class", ()): - return True - return tag.name == "table" - - return match_tag - - def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. @@ -342,12 +237,12 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. 
if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt": - description = _get_general_description(symbol_heading) + description = get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: - description = _get_dd_description(symbol_heading) + description = get_dd_description(symbol_heading) else: - signature = _get_signatures(symbol_heading) - description = _get_dd_description(symbol_heading) + signature = get_signatures(symbol_heading) + description = get_dd_description(symbol_heading) return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip() -- cgit v1.2.3 From 22520b9b37e161437a376a6067955e0c9b91cc76 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 04:01:34 +0100 Subject: Defer import to avoid circular imports --- bot/exts/info/doc/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py index af0bbff2d..dff7a0269 100644 --- a/bot/exts/info/doc/__init__.py +++ b/bot/exts/info/doc/__init__.py @@ -1,5 +1,4 @@ from bot.bot import Bot -from ._cog import DocCog MAX_SIGNATURE_AMOUNT = 3 PRIORITY_PACKAGES = ( @@ -9,4 +8,5 @@ PRIORITY_PACKAGES = ( def setup(bot: Bot) -> None: """Load the Doc cog.""" + from ._cog import DocCog bot.add_cog(DocCog(bot)) -- cgit v1.2.3 From 33b408d9e2cc805e2cfc6851225929c50725ea80 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 06:15:27 +0100 Subject: Rename CachedParser to BatchParser and move it to its own module --- bot/exts/info/doc/__init__.py | 3 + bot/exts/info/doc/_batch_parser.py | 173 +++++++++++++++++++++++++++++++++++++ bot/exts/info/doc/_cog.py | 170 +----------------------------------- 3 files changed, 180 insertions(+), 166 deletions(-) create mode 100644 bot/exts/info/doc/_batch_parser.py diff --git a/bot/exts/info/doc/__init__.py 
b/bot/exts/info/doc/__init__.py
index dff7a0269..2bb43a950 100644
--- a/bot/exts/info/doc/__init__.py
+++ b/bot/exts/info/doc/__init__.py
@@ -1,10 +1,13 @@
 from bot.bot import Bot
+from ._redis_cache import DocRedisCache
 
 MAX_SIGNATURE_AMOUNT = 3
 PRIORITY_PACKAGES = (
     "python",
 )
 
+doc_cache = DocRedisCache(namespace="Docs")
+
 
 def setup(bot: Bot) -> None:
     """Load the Doc cog."""
diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py
new file mode 100644
index 000000000..edd6bb090
--- /dev/null
+++ b/bot/exts/info/doc/_batch_parser.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+from collections import defaultdict
+from contextlib import suppress
+from functools import partial
+from operator import attrgetter
+from typing import Dict, List, NamedTuple, TYPE_CHECKING, Union
+
+from bs4 import BeautifulSoup
+
+import bot
+from . import doc_cache
+from ._parsing import get_symbol_markdown
+if TYPE_CHECKING:
+    from ._cog import DocItem
+
+log = logging.getLogger(__name__)
+
+
+class QueueItem(NamedTuple):
+    """Contains a symbol and the BeautifulSoup object needed to parse it."""
+
+    symbol: DocItem
+    soup: BeautifulSoup
+
+    def __eq__(self, other: Union[QueueItem, DocItem]):
+        if isinstance(other, type(self.symbol)):
+            return self.symbol == other
+        return NamedTuple.__eq__(self, other)
+
+
+class ParseResultFuture(asyncio.Future):
+    """
+    Future with metadata for the parser class.
+
+    `user_requested` is set by the parser when a Future is requested by a user and moved to the front,
+    allowing the futures to only be waited for when clearing if they were user requested.
+
+    `result_set_time` provides the time at which the future's result has been set,
+    or inf if the result hasn't been set yet
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.user_requested = False
+        self.result_set_time = float("inf")
+
+    def set_result(self, result: str, /) -> None:
+        """Set `self.result_set_time` to current time when the result is set."""
+        self.result_set_time = time.time()
+        super().set_result(result)
+
+
+class BatchParser:
+    """
+    Get the Markdown of all symbols on a page and send them to redis when a symbol is requested.
+
+    DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict.
+    `get_markdown` is used to fetch the Markdown; when this is used for the first time on a page,
+    all of the symbols are queued to be parsed to avoid multiple web requests to the same page.
+    """
+
+    def __init__(self):
+        self._queue: List[QueueItem] = []
+        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+        self._item_futures: Dict[DocItem, ParseResultFuture] = {}
+        self._parse_task = None
+
+        self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures())
+
+    async def get_markdown(self, doc_item: DocItem) -> str:
+        """
+        Get the result Markdown of `doc_item`.
+
+        If no symbols were fetched from `doc_item`s page before,
+        the HTML has to be fetched and then all items from the page are put into the parse queue.
+
+        Not safe to run while `self.clear` is running.
+        """
+        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+            async with bot.instance.http_session.get(doc_item.url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            self._item_futures.update((symbol, ParseResultFuture()) for symbol in symbols_to_queue)
+            del self._page_symbols[doc_item.url]
+            log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+        if self._parse_task is None:
+            self._parse_task = asyncio.create_task(self._parse_queue())
+
+        with suppress(ValueError):
+            # If the item is not in the list then the item is already parsed or is being parsed
+            self._move_to_front(doc_item)
+        self._item_futures[doc_item].user_requested = True
+        return await self._item_futures[doc_item]
+
+    async def _parse_queue(self) -> None:
+        """
+        Parse all items from the queue, setting their result markdown on the futures and sending them to redis.
+
+        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+        """
+        log.trace("Starting queue parsing.")
+        try:
+            while self._queue:
+                item, soup = self._queue.pop()
+                try:
+                    if (future := self._item_futures[item]).done():
+                        # Some items are present in the inventories multiple times under different symbols,
+                        # if we already parsed an equal item, we can just skip it.
+ continue + + markdown = await bot.instance.loop.run_in_executor( + None, + partial(get_symbol_markdown, soup, item), + ) + if markdown is not None: + await doc_cache.set(item, markdown) + except Exception as e: + log.exception(f"Unexpected error when handling {item}") + future.set_exception(e) + else: + future.set_result(markdown) + await asyncio.sleep(0.1) + finally: + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + """Move `item` to the front of the parse queue.""" + # The parse queue stores soups along with the doc symbols in QueueItem objects, + # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. + item_index = self._queue.index(item) + queue_item = self._queue.pop(item_index) + + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem) -> None: + """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" + self._page_symbols[doc_item.url].append(doc_item) + + async def clear(self) -> None: + """ + Clear all internal symbol data. + + All currently requested items are waited to be parsed before clearing. + """ + for future in filter(attrgetter("user_requested"), self._item_futures.values()): + await future + if self._parse_task is not None: + self._parse_task.cancel() + self._queue.clear() + self._page_symbols.clear() + self._item_futures.clear() + + async def _cleanup_futures(self) -> None: + """ + Clear old futures from internal results. + + After a future is set, we only need to wait for old requests to its associated `DocItem` to finish + as all new requests will get the value from the redis cache in the cog first. + Keeping them around for longer than a second is unnecessary and keeps the parsed Markdown strings alive. 
+ """ + while True: + current_time = time.time() + for key, future in self._item_futures.copy().items(): + if current_time - future.result_set_time > 5: + del self._item_futures[key] + await asyncio.sleep(5) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index fd211d9f1..7a943f1a4 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,19 +4,13 @@ import asyncio import logging import re import sys -import time -from collections import defaultdict from contextlib import suppress -from functools import partial -from operator import attrgetter from types import SimpleNamespace -from typing import Dict, List, NamedTuple, Optional, Union +from typing import Dict, NamedTuple, Optional import discord -from bs4 import BeautifulSoup from discord.ext import commands -from bot import instance as bot_instance from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import Inventory, PackageName, ValidURL @@ -24,10 +18,9 @@ from bot.pagination import LinePaginator from bot.utils.lock import lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler -from . import PRIORITY_PACKAGES +from . 
import PRIORITY_PACKAGES, doc_cache +from ._batch_parser import BatchParser from ._inventory_parser import INVENTORY_DICT, fetch_inventory -from ._parsing import get_symbol_markdown -from ._redis_cache import DocRedisCache log = logging.getLogger(__name__) @@ -48,8 +41,6 @@ REFRESH_EVENT = asyncio.Event() REFRESH_EVENT.set() COMMAND_LOCK_SINGLETON = "inventory refresh" -doc_cache = DocRedisCache(namespace="Docs") - class DocItem(NamedTuple): """Holds inventory symbol information.""" @@ -66,159 +57,6 @@ class DocItem(NamedTuple): return self.base_url + self.relative_url_path -class QueueItem(NamedTuple): - """Contains a symbol and the BeautifulSoup object needed to parse it.""" - - symbol: DocItem - soup: BeautifulSoup - - def __eq__(self, other: Union[QueueItem, DocItem]): - if isinstance(other, DocItem): - return self.symbol == other - return NamedTuple.__eq__(self, other) - - -class ParseResultFuture(asyncio.Future): - """ - Future with metadata for the parser class. - - `user_requested` is set by the parser when a Future is requested by an user and moved to the front, - allowing the futures to only be waited for when clearing if they were user requested. - - `result_set_time` provides the time at which the future's result has been set, - or -inf if the result hasn't been set yet - """ - - def __init__(self): - super().__init__() - self.user_requested = False - self.result_set_time = float("inf") - - def set_result(self, result: str, /) -> None: - """Set `self.result_set_time` to current time when the result is set.""" - self.result_set_time = time.time() - super().set_result(result) - - -class CachedParser: - """ - Get the symbol Markdown from pages with smarter caching. - - DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. - `get_markdown` is used to fetch the Markdown; when this is used for the first time on a page, - all of the symbols are queued to be parsed to avoid multiple web requests to the same page. 
- """ - - def __init__(self): - self._queue: List[QueueItem] = [] - self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_futures: Dict[DocItem, ParseResultFuture] = {} - self._parse_task = None - - self.cleanup_futures_task = bot_instance.loop.create_task(self._cleanup_futures()) - - async def get_markdown(self, doc_item: DocItem) -> str: - """ - Get the result Markdown of `doc_item`. - - If no symbols were fetched from `doc_item`s page before, - the HTML has to be fetched before parsing can be queued. - - Not safe to run while `self.clear` is running. - """ - if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: - async with bot_instance.http_session.get(doc_item.url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") - - self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) - self._item_futures.update((symbol, ParseResultFuture()) for symbol in symbols_to_queue) - del self._page_symbols[doc_item.url] - log.debug(f"Added symbols from {doc_item.url} to parse queue.") - - if self._parse_task is None: - self._parse_task = asyncio.create_task(self._parse_queue()) - - with suppress(ValueError): - # If the item is not in the list then the item is already parsed or is being parsed - self._move_to_front(doc_item) - self._item_futures[doc_item].user_requested = True - return await self._item_futures[doc_item] - - async def _parse_queue(self) -> None: - """ - Parse all item from the queue, setting associated events for symbols if present. - - The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. - """ - log.trace("Starting queue parsing.") - try: - while self._queue: - item, soup = self._queue.pop() - try: - if (future := self._item_futures[item]).done(): - # Some items are present in the inventories multiple times under different symbols, - # if we already parsed an equal item, we can just skip it. 
- continue - - markdown = await bot_instance.loop.run_in_executor( - None, - partial(get_symbol_markdown, soup, item), - ) - if markdown is not None: - await doc_cache.set(item, markdown) - except Exception as e: - log.exception(f"Unexpected error when handling {item}") - future.set_exception(e) - else: - future.set_result(markdown) - await asyncio.sleep(0.1) - finally: - self._parse_task = None - log.trace("Finished parsing queue.") - - def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: - """Move `item` to the front of the parse queue.""" - # The parse queue stores soups along with the doc symbols in QueueItem objects, - # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. - item_index = self._queue.index(item) - queue_item = self._queue.pop(item_index) - - self._queue.append(queue_item) - - def add_item(self, doc_item: DocItem) -> None: - """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" - self._page_symbols[doc_item.url].append(doc_item) - - async def clear(self) -> None: - """ - Clear all internal symbol data. - - All currently requested items are waited to be parsed before clearing. - """ - for future in filter(attrgetter("user_requested"), self._item_futures.values()): - await future - if self._parse_task is not None: - self._parse_task.cancel() - self._queue.clear() - self._page_symbols.clear() - self._item_futures.clear() - - async def _cleanup_futures(self) -> None: - """ - Clear old futures from internal results. - - After a future is set, we only need to wait for old requests to its associated DocItem to finish - as all new requests will get the value from the redis cache in the cog first. - Keeping them around for longer than a second is unnecessary and keeps the parsed Markdown strings alive. 
- """ - while True: - current_time = time.time() - for key, future in self._item_futures.copy().items(): - if current_time - future.result_set_time > 5: - del self._item_futures[key] - await asyncio.sleep(5) - - class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" @@ -226,7 +64,7 @@ class DocCog(commands.Cog): self.base_urls = {} self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} - self.item_fetcher = CachedParser() + self.item_fetcher = BatchParser() self.renamed_symbols = set() self.inventory_scheduler = Scheduler(self.__class__.__name__) -- cgit v1.2.3 From 9a4ad5f73cd2c42087643cb36b9e6076c24695fb Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 22:00:59 +0100 Subject: Change the func name to wrapped for clarity --- bot/utils/function.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bot/utils/function.py b/bot/utils/function.py index 8b8c7ba5c..037516ac4 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -76,23 +76,23 @@ def get_bound_args(func: t.Callable, args: t.Tuple, kwargs: t.Dict[str, t.Any]) return bound_args.arguments -def update_wrapper_globals(wrapper: types.FunctionType, func: types.FunctionType) -> types.FunctionType: +def update_wrapper_globals(wrapper: types.FunctionType, wrapped: types.FunctionType) -> types.FunctionType: """ - Update globals of `wrapper` with the globals from `func`. + Update globals of `wrapper` with the globals from `wrapped`. For forwardrefs in command annotations discordpy uses the __global__ attribute of the function to resolve their values, with decorators that replace the function this breaks because they have their own globals. This function creates a new function functionally identical to `wrapper`, which has the globals replaced with - a merge of `func`s globals and the `wrapper`s globals. + a merge of `wrapped`s globals and the `wrapper`s globals. 
- In case a global name from `func` conflicts with a name from `wrapper`'s globals, `wrapper` will win + In case a global name from `wrapped` conflicts with a name from `wrapper`'s globals, `wrapper` will win to keep it functional, but this may cause problems if the name is used as an annotation and - discord.py uses it as a converter on a parameter from `func`. + discord.py uses it as a converter on a parameter from `wrapped`. """ new_globals = wrapper.__globals__.copy() - new_globals.update((k, v) for k, v in func.__globals__.items() if k not in wrapper.__code__.co_names) + new_globals.update((k, v) for k, v in wrapped.__globals__.items() if k not in wrapper.__code__.co_names) return types.FunctionType( code=wrapper.__code__, globals=new_globals, -- cgit v1.2.3 From 4788a9364ac84cf0ee210c8b026ea7f2d5dd31ee Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 22:07:58 +0100 Subject: Create decorator for update_wrapper_globals mimicking functools.wraps --- bot/decorators.py | 14 ++++++++------ bot/utils/function.py | 15 +++++++++++++++ bot/utils/lock.py | 10 +++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/bot/decorators.py b/bot/decorators.py index a37996e80..02735d0dc 100644 --- a/bot/decorators.py +++ b/bot/decorators.py @@ -1,8 +1,8 @@ import asyncio import logging +import types import typing as t from contextlib import suppress -from functools import update_wrapper from discord import Member, NotFound from discord.ext import commands @@ -11,6 +11,7 @@ from discord.ext.commands import Cog, Context from bot.constants import Channels, RedirectOutput from bot.utils import function from bot.utils.checks import in_whitelist_check +from bot.utils.function import command_wraps log = logging.getLogger(__name__) @@ -70,7 +71,8 @@ def redirect_output(destination_channel: int, bypass_roles: t.Container[int] = N This decorator must go before (below) the `command` decorator. 
""" - def wrap(func: t.Callable) -> t.Callable: + def wrap(func: types.FunctionType) -> types.FunctionType: + @command_wraps(func) async def inner(self: Cog, ctx: Context, *args, **kwargs) -> None: if ctx.channel.id == destination_channel: log.trace(f"Command {ctx.command.name} was invoked in destination_channel, not redirecting") @@ -104,8 +106,7 @@ def redirect_output(destination_channel: int, bypass_roles: t.Container[int] = N with suppress(NotFound): await ctx.message.delete() log.trace("Redirect output: Deleted invocation message") - - return update_wrapper(function.update_wrapper_globals(inner, func), func) + return inner return wrap @@ -121,7 +122,8 @@ def respect_role_hierarchy(member_arg: function.Argument) -> t.Callable: This decorator must go before (below) the `command` decorator. """ - def decorator(func: t.Callable) -> t.Callable: + def decorator(func: types.FunctionType) -> types.FunctionType: + @command_wraps(func) async def wrapper(*args, **kwargs) -> None: log.trace(f"{func.__name__}: respect role hierarchy decorator called") @@ -149,5 +151,5 @@ def respect_role_hierarchy(member_arg: function.Argument) -> t.Callable: else: log.trace(f"{func.__name__}: {target.top_role=} < {actor.top_role=}; calling func") await func(*args, **kwargs) - return update_wrapper(function.update_wrapper_globals(wrapper, func), func) + return wrapper return decorator diff --git a/bot/utils/function.py b/bot/utils/function.py index 037516ac4..5fd70e1e8 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -1,5 +1,6 @@ """Utilities for interaction with functions.""" +import functools import inspect import types import typing as t @@ -100,3 +101,17 @@ def update_wrapper_globals(wrapper: types.FunctionType, wrapped: types.FunctionT argdefs=wrapper.__defaults__, closure=wrapper.__closure__, ) + + +def command_wraps( + wrapped: types.FunctionType, + assigned: t.Sequence[str] = functools.WRAPPER_ASSIGNMENTS, + updated: t.Sequence[str] = functools.WRAPPER_UPDATES, +) 
-> t.Callable[[types.FunctionType], types.FunctionType]: + """Update `wrapped` to look like the decorated function and update globals for discordpy forwardref evaluation.""" + def decorator(wrapper: types.FunctionType) -> types.FunctionType: + return functools.update_wrapper( + update_wrapper_globals(wrapper, wrapped), wrapped, assigned, updated + ) + + return decorator diff --git a/bot/utils/lock.py b/bot/utils/lock.py index 02188c827..978e3ae94 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -1,12 +1,14 @@ import inspect import logging +import types from collections import defaultdict -from functools import partial, update_wrapper +from functools import partial from typing import Any, Awaitable, Callable, Hashable, Union from weakref import WeakValueDictionary from bot.errors import LockedResourceError from bot.utils import function +from bot.utils.function import command_wraps log = logging.getLogger(__name__) __lock_dicts = defaultdict(WeakValueDictionary) @@ -58,9 +60,10 @@ def lock(namespace: Hashable, resource_id: ResourceId, *, raise_error: bool = Fa If decorating a command, this decorator must go before (below) the `command` decorator. 
""" - def decorator(func: Callable) -> Callable: + def decorator(func: types.FunctionType) -> types.FunctionType: name = func.__name__ + @command_wraps(func) async def wrapper(*args, **kwargs) -> Any: log.trace(f"{name}: mutually exclusive decorator called") @@ -91,7 +94,8 @@ def lock(namespace: Hashable, resource_id: ResourceId, *, raise_error: bool = Fa log.info(f"{name}: aborted because resource {namespace!r}:{id_!r} is locked") if raise_error: raise LockedResourceError(str(namespace), id_) - return update_wrapper(function.update_wrapper_globals(wrapper, func), func) + return wrapper + return decorator -- cgit v1.2.3 From d50ae50681f552c9a0d3e2c797b0916a09da54da Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 10 Jan 2021 22:10:12 +0100 Subject: Resolve wrapped command callbacks in the source command Without this the command will fetch the source of the wrapper --- bot/exts/info/source.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/source.py b/bot/exts/info/source.py index 7b41352d4..ae68ef7e8 100644 --- a/bot/exts/info/source.py +++ b/bot/exts/info/source.py @@ -68,7 +68,10 @@ class BotSource(commands.Cog): Raise BadArgument if `source_item` is a dynamically-created object (e.g. via internal eval). 
""" if isinstance(source_item, commands.Command): - src = source_item.callback.__code__ + source_item = source_item.callback + while hasattr(source_item, "__wrapped__"): + source_item = source_item.__wrapped__ + src = source_item.__code__ filename = src.co_filename elif isinstance(source_item, str): tags_cog = self.bot.get_cog("Tags") -- cgit v1.2.3 From 760ca7e9a0996865ee4d9e127baef8f0246a9e25 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 00:54:37 +0100 Subject: Send a message to devlog instead of logging a warning --- bot/exts/info/doc/_batch_parser.py | 30 ++++++++++++++++++++++++++++++ bot/exts/info/doc/_parsing.py | 1 - 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index edd6bb090..ebae6efb8 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -9,9 +9,11 @@ from functools import partial from operator import attrgetter from typing import Dict, List, NamedTuple, TYPE_CHECKING, Union +import discord from bs4 import BeautifulSoup import bot +from bot.constants import Channels from . 
import doc_cache
 from ._parsing import get_symbol_markdown
 if TYPE_CHECKING:
@@ -20,6 +22,30 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)
 
 
+class StaleInventoryNotifier:
+    """Handle sending notifications about stale inventories through `DocItem`s to dev log."""
+
+    def __init__(self):
+        self._init_task = bot.instance.loop.create_task(self._init_channel())
+        self._warned_urls = set()
+
+    async def _init_channel(self) -> None:
+        """Wait for guild and get channel."""
+        await bot.instance.wait_until_guild_available()
+        self._dev_log = bot.instance.get_channel(Channels.dev_log)
+
+    async def send_warning(self, item: DocItem) -> None:
+        """Send a warning to dev log if one wasn't already sent for `item`'s url."""
+        if item.url not in self._warned_urls:
+            self._warned_urls.add(item.url)
+            await self._init_task
+            embed = discord.Embed(
+                description=f"Doc item `{item.symbol_id=}` present in loaded documentation inventories "
+                            f"not found on [site]({item.url}), inventories may need to be refreshed."
+            )
+            await self._dev_log.send(embed=embed)
+
+
 class QueueItem(NamedTuple):
     """Contains a symbol and the BeautifulSoup object needed to parse it."""
 
@@ -71,6 +97,8 @@ class BatchParser:
 
         self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures())
 
+        self.stale_inventory_notifier = StaleInventoryNotifier()
+
     async def get_markdown(self, doc_item: DocItem) -> str:
         """
         Get the result Markdown of `doc_item`.
@@ -120,6 +148,8 @@ class BatchParser: ) if markdown is not None: await doc_cache.set(item, markdown) + else: + asyncio.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception as e: log.exception(f"Unexpected error when handling {item}") future.set_exception(e) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index d68f7c8d7..257161dd5 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -231,7 +231,6 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s """ symbol_heading = soup.find(id=symbol_data.symbol_id) if symbol_heading is None: - log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") return None signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, -- cgit v1.2.3 From c2447e0f2a3f28f79ec73d82b3ba4923b377f3e9 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 01:07:07 +0100 Subject: Update outdated docstring --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 257161dd5..f07b530c1 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -225,7 +225,7 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ - Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. + Return parsed markdown of the passed symbol using the passed in soup, truncated to fit within a discord message. The method of parsing and what information gets included depends on the symbol's group. 
""" -- cgit v1.2.3 From 3b735398ca88b022e2fd815d715f3965c87f32ce Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:01:00 +0100 Subject: Handle renaming conflicting symbols in ensure_unique_symbol_name Previously update_single looped this function until there were no duplicates and when creating new symbols the function had to check if the symbol to create a new name from started with a group/package to avoid redundancy. The new approach ensures a new symbol is always unique when returning by handling the containment check inside and outputting a symbol name in the format of package.group.symbol which should always be unique --- bot/exts/info/doc/_cog.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7a943f1a4..5b38af95b 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -95,16 +95,14 @@ class DocCog(commands.Cog): # e.g. get 'class' from 'py:class' group_name = group.split(":")[1] - while (original_symbol := self.doc_symbols.get(symbol)) is not None: + if (original_symbol := self.doc_symbols.get(symbol)) is not None: replaced_symbol_name = self.ensure_unique_symbol_name( api_package_name, group_name, original_symbol, symbol, ) - if replaced_symbol_name is None: - break - else: + if replaced_symbol_name is not None: symbol = replaced_symbol_name relative_url_path, _, symbol_id = relative_doc_url.partition("#") @@ -169,8 +167,12 @@ class DocCog(commands.Cog): """ # Certain groups are added as prefixes to disambiguate the symbols. if group_name in FORCE_PREFIX_GROUPS: - self.renamed_symbols.add(symbol_name) - return f"{group_name}.{symbol_name}" + new_symbol = f"{group_name}.{symbol_name}" + if new_symbol in self.doc_symbols: + # If there's still a conflict, prefix with package name. 
+ new_symbol = f"{package_name}.{new_symbol}" + self.renamed_symbols.add(new_symbol) + return new_symbol # The existing symbol with which the current symbol conflicts should have a group prefix. # It currently doesn't have the group prefix because it's only added once there's a conflict. @@ -195,15 +197,12 @@ class DocCog(commands.Cog): # If we can't specially handle the symbol through its group or package, # fall back to prepending its package name to the front. else: - if symbol_name.startswith(package_name): - # If the symbol already starts with the package name, insert the group name after it. - split_symbol_name = symbol_name.split(".", maxsplit=1) - split_symbol_name.insert(1, group_name) - overridden_symbol = ".".join(split_symbol_name) - else: - overridden_symbol = f"{package_name}.{symbol_name}" - self.renamed_symbols.add(overridden_symbol) - return overridden_symbol + new_symbol = f"{package_name}.{symbol_name}" + if new_symbol in self.doc_symbols: + # If there's still a conflict, add the symbol's group in the middle. 
+ new_symbol = f"{package_name}.{group_name}.{symbol_name}" + self.renamed_symbols.add(new_symbol) + return new_symbol async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" -- cgit v1.2.3 From c92a9985a5a43dc26e7590d7581d47fbbc5e27a8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:02:56 +0100 Subject: Use a dictionary of lists instead of set for renamed symbols A dictionary allows us to grab the original symbol name and then get all the renamed symbols from it, with the improvements to `ensure_unique_symbol_name` we can also use lists instead of sets as each symbol we add should be unique --- bot/exts/info/doc/_cog.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 5b38af95b..deef37f8f 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,6 +4,7 @@ import asyncio import logging import re import sys +from collections import defaultdict from contextlib import suppress from types import SimpleNamespace from typing import Dict, NamedTuple, Optional @@ -65,7 +66,7 @@ class DocCog(commands.Cog): self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} self.item_fetcher = BatchParser() - self.renamed_symbols = set() + self.renamed_symbols = defaultdict(list) self.inventory_scheduler = Scheduler(self.__class__.__name__) self.scheduled_inventories = set() @@ -171,7 +172,7 @@ class DocCog(commands.Cog): if new_symbol in self.doc_symbols: # If there's still a conflict, prefix with package name. new_symbol = f"{package_name}.{new_symbol}" - self.renamed_symbols.add(new_symbol) + self.renamed_symbols[symbol_name].append(new_symbol) return new_symbol # The existing symbol with which the current symbol conflicts should have a group prefix. 
@@ -183,7 +184,7 @@ class DocCog(commands.Cog): overridden_symbol = f"{original_item.package}.{overridden_symbol}" self.doc_symbols[overridden_symbol] = original_item - self.renamed_symbols.add(overridden_symbol) + self.renamed_symbols[symbol_name].append(overridden_symbol) elif package_name in PRIORITY_PACKAGES: overridden_symbol = f"{original_item.package}.{symbol_name}" @@ -192,7 +193,7 @@ class DocCog(commands.Cog): overridden_symbol = f"{original_item.package}.{original_item.group}.{symbol_name}" self.doc_symbols[overridden_symbol] = original_item - self.renamed_symbols.add(overridden_symbol) + self.renamed_symbols[symbol_name].append(overridden_symbol) # If we can't specially handle the symbol through its group or package, # fall back to prepending its package name to the front. @@ -201,7 +202,7 @@ class DocCog(commands.Cog): if new_symbol in self.doc_symbols: # If there's still a conflict, add the symbol's group in the middle. new_symbol = f"{package_name}.{group_name}.{symbol_name}" - self.renamed_symbols.add(new_symbol) + self.renamed_symbols[symbol_name].append(new_symbol) return new_symbol async def refresh_inventory(self) -> None: @@ -265,9 +266,7 @@ class DocCog(commands.Cog): description=markdown ) # Show all symbols with the same name that were renamed in the footer. 
- embed.set_footer( - text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) - ) + embed.set_footer(text=", ".join(self.renamed_symbols[symbol])) return embed @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) -- cgit v1.2.3 From 8d927ff13e0fd93e80102b43c2568f1e74a29a7c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:10:35 +0100 Subject: Ensure footer fits into message The footer also now says Moved: at the start to clarify the meaning of the symbols to the user --- bot/exts/info/doc/_cog.py | 7 ++++++- bot/exts/info/doc/_parsing.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index deef37f8f..b8c1a10d4 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -4,6 +4,7 @@ import asyncio import logging import re import sys +import textwrap from collections import defaultdict from contextlib import suppress from types import SimpleNamespace @@ -266,7 +267,11 @@ class DocCog(commands.Cog): description=markdown ) # Show all symbols with the same name that were renamed in the footer. 
- embed.set_footer(text=", ".join(self.renamed_symbols[symbol])) + if renamed_symbols := self.renamed_symbols[symbol]: + footer_text = f"Moved: {textwrap.shorten(', '.join(renamed_symbols), 100, placeholder=' ...')}" + else: + footer_text = "" + embed.set_footer(text=footer_text) return embed @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index f07b530c1..45a81a4cb 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -33,8 +33,8 @@ _NO_SIGNATURE_GROUPS = { _EMBED_CODE_BLOCK_LINE_LENGTH = 61 # _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight _MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT -# Maximum discord message length - signatures on top -_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH +# Maximum discord message length - signatures on top - space for footer +_MAX_DESCRIPTION_LENGTH = 1900 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." 
+ string.whitespace BracketPair = namedtuple("BracketPair", ["opening_bracket", "closing_bracket"]) -- cgit v1.2.3 From 5c97efab1bf3d15911a343687b50af92b57bc036 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:13:58 +0100 Subject: Don't convert package names into lowercase The converter used to set them already ensures this for us, making the call redundant --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index b8c1a10d4..0e7eff9d9 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -250,7 +250,7 @@ class DocCog(commands.Cog): if symbol_info is None: log.debug("Symbol does not exist.") return None - self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + self.bot.stats.incr(f"doc_fetches.{symbol_info.package}") markdown = await doc_cache.get(symbol_info) if markdown is None: -- cgit v1.2.3 From a7ba149904ac0643cc7e267d219fe86c159816e0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:37:53 +0100 Subject: Notify the user that inventories were refreshed on cache clears --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0e7eff9d9..822c984d7 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -410,7 +410,7 @@ class DocCog(commands.Cog): """Clear the persistent redis cache for `package`.""" if await doc_cache.delete(package_name): await self.refresh_inventory() - await ctx.send(f"Successfully cleared the cache for `{package_name}`.") + await ctx.send(f"Successfully cleared the cache for `{package_name}` and refreshed the inventories.") else: await ctx.send("No keys matching the package found.") -- cgit v1.2.3 From f5235b16343816b02ceef56d1e753cb0167c6b03 Mon Sep 17 00:00:00 2001 From: 
Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:42:19 +0100 Subject: Check for containment instead of always getting the value from the dict Getting the value from a defaultdict will always create the key for it, creating unnecessary entries every time a symbol is fetched from the bot --- bot/exts/info/doc/_cog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 822c984d7..b35469787 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -267,8 +267,9 @@ class DocCog(commands.Cog): description=markdown ) # Show all symbols with the same name that were renamed in the footer. - if renamed_symbols := self.renamed_symbols[symbol]: - footer_text = f"Moved: {textwrap.shorten(', '.join(renamed_symbols), 100, placeholder=' ...')}" + if symbol in self.renamed_symbols: + renamed_symbols = ', '.join(self.renamed_symbols[symbol]) + footer_text = f"Moved: {textwrap.shorten(renamed_symbols, 100, placeholder=' ...')}" else: footer_text = "" embed.set_footer(text=footer_text) -- cgit v1.2.3 From 780dbc7683c7ce9cece6f0707840f56005466dfe Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 02:52:34 +0100 Subject: Remove old reference to CachedParser and unused const --- bot/exts/info/doc/_cog.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index b35469787..bc230b74b 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import logging -import re import sys import textwrap from collections import defaultdict @@ -34,7 +33,6 @@ FORCE_PREFIX_GROUPS = ( "pdbcommand", "term", ) -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay # Delay to wait before trying to reach a rescheduled 
inventory again, in minutes FETCH_RESCHEDULE_DELAY = SimpleNamespace(first=2, repeated=5) @@ -238,8 +236,7 @@ class DocCog(commands.Cog): If the symbol is known, an Embed with documentation about it is returned. - First check the DocRedisCache before querying the cog's `CachedParser`, - if not present also create a redis entry for the symbol. + First check the DocRedisCache before querying the cog's `BatchParser`. """ log.trace(f"Building embed for symbol `{symbol}`") if not REFRESH_EVENT.is_set(): -- cgit v1.2.3 From a2c1e67ac764b363d48d685ace707a650279e009 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 04:32:10 +0100 Subject: Make REFRESH_EVENT an instance variable --- bot/exts/info/doc/_cog.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index bc230b74b..7bb819987 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -37,8 +37,6 @@ NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay # Delay to wait before trying to reach a rescheduled inventory again, in minutes FETCH_RESCHEDULE_DELAY = SimpleNamespace(first=2, repeated=5) -REFRESH_EVENT = asyncio.Event() -REFRESH_EVENT.set() COMMAND_LOCK_SINGLETON = "inventory refresh" @@ -70,6 +68,8 @@ class DocCog(commands.Cog): self.inventory_scheduler = Scheduler(self.__class__.__name__) self.scheduled_inventories = set() + self.refresh_event = asyncio.Event() + self.refresh_event.set() self.bot.loop.create_task(self.init_refresh_inventory()) @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) @@ -206,7 +206,7 @@ class DocCog(commands.Cog): async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" - REFRESH_EVENT.clear() + self.refresh_event.clear() log.debug("Refreshing documentation inventory...") self.inventory_scheduler.cancel_all() @@ -228,7 +228,7 @@ class DocCog(commands.Cog): ] await asyncio.gather(*coros) 
log.debug("Finished inventory refresh.") - REFRESH_EVENT.set() + self.refresh_event.set() async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: """ @@ -239,9 +239,9 @@ class DocCog(commands.Cog): First check the DocRedisCache before querying the cog's `BatchParser`. """ log.trace(f"Building embed for symbol `{symbol}`") - if not REFRESH_EVENT.is_set(): + if not self.refresh_event.is_set(): log.debug("Waiting for inventories to be refreshed before processing item.") - await REFRESH_EVENT.wait() + await self.refresh_event.wait() symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: -- cgit v1.2.3 From 551c01e2537b036c17253d5cbfc4cfee6150cc4a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 04:59:25 +0100 Subject: Return whitespace to its previous state --- bot/utils/lock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/utils/lock.py b/bot/utils/lock.py index 978e3ae94..997c653a1 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -94,8 +94,8 @@ def lock(namespace: Hashable, resource_id: ResourceId, *, raise_error: bool = Fa log.info(f"{name}: aborted because resource {namespace!r}:{id_!r} is locked") if raise_error: raise LockedResourceError(str(namespace), id_) - return wrapper + return wrapper return decorator -- cgit v1.2.3 From bf2d3d58dda76e7407b2d10f1dd9c89ce8f17d8f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Mon, 11 Jan 2021 10:35:56 +0100 Subject: Fix docstring The decorator works in reverse to what the docstring explained --- bot/utils/function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/utils/function.py b/bot/utils/function.py index 5fd70e1e8..ab7f45761 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -108,7 +108,7 @@ def command_wraps( assigned: t.Sequence[str] = functools.WRAPPER_ASSIGNMENTS, updated: t.Sequence[str] = 
functools.WRAPPER_UPDATES, ) -> t.Callable[[types.FunctionType], types.FunctionType]: - """Update `wrapped` to look like the decorated function and update globals for discordpy forwardref evaluation.""" + """Update the decorated function to look like `wrapped` and update globals for discordpy forwardref evaluation.""" def decorator(wrapper: types.FunctionType) -> types.FunctionType: return functools.update_wrapper( update_wrapper_globals(wrapper, wrapped), wrapped, assigned, updated -- cgit v1.2.3 From a3145654ab5c90d16f9b4ff53f3df40d7e35f683 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 12:56:57 +0100 Subject: Turn update_single into a normal function The method no longer runs anything asynchronous --- bot/exts/info/doc/_cog.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7bb819987..f008f2c28 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -78,7 +78,7 @@ class DocCog(commands.Cog): await self.bot.wait_until_guild_available() await self.refresh_inventory() - async def update_single(self, api_package_name: str, base_url: str, package: INVENTORY_DICT) -> None: + def update_single(self, api_package_name: str, base_url: str, package: INVENTORY_DICT) -> None: """ Rebuild the inventory for a single package. 
@@ -148,7 +148,7 @@ class DocCog(commands.Cog): return self.scheduled_inventories.discard(api_package_name) - await self.update_single(api_package_name, base_url, package) + self.update_single(api_package_name, base_url, package) def ensure_unique_symbol_name( self, @@ -357,7 +357,7 @@ class DocCog(commands.Cog): + "\n".join(f"{key}: {value}" for key, value in body.items()) ) - await self.update_single(package_name, base_url, inventory_dict) + self.update_single(package_name, base_url, inventory_dict) await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) -- cgit v1.2.3 From a74d7f81f258b4e70221c445b351fe646d385dd5 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 12:57:36 +0100 Subject: Correct return type annotation --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index f008f2c28..ac74e7997 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -124,7 +124,7 @@ class DocCog(commands.Cog): api_package_name: str, base_url: str, inventory_url: str - ) -> Optional[INVENTORY_DICT]: + ) -> None: """ Update the cog's inventory, or reschedule this method to execute again if the remote inventory unreachable. 
-- cgit v1.2.3 From f3323503ff84b67ae2b8d4412001238937b7f684 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 21:28:21 +0100 Subject: Use different task ids for every inventory reschedule attempts The scheduler can't keep track of multiple tasks with the same id, and rescheduling the update task using the same id within an already scheduled update task caused the new task to get ignored as the old task only got deleted from the scheduler after it was finished --- bot/exts/info/doc/_cog.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ac74e7997..43407d5ba 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -66,7 +66,7 @@ class DocCog(commands.Cog): self.renamed_symbols = defaultdict(list) self.inventory_scheduler = Scheduler(self.__class__.__name__) - self.scheduled_inventories = set() + self.inventory_reschedule_attempts = defaultdict(int) self.refresh_event = asyncio.Event() self.refresh_event.set() @@ -134,20 +134,20 @@ class DocCog(commands.Cog): package = await fetch_inventory(inventory_url) if not package: - if inventory_url not in self.scheduled_inventories: + attempt = self.inventory_reschedule_attempts[package] + self.inventory_reschedule_attempts[package] += 1 + if attempt == 0: delay = FETCH_RESCHEDULE_DELAY.first else: delay = FETCH_RESCHEDULE_DELAY.repeated log.info(f"Failed to fetch inventory; attempting again in {delay} minutes.") self.inventory_scheduler.schedule_later( delay*60, - api_package_name, + (attempt, api_package_name), self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url) ) - self.scheduled_inventories.add(api_package_name) return - self.scheduled_inventories.discard(api_package_name) self.update_single(api_package_name, base_url, package) def ensure_unique_symbol_name( @@ -209,6 +209,7 @@ class DocCog(commands.Cog): self.refresh_event.clear() 
log.debug("Refreshing documentation inventory...") self.inventory_scheduler.cancel_all() + self.inventory_reschedule_attempts.clear() # Clear the old base URLS and doc symbols to ensure # that we start from a fresh local dataset. @@ -216,7 +217,6 @@ class DocCog(commands.Cog): self.base_urls.clear() self.doc_symbols.clear() self.renamed_symbols.clear() - self.scheduled_inventories.clear() await self.item_fetcher.clear() # Run all coroutines concurrently - since each of them performs an HTTP -- cgit v1.2.3 From 93ef70f7bcbb638fbdf55fb278cf16c2605db63b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 21:30:18 +0100 Subject: Simplify control flow Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 43407d5ba..eea380fc0 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -146,9 +146,8 @@ class DocCog(commands.Cog): (attempt, api_package_name), self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url) ) - return - - self.update_single(api_package_name, base_url, package) + else: + self.update_single(api_package_name, base_url, package) def ensure_unique_symbol_name( self, -- cgit v1.2.3 From e7b20b90efb50169aecf865168840a319037c776 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 21:30:40 +0100 Subject: Keep track of the init task and cancel it when the cog is unloaded --- bot/exts/info/doc/_cog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index eea380fc0..aa9642016 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -70,7 +70,7 @@ class DocCog(commands.Cog): self.refresh_event = asyncio.Event() self.refresh_event.set() - self.bot.loop.create_task(self.init_refresh_inventory()) + 
self.init_refresh_task = self.bot.loop.create_task(self.init_refresh_inventory()) @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def init_refresh_inventory(self) -> None: @@ -415,4 +415,5 @@ class DocCog(commands.Cog): """Clear scheduled inventories, queued symbols and cleanup task on cog unload.""" self.inventory_scheduler.cancel_all() self.item_fetcher.cleanup_futures_task.cancel() + self.init_refresh_task.cancel() asyncio.create_task(self.item_fetcher.clear()) -- cgit v1.2.3 From a4de9fe294b7626dc81ee191d2d6bce751ad91c7 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 21:31:55 +0100 Subject: Change typehint name casing to PascalCase --- bot/converters.py | 2 +- bot/exts/info/doc/_cog.py | 4 ++-- bot/exts/info/doc/_inventory_parser.py | 11 ++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index 6bbc22c3a..2b383636c 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -187,7 +187,7 @@ class Inventory(Converter): """ @staticmethod - async def convert(ctx: Context, url: str) -> t.Tuple[str, _inventory_parser.INVENTORY_DICT]: + async def convert(ctx: Context, url: str) -> t.Tuple[str, _inventory_parser.InventoryDict]: """Convert url to Intersphinx inventory URL.""" await ctx.trigger_typing() if (inventory := await _inventory_parser.fetch_inventory(url)) is None: diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index aa9642016..51283a67e 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -21,7 +21,7 @@ from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler from . 
import PRIORITY_PACKAGES, doc_cache from ._batch_parser import BatchParser -from ._inventory_parser import INVENTORY_DICT, fetch_inventory +from ._inventory_parser import InventoryDict, fetch_inventory log = logging.getLogger(__name__) @@ -78,7 +78,7 @@ class DocCog(commands.Cog): await self.bot.wait_until_guild_available() await self.refresh_inventory() - def update_single(self, api_package_name: str, base_url: str, package: INVENTORY_DICT) -> None: + def update_single(self, api_package_name: str, base_url: str, package: InventoryDict) -> None: """ Rebuild the inventory for a single package. diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index 886708867..1615f15bd 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -11,9 +11,10 @@ import bot log = logging.getLogger(__name__) FAILED_REQUEST_ATTEMPTS = 3 -INVENTORY_DICT = DefaultDict[str, List[Tuple[str, str]]] _V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') +InventoryDict = DefaultDict[str, List[Tuple[str, str]]] + class ZlibStreamReader: """Class used for decoding zlib data of a stream line by line.""" @@ -43,7 +44,7 @@ class ZlibStreamReader: pos = buf.find(b'\n') -async def _load_v1(stream: aiohttp.StreamReader) -> INVENTORY_DICT: +async def _load_v1(stream: aiohttp.StreamReader) -> InventoryDict: invdata = defaultdict(list) async for line in stream: @@ -59,7 +60,7 @@ async def _load_v1(stream: aiohttp.StreamReader) -> INVENTORY_DICT: return invdata -async def _load_v2(stream: aiohttp.StreamReader) -> INVENTORY_DICT: +async def _load_v2(stream: aiohttp.StreamReader) -> InventoryDict: invdata = defaultdict(list) async for line in ZlibStreamReader(stream): @@ -72,7 +73,7 @@ async def _load_v2(stream: aiohttp.StreamReader) -> INVENTORY_DICT: return invdata -async def _fetch_inventory(url: str) -> INVENTORY_DICT: +async def _fetch_inventory(url: str) -> InventoryDict: """Fetch, parse and return 
an intersphinx inventory file from an url.""" timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) async with bot.instance.http_session.get(url, timeout=timeout, raise_for_status=True) as response: @@ -94,7 +95,7 @@ async def _fetch_inventory(url: str) -> INVENTORY_DICT: raise ValueError(f"Invalid inventory file at url {url}.") -async def fetch_inventory(url: str) -> Optional[INVENTORY_DICT]: +async def fetch_inventory(url: str) -> Optional[InventoryDict]: """ Get an inventory dict from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors. -- cgit v1.2.3 From d972b7800346b4d1ee88c706354bb1c18ba4b725 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 21:33:56 +0100 Subject: Reuse the redis key instead of creating a new string for the expires set --- bot/exts/info/doc/_redis_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index 52cb2bc94..cab51c3f1 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -27,8 +27,8 @@ class DocRedisCache(RedisObject): needs_expire = False with await self._get_pool_connection() as connection: - if item.package+url_key not in self._set_expires: - self._set_expires.add(item.package+url_key) + if redis_key not in self._set_expires: + self._set_expires.add(redis_key) needs_expire = not await connection.exists(redis_key) await connection.hset(redis_key, item.symbol_id, value) -- cgit v1.2.3 From 7342510667ea159fcc83927cb9caee14661c12a8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 13 Jan 2021 23:18:33 +0100 Subject: Set the user_requested attribute at the start of the coroutine A context switch may occur when we're waiting for the web page response, during which a clear could be triggered. 
If the event is not set before that we could end up with the dictionary changing sizes, or if a copy was made, a future that'd never finish as it'd be cleared from the queue and the futures dict --- bot/exts/info/doc/_batch_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index ebae6efb8..4a6d9b544 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -108,6 +108,7 @@ class BatchParser: Not safe to run while `self.clear` is running. """ + self._item_futures[doc_item].user_requested = True if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: async with bot.instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") @@ -123,7 +124,6 @@ class BatchParser: with suppress(ValueError): # If the item is not in the list then the item is already parsed or is being parsed self._move_to_front(doc_item) - self._item_futures[doc_item].user_requested = True return await self._item_futures[doc_item] async def _parse_queue(self) -> None: -- cgit v1.2.3 From 1bdfdac30d27d67d95c49b5b66a0a4de919afa21 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 14 Jan 2021 01:10:48 +0100 Subject: Ensure footer is actually max 100 chars Shortening the renamed symbols string to 100 chars is not accurate as the footer also contains a string before that, subtracting its length fixes this. --- bot/exts/info/doc/_cog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 51283a67e..942d685af 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -262,10 +262,11 @@ class DocCog(commands.Cog): url=f"{symbol_info.url}#{symbol_info.symbol_id}", description=markdown ) - # Show all symbols with the same name that were renamed in the footer. 
+ # Show all symbols with the same name that were renamed in the footer, + # with a max of 100 chars. if symbol in self.renamed_symbols: renamed_symbols = ', '.join(self.renamed_symbols[symbol]) - footer_text = f"Moved: {textwrap.shorten(renamed_symbols, 100, placeholder=' ...')}" + footer_text = f"Moved: {textwrap.shorten(renamed_symbols, 100-7, placeholder=' ...')}" else: footer_text = "" embed.set_footer(text=footer_text) -- cgit v1.2.3 From e86e9f921a4bbbe42a5fb6fd8486425f11af62cf Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 14 Jan 2021 05:00:22 +0100 Subject: Raise an error or log a warning if there's a global name conflict When wrapper uses a global name, which conflicts with a global name from wrapped's module that wrapped uses for its annotations, we run into a situation that can't be solved without changing one of the names, so an error is raised to give this clearer meaning. The check may be erroneous in some edge cases or the objects the conflicting names refer to can be functionally identical, so the error can be turned into a logged warning. --- bot/utils/function.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/bot/utils/function.py b/bot/utils/function.py index ab7f45761..4fa7a9f60 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -2,15 +2,22 @@ import functools import inspect +import logging import types import typing as t +log = logging.getLogger(__name__) + Argument = t.Union[int, str] BoundArgs = t.OrderedDict[str, t.Any] Decorator = t.Callable[[t.Callable], t.Callable] ArgValGetter = t.Callable[[BoundArgs], t.Any] +class GlobalNameConflictError(Exception): + """Raised when there's a conflict between the globals used to resolve annotations of wrapped and its wrapper.""" + + def get_arg_value(name_or_pos: Argument, arguments: BoundArgs) -> t.Any: """ Return a value from `arguments` based on a name or position. 
@@ -77,7 +84,12 @@ def get_bound_args(func: t.Callable, args: t.Tuple, kwargs: t.Dict[str, t.Any]) return bound_args.arguments -def update_wrapper_globals(wrapper: types.FunctionType, wrapped: types.FunctionType) -> types.FunctionType: +def update_wrapper_globals( + wrapper: types.FunctionType, + wrapped: types.FunctionType, + *, + error_on_conflict: bool = True, +) -> types.FunctionType: """ Update globals of `wrapper` with the globals from `wrapped`. @@ -88,10 +100,26 @@ def update_wrapper_globals(wrapper: types.FunctionType, wrapped: types.FunctionT This function creates a new function functionally identical to `wrapper`, which has the globals replaced with a merge of `wrapped`s globals and the `wrapper`s globals. - In case a global name from `wrapped` conflicts with a name from `wrapper`'s globals, `wrapper` will win - to keep it functional, but this may cause problems if the name is used as an annotation and - discord.py uses it as a converter on a parameter from `wrapped`. + If `error_on_conflict` is True, an exception will be raised in case `wrapper` and `wrapped` share a global name + that is used by `wrapped`'s typehints, as this can cause incorrect objects being used by discordpy's converters. + The error can be turned into a warning by setting the argument to False. """ + forwardrefs = (ann for ann in wrapped.__annotations__.values() if isinstance(ann, str)) + annotation_global_names = (ann.split(".", maxsplit=1)[0] for ann in forwardrefs) + # Conflicting globals from both functions' modules that are also used in the wrapper and in wrapped's annotations. + shared_globals = set(wrapper.__code__.co_names) & set(annotation_global_names) + shared_globals &= set(wrapped.__globals__) & set(wrapper.__globals__) + if shared_globals: + message = ( + f"wrapper and the wrapped function share the following " + f"global names used by annotations: {', '.join(shared_globals)}. 
" + f"Resolve the conflicts or pass error_on_conflict=False to suppress this error if this is intentional." + ) + if error_on_conflict: + raise GlobalNameConflictError(message) + else: + log.info(message) + new_globals = wrapper.__globals__.copy() new_globals.update((k, v) for k, v in wrapped.__globals__.items() if k not in wrapper.__code__.co_names) return types.FunctionType( @@ -107,11 +135,13 @@ def command_wraps( wrapped: types.FunctionType, assigned: t.Sequence[str] = functools.WRAPPER_ASSIGNMENTS, updated: t.Sequence[str] = functools.WRAPPER_UPDATES, + *, + error_on_conflict: bool = True, ) -> t.Callable[[types.FunctionType], types.FunctionType]: """Update the decorated function to look like `wrapped` and update globals for discordpy forwardref evaluation.""" def decorator(wrapper: types.FunctionType) -> types.FunctionType: return functools.update_wrapper( - update_wrapper_globals(wrapper, wrapped), wrapped, assigned, updated + update_wrapper_globals(wrapper, wrapped, error_on_conflict=error_on_conflict), wrapped, assigned, updated ) return decorator -- cgit v1.2.3 From b1250515e7d6d3545bcfd850c6286c69239cb420 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 14 Jan 2021 05:17:07 +0100 Subject: Prevent an inventory refresh while waiting for item cache If an inventory refresh was started while the symbol embed coroutine was suspended, it could cause the parser to try to fetch a non existent future if the markdown was requested after it was cleared but before new inventories were loaded in. 
--- bot/exts/info/doc/_cog.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 942d685af..7b9dad135 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -70,6 +70,9 @@ class DocCog(commands.Cog): self.refresh_event = asyncio.Event() self.refresh_event.set() + self.symbol_get_event = asyncio.Event() + self.symbol_get_event.set() + self.init_refresh_task = self.bot.loop.create_task(self.init_refresh_inventory()) @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) @@ -206,6 +209,7 @@ class DocCog(commands.Cog): async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" self.refresh_event.clear() + await self.symbol_get_event.wait() log.debug("Refreshing documentation inventory...") self.inventory_scheduler.cancel_all() self.inventory_reschedule_attempts.clear() @@ -248,7 +252,10 @@ class DocCog(commands.Cog): return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package}") + self.symbol_get_event.clear() markdown = await doc_cache.get(symbol_info) + self.symbol_get_event.set() + if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") markdown = await self.item_fetcher.get_markdown(symbol_info) -- cgit v1.2.3 From f1103aeade13f964282154d5d1597b81188ce98f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 15 Jan 2021 23:11:57 +0100 Subject: Use a defaultdict for item futures To be able to set the attribute at the start of the coro we need to be able to access the item's future before we know about all the other items. 
This also saves us from having to add them all as the queue parser or get_markdown will create the futures for us dynamically --- bot/exts/info/doc/_batch_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 4a6d9b544..606c5d803 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -92,7 +92,7 @@ class BatchParser: def __init__(self): self._queue: List[QueueItem] = [] self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_futures: Dict[DocItem, ParseResultFuture] = {} + self._item_futures: Dict[DocItem, ParseResultFuture] = defaultdict(ParseResultFuture) self._parse_task = None self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures()) @@ -114,7 +114,6 @@ class BatchParser: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) - self._item_futures.update((symbol, ParseResultFuture()) for symbol in symbols_to_queue) del self._page_symbols[doc_item.url] log.debug(f"Added symbols from {doc_item.url} to parse queue.") @@ -168,6 +167,7 @@ class BatchParser: queue_item = self._queue.pop(item_index) self._queue.append(queue_item) + log.trace(f"Moved {item} to the front of the queue.") def add_item(self, doc_item: DocItem) -> None: """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" -- cgit v1.2.3 From 69c98d95b436063684d5d004aead85ba3b9514ef Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 21 Jan 2021 03:51:32 +0100 Subject: Use inspect.unwrap instead of manually unwrapping --- bot/exts/info/source.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bot/exts/info/source.py b/bot/exts/info/source.py index ae68ef7e8..f03b6a46f 100644 --- a/bot/exts/info/source.py +++ b/bot/exts/info/source.py 
@@ -68,9 +68,7 @@ class BotSource(commands.Cog): Raise BadArgument if `source_item` is a dynamically-created object (e.g. via internal eval). """ if isinstance(source_item, commands.Command): - source_item = source_item.callback - while hasattr(source_item, "__wrapped__"): - source_item = source_item.__wrapped__ + source_item = inspect.unwrap(source_item.callback) src = source_item.__code__ filename = src.co_filename elif isinstance(source_item, str): -- cgit v1.2.3 From 72a805c779b79ef5c0aeed7a9dd4b2096e3b35c9 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 22 Jan 2021 11:35:00 +0100 Subject: Fix issues with multiple gets being suspended when a refresh starts With a normal event, if multiple gets were suspended and a refresh started, we'd continue the refresh after the first get finished and set the event which would be the same behaviour as the one it tried to fix. This is avoided by using a counter that's incremented every time a context manager is entered around an event and only setting the event when that counter reaches a zero after everything exited the context mgr --- bot/exts/info/doc/_cog.py | 10 ++++------ bot/utils/lock.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7b9dad135..26694ae55 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -16,7 +16,7 @@ from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput from bot.converters import Inventory, PackageName, ValidURL from bot.pagination import LinePaginator -from bot.utils.lock import lock +from bot.utils.lock import SharedEvent, lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler from . 
import PRIORITY_PACKAGES, doc_cache @@ -70,8 +70,7 @@ class DocCog(commands.Cog): self.refresh_event = asyncio.Event() self.refresh_event.set() - self.symbol_get_event = asyncio.Event() - self.symbol_get_event.set() + self.symbol_get_event = SharedEvent() self.init_refresh_task = self.bot.loop.create_task(self.init_refresh_inventory()) @@ -252,9 +251,8 @@ class DocCog(commands.Cog): return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package}") - self.symbol_get_event.clear() - markdown = await doc_cache.get(symbol_info) - self.symbol_get_event.set() + with self.symbol_get_event: + markdown = await doc_cache.get(symbol_info) if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol}`.") diff --git a/bot/utils/lock.py b/bot/utils/lock.py index 997c653a1..b4bb0ebc7 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -1,3 +1,4 @@ +import asyncio import inspect import logging import types @@ -18,6 +19,35 @@ _IdCallable = Callable[[function.BoundArgs], _IdCallableReturn] ResourceId = Union[Hashable, _IdCallable] +class SharedEvent: + """ + Context manager managing an internal event exposed through the wait coro. + + While any code is executing in this context manager, the underyling event will not be set; + when all of the holders finish the event will be set. + """ + + def __init__(self): + self._active_count = 0 + self._event = asyncio.Event() + self._event.set() + + def __enter__(self): + """Increment the count of the active holders and clear the internal event.""" + self._active_count += 1 + self._event.clear() + + def __exit__(self, _exc_type, _exc_val, _exc_tb): # noqa: ANN001 + """Decrement the count of the active holders; if 0 is reached set the internal event.""" + self._active_count -= 1 + if not self._active_count: + self._event.set() + + async def wait(self) -> None: + """Wait for all active holders to exit.""" + await self._event.wait() + + class LockGuard: """ A context manager which acquires and releases a lock (mutex). 
-- cgit v1.2.3 From 59ca1cbed6bcf234b9eb277da291bdaeb259e939 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 04:36:26 +0100 Subject: Properly handle cache being cleared Previously the code deleted the entry of all of the DocItems of the page after its contents were requested once, but this caused problems when the cache was cleared when it expired. Instead of deleting the entry to check if it should be queued on the next item request, we keep it and create an entry in the _item_futures dict for all items again and check for containment there. To avoid populating the queue multiple times with the same item in some cases the futures cleanup task will now only run when the queue is empty --- bot/exts/info/doc/_batch_parser.py | 25 +++++++++++++++---------- bot/exts/info/doc/_cog.py | 4 +--- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 606c5d803..42d81e98c 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -14,6 +14,7 @@ from bs4 import BeautifulSoup import bot from bot.constants import Channels +from bot.utils.lock import lock_arg from . import doc_cache from ._parsing import get_symbol_markdown if TYPE_CHECKING: @@ -92,13 +93,14 @@ class BatchParser: def __init__(self): self._queue: List[QueueItem] = [] self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_futures: Dict[DocItem, ParseResultFuture] = defaultdict(ParseResultFuture) + self._item_futures: Dict[DocItem, ParseResultFuture] = {} self._parse_task = None self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures()) self.stale_inventory_notifier = StaleInventoryNotifier() + @lock_arg("doc.get_markdown", "doc_item", attrgetter("url"), wait=True) async def get_markdown(self, doc_item: DocItem) -> str: """ Get the result Markdown of `doc_item`. 
@@ -108,18 +110,20 @@ class BatchParser: Not safe to run while `self.clear` is running. """ - self._item_futures[doc_item].user_requested = True - if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: + if doc_item not in self._item_futures: + self._item_futures.update((symbol, ParseResultFuture()) for symbol in self._page_symbols[doc_item.url]) + self._item_futures[doc_item].user_requested = True + async with bot.instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") - self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) - del self._page_symbols[doc_item.url] + self._queue.extend(QueueItem(symbol, soup) for symbol in self._page_symbols[doc_item.url]) log.debug(f"Added symbols from {doc_item.url} to parse queue.") if self._parse_task is None: self._parse_task = asyncio.create_task(self._parse_queue()) - + else: + self._item_futures[doc_item].user_requested = True with suppress(ValueError): # If the item is not in the list then the item is already parsed or is being parsed self._move_to_front(doc_item) @@ -196,8 +200,9 @@ class BatchParser: Keeping them around for longer than a second is unnecessary and keeps the parsed Markdown strings alive. 
""" while True: - current_time = time.time() - for key, future in self._item_futures.copy().items(): - if current_time - future.result_set_time > 5: - del self._item_futures[key] + if not self._queue: + current_time = time.time() + for key, future in self._item_futures.copy().items(): + if current_time - future.result_set_time > 5: + del self._item_futures[key] await asyncio.sleep(5) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 26694ae55..c3458d776 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -408,12 +408,10 @@ class DocCog(commands.Cog): @docs_group.command(name="cleardoccache") @commands.has_any_role(*MODERATION_ROLES) - @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) async def clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: """Clear the persistent redis cache for `package`.""" if await doc_cache.delete(package_name): - await self.refresh_inventory() - await ctx.send(f"Successfully cleared the cache for `{package_name}` and refreshed the inventories.") + await ctx.send(f"Successfully cleared the cache for `{package_name}`.") else: await ctx.send("No keys matching the package found.") -- cgit v1.2.3 From c2e3d6fac2ac615dea230671068790d8c9df71ba Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 04:45:44 +0100 Subject: Create a namespace var --- bot/exts/info/doc/__init__.py | 3 ++- bot/exts/info/doc/_batch_parser.py | 4 ++-- bot/exts/info/doc/_cog.py | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py index 2bb43a950..38a8975c0 100644 --- a/bot/exts/info/doc/__init__.py +++ b/bot/exts/info/doc/__init__.py @@ -5,8 +5,9 @@ MAX_SIGNATURE_AMOUNT = 3 PRIORITY_PACKAGES = ( "python", ) +NAMESPACE = "doc" -doc_cache = DocRedisCache(namespace="Docs") +doc_cache = DocRedisCache(namespace=NAMESPACE) def setup(bot: Bot) -> None: diff 
--git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 42d81e98c..872f08ea9 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -15,7 +15,7 @@ from bs4 import BeautifulSoup import bot from bot.constants import Channels from bot.utils.lock import lock_arg -from . import doc_cache +from . import NAMESPACE, doc_cache from ._parsing import get_symbol_markdown if TYPE_CHECKING: from ._cog import DocItem @@ -100,7 +100,7 @@ class BatchParser: self.stale_inventory_notifier = StaleInventoryNotifier() - @lock_arg("doc.get_markdown", "doc_item", attrgetter("url"), wait=True) + @lock_arg(NAMESPACE, "doc_item", attrgetter("url"), wait=True) async def get_markdown(self, doc_item: DocItem) -> str: """ Get the result Markdown of `doc_item`. diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index c3458d776..430e8ebcb 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -19,7 +19,7 @@ from bot.pagination import LinePaginator from bot.utils.lock import SharedEvent, lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler -from . import PRIORITY_PACKAGES, doc_cache +from . 
import NAMESPACE, PRIORITY_PACKAGES, doc_cache from ._batch_parser import BatchParser from ._inventory_parser import InventoryDict, fetch_inventory @@ -74,7 +74,7 @@ class DocCog(commands.Cog): self.init_refresh_task = self.bot.loop.create_task(self.init_refresh_inventory()) - @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) + @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" await self.bot.wait_until_guild_available() @@ -330,7 +330,7 @@ class DocCog(commands.Cog): @docs_group.command(name='setdoc', aliases=('s',)) @commands.has_any_role(*MODERATION_ROLES) - @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) + @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def set_command( self, ctx: commands.Context, @@ -367,7 +367,7 @@ class DocCog(commands.Cog): @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) - @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) + @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: """ Removes the specified package from the database. 
@@ -386,7 +386,7 @@ class DocCog(commands.Cog): @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @commands.has_any_role(*MODERATION_ROLES) - @lock("doc", COMMAND_LOCK_SINGLETON, raise_error=True) + @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def refresh_command(self, ctx: commands.Context) -> None: """Refresh inventories and show the difference.""" old_inventories = set(self.base_urls) -- cgit v1.2.3 From 1b9aee6239aef1b0a3ce016145c1212e892f7d22 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 05:50:34 +0100 Subject: lock markdownify version to 0.6.1 the 0.6.3 update brought a change that fails to ignore newlines in the html, introducing unnecessary lines into the output --- Pipfile | 2 +- Pipfile.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Pipfile b/Pipfile index 54bd46ce8..2e76d2ede 100644 --- a/Pipfile +++ b/Pipfile @@ -18,7 +18,7 @@ deepdiff = "~=4.0" feedparser = "~=5.2" fuzzywuzzy = "~=0.17" lxml = "~=4.4" -markdownify = "~=0.6.1" +markdownify = "==0.6.1" more_itertools = "~=8.2" python-dateutil = "~=2.8" pyyaml = "~=5.1" diff --git a/Pipfile.lock b/Pipfile.lock index 5aff33383..ec88e5530 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "33874d325a918682da3ae4d833748263695836d0cda4c1b0627ce5a5f29746e5" + "sha256": "cd61b7be63278d2f5b073e98c507aa50affe97e590bb25e37c521754e65bc110" }, "pipfile-spec": 6, "requires": { @@ -376,11 +376,11 @@ }, "markdownify": { "hashes": [ - "sha256:2147197d9c45cdd24d57302b94e01cac44988862960ac42eba730345a31aebbc", - "sha256:3de08764db001e7119cb06481de4ec0b2ea0338fd26cf49bdf16c4475ef44b81" + "sha256:31d7c13ac2ada8bfc7535a25fee6622ca720e1b5f2d4a9cbc429d167c21f886d", + "sha256:7489fd5c601536996a376c4afbcd1dd034db7690af807120681461e82fbc0acc" ], "index": "pypi", - "version": "==0.6.3" + "version": "==0.6.1" }, "more-itertools": { "hashes": [ -- cgit v1.2.3 From 
f0b468d9c22eea43e36cd14960c23cb2c30cb335 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 06:08:01 +0100 Subject: Avoid errors when the first element is longer than the truncation index --- bot/exts/info/doc/_parsing.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 45a81a4cb..0251b0105 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -195,9 +195,16 @@ def _get_truncated_description( return result # Determine the actual truncation index. - # Truncate at the last Markdown element that comes before the truncation index. - markdown_truncate_index = max(cut for cut in markdown_element_ends if cut < truncate_index) - return result[:markdown_truncate_index].strip(_TRUNCATE_STRIP_CHARACTERS) + "..." + possible_truncation_indices = [cut for cut in markdown_element_ends if cut < truncate_index] + if not possible_truncation_indices: + # In case there is no Markdown element ending before the truncation index, use shorten as a fallback. + truncated_result = textwrap.shorten(result, truncate_index) + else: + # Truncate at the last Markdown element that comes before the truncation index. + markdown_truncate_index = max(possible_truncation_indices) + truncated_result = result[:markdown_truncate_index] + + return truncated_result.strip(_TRUNCATE_STRIP_CHARACTERS) + "..." 
def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: -- cgit v1.2.3 From f91c1595c80fe68a4e9261ce5277f8d2e94ccfa2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 06:17:41 +0100 Subject: Wrap whole string in shorten This helps avoid subtracting the length of "Moved: " from the shorten index Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 430e8ebcb..a074d8daa 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -271,7 +271,7 @@ class DocCog(commands.Cog): # with a max of 100 chars. if symbol in self.renamed_symbols: renamed_symbols = ', '.join(self.renamed_symbols[symbol]) - footer_text = f"Moved: {textwrap.shorten(renamed_symbols, 100-7, placeholder=' ...')}" + footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') else: footer_text = "" embed.set_footer(text=footer_text) -- cgit v1.2.3 From 9695d7d8022729efe8fab36eb7ef854aeece8163 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 23 Jan 2021 06:38:17 +0100 Subject: Cancel current task inside coro to not keep track of attempts manually The scheduler shields the coroutine from cancellation so we can cancel the scheduler's tasks inside of it to avoid the error from multiple tasks with the same id trying to be scheduled which the manual tracking of attempts solved Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index a074d8daa..df076f162 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -66,7 +66,6 @@ class DocCog(commands.Cog): self.renamed_symbols = defaultdict(list) self.inventory_scheduler = 
Scheduler(self.__class__.__name__) - self.inventory_reschedule_attempts = defaultdict(int) self.refresh_event = asyncio.Event() self.refresh_event.set() @@ -136,16 +135,15 @@ class DocCog(commands.Cog): package = await fetch_inventory(inventory_url) if not package: - attempt = self.inventory_reschedule_attempts[package] - self.inventory_reschedule_attempts[package] += 1 - if attempt == 0: - delay = FETCH_RESCHEDULE_DELAY.first - else: + if api_package_name in self.inventory_scheduler: + self.inventory_scheduler.cancel(api_package_name) delay = FETCH_RESCHEDULE_DELAY.repeated + else: + delay = FETCH_RESCHEDULE_DELAY.first log.info(f"Failed to fetch inventory; attempting again in {delay} minutes.") self.inventory_scheduler.schedule_later( delay*60, - (attempt, api_package_name), + api_package_name, self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url) ) else: @@ -211,7 +209,6 @@ class DocCog(commands.Cog): await self.symbol_get_event.wait() log.debug("Refreshing documentation inventory...") self.inventory_scheduler.cancel_all() - self.inventory_reschedule_attempts.clear() # Clear the old base URLS and doc symbols to ensure # that we start from a fresh local dataset. -- cgit v1.2.3 From 82f1c37cff8213963d7950240bc770bec63472dc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 30 Jan 2021 10:06:02 +0100 Subject: Require a set of names to ignore instead of a blanket ignoring As work is done on the modules the wrapper and wrapped functions are in more conflicts can occur, previously this could be missed as the info log that was done in case the error was suppressed was done when modules were being initialized during which there is a logging spam. 
--- bot/utils/function.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bot/utils/function.py b/bot/utils/function.py index 4fa7a9f60..9bc44e753 100644 --- a/bot/utils/function.py +++ b/bot/utils/function.py @@ -88,7 +88,7 @@ def update_wrapper_globals( wrapper: types.FunctionType, wrapped: types.FunctionType, *, - error_on_conflict: bool = True, + ignored_conflict_names: t.Set[str] = frozenset(), ) -> types.FunctionType: """ Update globals of `wrapper` with the globals from `wrapped`. @@ -100,25 +100,22 @@ def update_wrapper_globals( This function creates a new function functionally identical to `wrapper`, which has the globals replaced with a merge of `wrapped`s globals and the `wrapper`s globals. - If `error_on_conflict` is True, an exception will be raised in case `wrapper` and `wrapped` share a global name - that is used by `wrapped`'s typehints, as this can cause incorrect objects being used by discordpy's converters. - The error can be turned into a warning by setting the argument to False. + An exception will be raised in case `wrapper` and `wrapped` share a global name that is used by + `wrapped`'s typehints and is not in `ignored_conflict_names`, + as this can cause incorrect objects being used by discordpy's converters. """ - forwardrefs = (ann for ann in wrapped.__annotations__.values() if isinstance(ann, str)) - annotation_global_names = (ann.split(".", maxsplit=1)[0] for ann in forwardrefs) + annotation_global_names = ( + ann.split(".", maxsplit=1)[0] for ann in wrapped.__annotations__.values() if isinstance(ann, str) + ) # Conflicting globals from both functions' modules that are also used in the wrapper and in wrapped's annotations. 
shared_globals = set(wrapper.__code__.co_names) & set(annotation_global_names) - shared_globals &= set(wrapped.__globals__) & set(wrapper.__globals__) + shared_globals &= set(wrapped.__globals__) & set(wrapper.__globals__) - ignored_conflict_names if shared_globals: - message = ( + raise GlobalNameConflictError( f"wrapper and the wrapped function share the following " - f"global names used by annotations: {', '.join(shared_globals)}. " - f"Resolve the conflicts or pass error_on_conflict=False to suppress this error if this is intentional." + f"global names used by annotations: {', '.join(shared_globals)}. Resolve the conflicts or add " + f"the name to the `ignored_conflict_names` set to suppress this error if this is intentional." ) - if error_on_conflict: - raise GlobalNameConflictError(message) - else: - log.info(message) new_globals = wrapper.__globals__.copy() new_globals.update((k, v) for k, v in wrapped.__globals__.items() if k not in wrapper.__code__.co_names) @@ -136,12 +133,15 @@ def command_wraps( assigned: t.Sequence[str] = functools.WRAPPER_ASSIGNMENTS, updated: t.Sequence[str] = functools.WRAPPER_UPDATES, *, - error_on_conflict: bool = True, + ignored_conflict_names: t.Set[str] = frozenset(), ) -> t.Callable[[types.FunctionType], types.FunctionType]: """Update the decorated function to look like `wrapped` and update globals for discordpy forwardref evaluation.""" def decorator(wrapper: types.FunctionType) -> types.FunctionType: return functools.update_wrapper( - update_wrapper_globals(wrapper, wrapped, error_on_conflict=error_on_conflict), wrapped, assigned, updated + update_wrapper_globals(wrapper, wrapped, ignored_conflict_names=ignored_conflict_names), + wrapped, + assigned, + updated, ) return decorator -- cgit v1.2.3 From aace6002d8587b5a79c1ba456ac045a7351152dd Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 2 Feb 2021 21:53:44 +0100 Subject: Attempt to resolve first part of the argument in case of 
a failed lookup --- bot/exts/info/doc/_cog.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index df076f162..16baa6320 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -244,8 +244,18 @@ class DocCog(commands.Cog): symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: - log.debug("Symbol does not exist.") - return None + if symbol.count(" "): + # If an invalid symbol contains a space, check if the command was invoked + # in the format !d + symbol = symbol.split(" ", maxsplit=1)[0] + symbol_info = self.doc_symbols.get(symbol) + if symbol_info is None: + log.debug("Symbol does not exist.") + return None + else: + log.debug("Symbol does not exist.") + return None + self.bot.stats.incr(f"doc_fetches.{symbol_info.package}") with self.symbol_get_event: -- cgit v1.2.3 From fd67924e9fb606626800bca81c2dd159cbf5c93b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 10:19:52 +0100 Subject: Import module directly to avoid circular dependency and use of type --- bot/exts/info/doc/_batch_parser.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 872f08ea9..072545e66 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -7,7 +7,7 @@ from collections import defaultdict from contextlib import suppress from functools import partial from operator import attrgetter -from typing import Dict, List, NamedTuple, TYPE_CHECKING, Union +from typing import Dict, List, NamedTuple, Union import discord from bs4 import BeautifulSoup @@ -15,10 +15,8 @@ from bs4 import BeautifulSoup import bot from bot.constants import Channels from bot.utils.lock import lock_arg -from . import NAMESPACE, doc_cache +from . 
import NAMESPACE, _cog, doc_cache from ._parsing import get_symbol_markdown -if TYPE_CHECKING: - from ._cog import DocItem log = logging.getLogger(__name__) @@ -35,7 +33,7 @@ class StaleInventoryNotifier: await bot.instance.wait_until_guild_available() self._dev_log = bot.instance.get_channel(Channels.dev_log) - async def send_warning(self, item: DocItem) -> None: + async def send_warning(self, item: _cog.DocItem) -> None: """Send a warning to dev log is one wasn't already sent for `item`'s url.""" if item.url not in self._warned_urls: self._warned_urls.add(item.url) @@ -50,11 +48,11 @@ class StaleInventoryNotifier: class QueueItem(NamedTuple): """Contains a symbol and the BeautifulSoup object needed to parse it.""" - symbol: DocItem + symbol: _cog.DocItem soup: BeautifulSoup - def __eq__(self, other: Union[QueueItem, DocItem]): - if isinstance(other, type(self.symbol)): + def __eq__(self, other: Union[QueueItem, _cog.DocItem]): + if isinstance(other, _cog.DocItem): return self.symbol == other return NamedTuple.__eq__(self, other) @@ -92,8 +90,8 @@ class BatchParser: def __init__(self): self._queue: List[QueueItem] = [] - self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) - self._item_futures: Dict[DocItem, ParseResultFuture] = {} + self._page_symbols: Dict[str, List[_cog.DocItem]] = defaultdict(list) + self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = {} self._parse_task = None self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures()) @@ -101,7 +99,7 @@ class BatchParser: self.stale_inventory_notifier = StaleInventoryNotifier() @lock_arg(NAMESPACE, "doc_item", attrgetter("url"), wait=True) - async def get_markdown(self, doc_item: DocItem) -> str: + async def get_markdown(self, doc_item: _cog.DocItem) -> str: """ Get the result Markdown of `doc_item`. 
@@ -163,7 +161,7 @@ class BatchParser: self._parse_task = None log.trace("Finished parsing queue.") - def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + def _move_to_front(self, item: Union[QueueItem, _cog.DocItem]) -> None: """Move `item` to the front of the parse queue.""" # The parse queue stores soups along with the doc symbols in QueueItem objects, # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. @@ -173,7 +171,7 @@ class BatchParser: self._queue.append(queue_item) log.trace(f"Moved {item} to the front of the queue.") - def add_item(self, doc_item: DocItem) -> None: + def add_item(self, doc_item: _cog.DocItem) -> None: """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" self._page_symbols[doc_item.url].append(doc_item) -- cgit v1.2.3 From 1df9459c2c9c33f3e6dbcbd1b76415610aade192 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 11:49:53 +0100 Subject: Use more consistent naming for doc items and symbols Trailing commas were also added where missing --- bot/exts/info/doc/_batch_parser.py | 32 +++++------ bot/exts/info/doc/_cog.py | 108 ++++++++++++++++++------------------- bot/exts/info/doc/_parsing.py | 2 +- 3 files changed, 71 insertions(+), 71 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 072545e66..2edf05ff0 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -33,27 +33,27 @@ class StaleInventoryNotifier: await bot.instance.wait_until_guild_available() self._dev_log = bot.instance.get_channel(Channels.dev_log) - async def send_warning(self, item: _cog.DocItem) -> None: + async def send_warning(self, doc_item: _cog.DocItem) -> None: """Send a warning to dev log is one wasn't already sent for `item`'s url.""" - if item.url not in self._warned_urls: - self._warned_urls.add(item.url) + if doc_item.url 
not in self._warned_urls: + self._warned_urls.add(doc_item.url) await self._init_task embed = discord.Embed( - description=f"Doc item `{item.symbol_id=}` present in loaded documentation inventories " - f"not found on [site]({item.url}), inventories may need to be refreshed." + description=f"Doc item `{doc_item.symbol_id=}` present in loaded documentation inventories " + f"not found on [site]({doc_item.url}), inventories may need to be refreshed." ) await self._dev_log.send(embed=embed) class QueueItem(NamedTuple): - """Contains a symbol and the BeautifulSoup object needed to parse it.""" + """Contains a doc_item and the BeautifulSoup object needed to parse it.""" - symbol: _cog.DocItem + doc_item: _cog.DocItem soup: BeautifulSoup def __eq__(self, other: Union[QueueItem, _cog.DocItem]): if isinstance(other, _cog.DocItem): - return self.symbol == other + return self.doc_item == other return NamedTuple.__eq__(self, other) @@ -83,14 +83,14 @@ class BatchParser: """ Get the Markdown of all symbols on a page and send them to redis when a symbol is requested. - DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. + DocItems are added through the `add_item` method which adds them to the `_page_doc_items` dict. `get_markdown` is used to fetch the Markdown; when this is used for the first time on a page, all of the symbols are queued to be parsed to avoid multiple web requests to the same page. """ def __init__(self): self._queue: List[QueueItem] = [] - self._page_symbols: Dict[str, List[_cog.DocItem]] = defaultdict(list) + self._page_doc_items: Dict[str, List[_cog.DocItem]] = defaultdict(list) self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = {} self._parse_task = None @@ -109,14 +109,14 @@ class BatchParser: Not safe to run while `self.clear` is running. 
""" if doc_item not in self._item_futures: - self._item_futures.update((symbol, ParseResultFuture()) for symbol in self._page_symbols[doc_item.url]) + self._item_futures.update((item, ParseResultFuture()) for item in self._page_doc_items[doc_item.url]) self._item_futures[doc_item].user_requested = True async with bot.instance.http_session.get(doc_item.url) as response: soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") - self._queue.extend(QueueItem(symbol, soup) for symbol in self._page_symbols[doc_item.url]) - log.debug(f"Added symbols from {doc_item.url} to parse queue.") + self._queue.extend(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) + log.debug(f"Added items from {doc_item.url} to parse queue.") if self._parse_task is None: self._parse_task = asyncio.create_task(self._parse_queue()) @@ -139,7 +139,7 @@ class BatchParser: item, soup = self._queue.pop() try: if (future := self._item_futures[item]).done(): - # Some items are present in the inventories multiple times under different symbols, + # Some items are present in the inventories multiple times under different symbol names, # if we already parsed an equal item, we can just skip it. 
continue @@ -173,7 +173,7 @@ class BatchParser: def add_item(self, doc_item: _cog.DocItem) -> None: """Map a DocItem to its page so that the symbol will be parsed once the page is requested.""" - self._page_symbols[doc_item.url].append(doc_item) + self._page_doc_items[doc_item.url].append(doc_item) async def clear(self) -> None: """ @@ -186,7 +186,7 @@ class BatchParser: if self._parse_task is not None: self._parse_task.cancel() self._queue.clear() - self._page_symbols.clear() + self._page_doc_items.clear() self._item_futures.clear() async def _cleanup_futures(self) -> None: diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 16baa6320..0ff775ac7 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -92,31 +92,31 @@ class DocCog(commands.Cog): self.base_urls[api_package_name] = base_url for group, items in package.items(): - for symbol, relative_doc_url in items: + for symbol_name, relative_doc_url in items: # e.g. get 'class' from 'py:class' group_name = group.split(":")[1] - if (original_symbol := self.doc_symbols.get(symbol)) is not None: + if (original_item := self.doc_symbols.get(symbol_name)) is not None: replaced_symbol_name = self.ensure_unique_symbol_name( api_package_name, group_name, - original_symbol, - symbol, + original_item, + symbol_name, ) if replaced_symbol_name is not None: - symbol = replaced_symbol_name + symbol_name = replaced_symbol_name relative_url_path, _, symbol_id = relative_doc_url.partition("#") # Intern fields that have shared content so we're not storing unique strings for every object - symbol_item = DocItem( + doc_item = DocItem( api_package_name, sys.intern(group_name), base_url, sys.intern(relative_url_path), - symbol_id + symbol_id, ) - self.doc_symbols[symbol] = symbol_item - self.item_fetcher.add_item(symbol_item) + self.doc_symbols[symbol_name] = doc_item + self.item_fetcher.add_item(doc_item) log.trace(f"Fetched inventory for {api_package_name}.") @@ -124,7 +124,7 @@ class 
DocCog(commands.Cog): self, api_package_name: str, base_url: str, - inventory_url: str + inventory_url: str, ) -> None: """ Update the cog's inventory, or reschedule this method to execute again if the remote inventory unreachable. @@ -144,7 +144,7 @@ class DocCog(commands.Cog): self.inventory_scheduler.schedule_later( delay*60, api_package_name, - self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url) + self.update_or_reschedule_inventory(api_package_name, base_url, inventory_url), ) else: self.update_single(api_package_name, base_url, package) @@ -154,7 +154,7 @@ class DocCog(commands.Cog): package_name: str, group_name: str, original_item: DocItem, - symbol_name: str + symbol_name: str, ) -> Optional[str]: """ Ensure `symbol_name` doesn't overwrite an another symbol in `doc_symbols`. @@ -166,42 +166,42 @@ class DocCog(commands.Cog): """ # Certain groups are added as prefixes to disambiguate the symbols. if group_name in FORCE_PREFIX_GROUPS: - new_symbol = f"{group_name}.{symbol_name}" - if new_symbol in self.doc_symbols: + new_symbol_name = f"{group_name}.{symbol_name}" + if new_symbol_name in self.doc_symbols: # If there's still a conflict, prefix with package name. - new_symbol = f"{package_name}.{new_symbol}" - self.renamed_symbols[symbol_name].append(new_symbol) - return new_symbol + new_symbol_name = f"{package_name}.{new_symbol_name}" + self.renamed_symbols[symbol_name].append(new_symbol_name) + return new_symbol_name # The existing symbol with which the current symbol conflicts should have a group prefix. # It currently doesn't have the group prefix because it's only added once there's a conflict. 
elif (original_symbol_group := original_item.group) in FORCE_PREFIX_GROUPS: - overridden_symbol = f"{original_symbol_group}.{symbol_name}" - if overridden_symbol in self.doc_symbols: + overridden_symbol_name = f"{original_symbol_group}.{symbol_name}" + if overridden_symbol_name in self.doc_symbols: # If there's still a conflict, prefix with package name. - overridden_symbol = f"{original_item.package}.{overridden_symbol}" + overridden_symbol_name = f"{original_item.package}.{overridden_symbol_name}" - self.doc_symbols[overridden_symbol] = original_item - self.renamed_symbols[symbol_name].append(overridden_symbol) + self.doc_symbols[overridden_symbol_name] = original_item + self.renamed_symbols[symbol_name].append(overridden_symbol_name) elif package_name in PRIORITY_PACKAGES: - overridden_symbol = f"{original_item.package}.{symbol_name}" - if overridden_symbol in self.doc_symbols: + overridden_symbol_name = f"{original_item.package}.{symbol_name}" + if overridden_symbol_name in self.doc_symbols: # If there's still a conflict, add the symbol's group in the middle. - overridden_symbol = f"{original_item.package}.{original_item.group}.{symbol_name}" + overridden_symbol_name = f"{original_item.package}.{original_item.group}.{symbol_name}" - self.doc_symbols[overridden_symbol] = original_item - self.renamed_symbols[symbol_name].append(overridden_symbol) + self.doc_symbols[overridden_symbol_name] = original_item + self.renamed_symbols[symbol_name].append(overridden_symbol_name) # If we can't specially handle the symbol through its group or package, # fall back to prepending its package name to the front. else: - new_symbol = f"{package_name}.{symbol_name}" - if new_symbol in self.doc_symbols: + new_symbol_name = f"{package_name}.{symbol_name}" + if new_symbol_name in self.doc_symbols: # If there's still a conflict, add the symbol's group in the middle. 
- new_symbol = f"{package_name}.{group_name}.{symbol_name}" - self.renamed_symbols[symbol_name].append(new_symbol) - return new_symbol + new_symbol_name = f"{package_name}.{group_name}.{symbol_name}" + self.renamed_symbols[symbol_name].append(new_symbol_name) + return new_symbol_name async def refresh_inventory(self) -> None: """Refresh internal documentation inventory.""" @@ -229,7 +229,7 @@ class DocCog(commands.Cog): log.debug("Finished inventory refresh.") self.refresh_event.set() - async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: + async def get_symbol_embed(self, symbol_name: str) -> Optional[discord.Embed]: """ Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. @@ -237,47 +237,47 @@ class DocCog(commands.Cog): First check the DocRedisCache before querying the cog's `BatchParser`. """ - log.trace(f"Building embed for symbol `{symbol}`") + log.trace(f"Building embed for symbol `{symbol_name}`") if not self.refresh_event.is_set(): log.debug("Waiting for inventories to be refreshed before processing item.") await self.refresh_event.wait() - symbol_info = self.doc_symbols.get(symbol) - if symbol_info is None: - if symbol.count(" "): + doc_item = self.doc_symbols.get(symbol_name) + if doc_item is None: + if symbol_name.count(" "): # If an invalid symbol contains a space, check if the command was invoked # in the format !d - symbol = symbol.split(" ", maxsplit=1)[0] - symbol_info = self.doc_symbols.get(symbol) - if symbol_info is None: + symbol_name = symbol_name.split(" ", maxsplit=1)[0] + doc_item = self.doc_symbols.get(symbol_name) + if doc_item is None: log.debug("Symbol does not exist.") return None else: log.debug("Symbol does not exist.") return None - self.bot.stats.incr(f"doc_fetches.{symbol_info.package}") + self.bot.stats.incr(f"doc_fetches.{doc_item.package}") with self.symbol_get_event: - markdown = await doc_cache.get(symbol_info) + markdown = await doc_cache.get(doc_item) if 
markdown is None: - log.debug(f"Redis cache miss for symbol `{symbol}`.") - markdown = await self.item_fetcher.get_markdown(symbol_info) + log.debug(f"Redis cache miss for symbol `{symbol_name}`.") + markdown = await self.item_fetcher.get_markdown(doc_item) if markdown is not None: - await doc_cache.set(symbol_info, markdown) + await doc_cache.set(doc_item, markdown) else: markdown = "Unable to parse the requested symbol." embed = discord.Embed( - title=discord.utils.escape_markdown(symbol), - url=f"{symbol_info.url}#{symbol_info.symbol_id}", + title=discord.utils.escape_markdown(symbol_name), + url=f"{doc_item.url}#{doc_item.symbol_id}", description=markdown ) # Show all symbols with the same name that were renamed in the footer, # with a max of 100 chars. - if symbol in self.renamed_symbols: - renamed_symbols = ', '.join(self.renamed_symbols[symbol]) + if symbol_name in self.renamed_symbols: + renamed_symbols = ', '.join(self.renamed_symbols[symbol_name]) footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') else: footer_text = "" @@ -285,12 +285,12 @@ class DocCog(commands.Cog): return embed @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + async def docs_group(self, ctx: commands.Context, *, symbol_name: Optional[str]) -> None: """Look up documentation for Python symbols.""" - await self.get_command(ctx, symbol=symbol) + await self.get_command(ctx, symbol_name=symbol_name) @docs_group.command(name='getdoc', aliases=('g',)) - async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + async def get_command(self, ctx: commands.Context, *, symbol_name: Optional[str]) -> None: """ Return a documentation embed for a given symbol. 
@@ -302,7 +302,7 @@ class DocCog(commands.Cog): !docs aiohttp.ClientSession !docs getdoc aiohttp.ClientSession """ - if not symbol: + if not symbol_name: inventory_embed = discord.Embed( title=f"All inventories (`{len(self.base_urls)}` total)", colour=discord.Colour.blue() @@ -317,7 +317,7 @@ class DocCog(commands.Cog): await ctx.send(embed=inventory_embed) else: - symbol = symbol.strip("`") + symbol = symbol_name.strip("`") # Fetching documentation for a symbol (at least for the first time, since # caching is used) takes quite some time, so let's send typing to indicate # that we got the command, but are still working on it. diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 0251b0105..8e1b4d7a1 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -232,7 +232,7 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ - Return parsed markdown of the passed symbol using the passed in soup, truncated to fit within a discord message. + Return parsed markdown of the passed item using the passed in soup, truncated to fit within a discord message. The method of parsing and what information gets included depends on the symbol's group. 
""" -- cgit v1.2.3 From a09886d6356be9ea5a98a7deea0cebf31e510095 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:05:27 +0100 Subject: Remove url lock The items are added to the futures dict before a context switch can occur, making the subsequent requests to the url skip the queue extend and suspend at the future await --- bot/exts/info/doc/_batch_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 2edf05ff0..c16cf6d28 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -14,8 +14,7 @@ from bs4 import BeautifulSoup import bot from bot.constants import Channels -from bot.utils.lock import lock_arg -from . import NAMESPACE, _cog, doc_cache +from . import _cog, doc_cache from ._parsing import get_symbol_markdown log = logging.getLogger(__name__) @@ -98,7 +97,6 @@ class BatchParser: self.stale_inventory_notifier = StaleInventoryNotifier() - @lock_arg(NAMESPACE, "doc_item", attrgetter("url"), wait=True) async def get_markdown(self, doc_item: _cog.DocItem) -> str: """ Get the result Markdown of `doc_item`. 
-- cgit v1.2.3 From ef5a98595ec647198f3d06375d2c1d4a5a54bf02 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:07:13 +0100 Subject: Move BeautifulSoup parsing into an executor --- bot/exts/info/doc/_batch_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index c16cf6d28..13e197587 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -111,7 +111,10 @@ class BatchParser: self._item_futures[doc_item].user_requested = True async with bot.instance.http_session.get(doc_item.url) as response: - soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") + soup = await bot.instance.loop.run_in_executor( + None, + partial(BeautifulSoup, await response.text(encoding="utf8"), "lxml") + ) self._queue.extend(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) log.debug(f"Added items from {doc_item.url} to parse queue.") -- cgit v1.2.3 From c9039b1d012172e7ef3f0ea030420a58db1cbd2d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:08:33 +0100 Subject: Create a task for setting the redis result instead of awaiting The queue parsing doesn't depend on anything with redis, so the await only delays the result being set on the future. 
--- bot/exts/info/doc/_batch_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 13e197587..2407a603a 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -149,7 +149,7 @@ class BatchParser: partial(get_symbol_markdown, soup, item), ) if markdown is not None: - await doc_cache.set(item, markdown) + asyncio.create_task(doc_cache.set(item, markdown)) else: asyncio.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception as e: -- cgit v1.2.3 From b1c8e62a7e8ae600a672c4ad3e33b607c8570890 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:25:51 +0100 Subject: Use create_task util --- bot/exts/info/doc/_batch_parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 2407a603a..d18a455d8 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -14,6 +14,7 @@ from bs4 import BeautifulSoup import bot from bot.constants import Channels +from bot.utils import scheduling from . 
import _cog, doc_cache from ._parsing import get_symbol_markdown @@ -149,9 +150,9 @@ class BatchParser: partial(get_symbol_markdown, soup, item), ) if markdown is not None: - asyncio.create_task(doc_cache.set(item, markdown)) + scheduling.create_task(doc_cache.set(item, markdown)) else: - asyncio.create_task(self.stale_inventory_notifier.send_warning(item)) + scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception as e: log.exception(f"Unexpected error when handling {item}") future.set_exception(e) -- cgit v1.2.3 From 9d755707178f2c53bea209c42ab4e3154b0a6a60 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Feb 2021 12:42:56 +0100 Subject: Avoid from import on _batch_parser The tests import the modules the other way around causing a circular import --- bot/exts/info/doc/_cog.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0ff775ac7..e2204bd4a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -19,8 +19,7 @@ from bot.pagination import LinePaginator from bot.utils.lock import SharedEvent, lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler -from . import NAMESPACE, PRIORITY_PACKAGES, doc_cache -from ._batch_parser import BatchParser +from . 
import NAMESPACE, PRIORITY_PACKAGES, _batch_parser, doc_cache from ._inventory_parser import InventoryDict, fetch_inventory log = logging.getLogger(__name__) @@ -62,7 +61,7 @@ class DocCog(commands.Cog): self.base_urls = {} self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} - self.item_fetcher = BatchParser() + self.item_fetcher = _batch_parser.BatchParser() self.renamed_symbols = defaultdict(list) self.inventory_scheduler = Scheduler(self.__class__.__name__) -- cgit v1.2.3 From 49527d94dd792ee3ac81d6f3ee309fcd4f2c63ad Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Feb 2021 21:51:34 +0100 Subject: Remove unnecessary use of partial run_in_executor can provide args to the func it's passed in, making the use of partial unnecessary. This will also make it more convenient to move to asyncio.to_thread when the codebase is switched to python 3.9 --- bot/exts/info/doc/_batch_parser.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index d18a455d8..b3f72bb89 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -5,7 +5,6 @@ import logging import time from collections import defaultdict from contextlib import suppress -from functools import partial from operator import attrgetter from typing import Dict, List, NamedTuple, Union @@ -114,7 +113,9 @@ class BatchParser: async with bot.instance.http_session.get(doc_item.url) as response: soup = await bot.instance.loop.run_in_executor( None, - partial(BeautifulSoup, await response.text(encoding="utf8"), "lxml") + BeautifulSoup, + await response.text(encoding="utf8"), + "lxml", ) self._queue.extend(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) @@ -145,10 +146,7 @@ class BatchParser: # if we already parsed an equal item, we can just skip it. 
continue - markdown = await bot.instance.loop.run_in_executor( - None, - partial(get_symbol_markdown, soup, item), - ) + markdown = await bot.instance.loop.run_in_executor(None, get_symbol_markdown, soup, item) if markdown is not None: scheduling.create_task(doc_cache.set(item, markdown)) else: -- cgit v1.2.3 From 1d72334d3dcff4d82dde3e9ca5a1edc0989114f2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 16 Feb 2021 02:06:49 +0100 Subject: Fix docstring typos --- bot/exts/info/doc/_batch_parser.py | 4 ++-- bot/exts/info/doc/_cog.py | 2 +- bot/exts/info/doc/_html.py | 2 +- bot/exts/info/doc/_parsing.py | 12 ++++++------ 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index b3f72bb89..b140843b6 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -33,7 +33,7 @@ class StaleInventoryNotifier: self._dev_log = bot.instance.get_channel(Channels.dev_log) async def send_warning(self, doc_item: _cog.DocItem) -> None: - """Send a warning to dev log is one wasn't already sent for `item`'s url.""" + """Send a warning to dev log if one wasn't already sent for `item`'s url.""" if doc_item.url not in self._warned_urls: self._warned_urls.add(doc_item.url) await self._init_task @@ -132,7 +132,7 @@ class BatchParser: async def _parse_queue(self) -> None: """ - Parse all item from the queue, setting their result markdown on the futures and sending them to redis. + Parse all items from the queue, setting their result Markdown on the futures and sending them to redis. The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. 
""" diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index e2204bd4a..b6b9b2171 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -126,7 +126,7 @@ class DocCog(commands.Cog): inventory_url: str, ) -> None: """ - Update the cog's inventory, or reschedule this method to execute again if the remote inventory unreachable. + Update the cog's inventory, or reschedule this method to execute again if the remote inventory is unreachable. The first attempt is rescheduled to execute in `FETCH_RESCHEDULE_DELAY.first` minutes, the subsequent attempts in `FETCH_RESCHEDULE_DELAY.repeated` minutes. diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index f9fe542ce..2884a3cf1 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -100,7 +100,7 @@ def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableStri """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, + A headerlink tag is attempted to be found to skip repeating the symbol information in the description, if it's found it's used as the tag to start the search from instead of the `start_element`. """ child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 8e1b4d7a1..3350aac0a 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -106,7 +106,7 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: """ - Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. + Truncate passed signatures to not exceed `_MAX_SIGNATURES_LENGTH`. 
If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. Individual signatures can consist of max 1, 2, ..., `_MAX_SIGNATURE_AMOUNT` lines of text, @@ -149,10 +149,10 @@ def _get_truncated_description( max_lines: int, ) -> str: """ - Truncate markdown from `elements` to be at most `max_length` characters when rendered or `max_lines` newlines. + Truncate the Markdown from `elements` to be at most `max_length` characters when rendered or `max_lines` newlines. `max_length` limits the length of the rendered characters in the string, - with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits. """ result = "" markdown_element_ends = [] @@ -209,10 +209,10 @@ def _get_truncated_description( def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: """ - Create a markdown string with the signatures at the top, and the converted html description below them. + Create a Markdown string with the signatures at the top, and the converted html description below them. The signatures are wrapped in python codeblocks, separated from the description by a newline. - The result markdown string is max 750 rendered characters for the description with signatures at the start. + The result Markdown string is max 750 rendered characters for the description with signatures at the start. """ description = _get_truncated_description( description, @@ -232,7 +232,7 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ - Return parsed markdown of the passed item using the passed in soup, truncated to fit within a discord message. + Return parsed Markdown of the passed item using the passed in soup, truncated to fit within a discord message. 
The method of parsing and what information gets included depends on the symbol's group. """ -- cgit v1.2.3 From e607600cdd1084566319d4283bd747d772627121 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 19 Feb 2021 00:01:55 +0100 Subject: Simplify the _split_parameters implementation The main simplification was getting rid of the unnecessary tracking of string depth: we can just always skip over strings, as was already being done for strings inside of brackets. The branching was also simplified so that fewer unnecessary checks are done, with less confusing elifs. --- bot/exts/info/doc/_parsing.py | 48 ++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 3350aac0a..280a0c8f2 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -63,43 +63,31 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: last_split = 0 depth = 0 current_search: Optional[BracketPair] = None - quote_character = None enumerated_string = enumerate(parameters_string) for index, character in enumerated_string: - if quote_character is None and character in _BRACKET_PAIRS: - if current_search is None: - current_search = _BRACKET_PAIRS[character] - depth = 1 - elif character == current_search.opening_bracket: - depth += 1 + if character in {"'", '"'}: + # Skip everything inside of strings, regardless of the depth.
+ quote_character = character + for index, character in enumerated_string: + if character == quote_character and _is_closing_quote(parameters_string, index): + break - elif character in {"'", '"'}: - if current_search is not None: - # We're currently searching for a bracket, skip all characters that belong to the string - # to avoid false positives of closing brackets - quote_character = character - for index, character in enumerated_string: - if character == quote_character and _is_closing_quote(parameters_string, index): - break + elif current_search is None: + if (current_search := _BRACKET_PAIRS.get(character)) is not None: + depth = 1 + elif character == ",": + yield parameters_string[last_split:index] + last_split = index + 1 - elif depth == 0: + else: + if character == current_search.opening_bracket: depth += 1 - quote_character = character - elif character == quote_character: - if _is_closing_quote(parameters_string, index): - depth -= 1 - if depth == 0: - quote_character = None - elif current_search is not None and character == current_search.closing_bracket: - depth -= 1 - if depth == 0: - current_search = None - - elif depth == 0 and character == ",": - yield parameters_string[last_split:index] - last_split = index + 1 + elif character == current_search.closing_bracket: + depth -= 1 + if depth == 0: + current_search = None yield parameters_string[last_split:] -- cgit v1.2.3 From 6badbd4cb85bd688363ed5d57e174b43b4788f66 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 19 Feb 2021 14:39:34 +0100 Subject: Simplify condition --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 280a0c8f2..173051650 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -101,7 +101,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec inversely proportional to 
the amount of signatures. A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. """ - if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: + if sum(len(signature) for signature in signatures) <= _MAX_SIGNATURES_LENGTH: return signatures max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) -- cgit v1.2.3 From 88e951f94aa0c1780e54798b5f3af72b75502ef4 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 20 Feb 2021 01:21:42 +0100 Subject: Use "inventories" when referring to the cog's collection of sphinx invs Previously, in some contexts "inventory" referred both to a single inventory that we got from a remote objects.inv and to the internal cog inventories. The cog's inventories are now always referred to in the plural. The update_single docstring was also changed from "rebuild" to "build", as the method doesn't handle a preexisting inventory with the same symbols already being in the cog's inventories. --- bot/exts/info/doc/_cog.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index b6b9b2171..ed67abb79 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -76,11 +76,11 @@ class DocCog(commands.Cog): async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" await self.bot.wait_until_guild_available() - await self.refresh_inventory() + await self.refresh_inventories() def update_single(self, api_package_name: str, base_url: str, package: InventoryDict) -> None: """ - Rebuild the inventory for a single package. + Build the inventory for a single package.
Where: * `package_name` is the package name to use, appears in the log @@ -126,7 +126,7 @@ class DocCog(commands.Cog): inventory_url: str, ) -> None: """ - Update the cog's inventory, or reschedule this method to execute again if the remote inventory is unreachable. + Update the cog's inventories, or reschedule this method to execute again if the remote inventory is unreachable. The first attempt is rescheduled to execute in `FETCH_RESCHEDULE_DELAY.first` minutes, the subsequent attempts in `FETCH_RESCHEDULE_DELAY.repeated` minutes. @@ -202,8 +202,8 @@ class DocCog(commands.Cog): self.renamed_symbols[symbol_name].append(new_symbol_name) return new_symbol_name - async def refresh_inventory(self) -> None: - """Refresh internal documentation inventory.""" + async def refresh_inventories(self) -> None: + """Refresh internal documentation inventories.""" self.refresh_event.clear() await self.symbol_get_event.wait() log.debug("Refreshing documentation inventory...") @@ -369,7 +369,7 @@ class DocCog(commands.Cog): ) self.update_single(package_name, base_url, inventory_dict) - await ctx.send(f"Added the package `{package_name}` to the database and refreshed the inventory.") + await ctx.send(f"Added the package `{package_name}` to the database and updated the inventories.") @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) @commands.has_any_role(*MODERATION_ROLES) @@ -386,9 +386,9 @@ class DocCog(commands.Cog): async with ctx.typing(): # Rebuild the inventory to ensure that everything # that was from this package is properly deleted. 
- await self.refresh_inventory() + await self.refresh_inventories() await doc_cache.delete(package_name) - await ctx.send(f"Successfully deleted `{package_name}` and refreshed the inventory.") + await ctx.send(f"Successfully deleted `{package_name}` and refreshed the inventories.") @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) @commands.has_any_role(*MODERATION_ROLES) @@ -397,7 +397,7 @@ class DocCog(commands.Cog): """Refresh inventories and show the difference.""" old_inventories = set(self.base_urls) with ctx.typing(): - await self.refresh_inventory() + await self.refresh_inventories() new_inventories = set(self.base_urls) if added := ", ".join(new_inventories - old_inventories): -- cgit v1.2.3 From e7d7f958b60045a447d5460e740a703654450a0c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 20 Feb 2021 01:23:41 +0100 Subject: Remove unnecessary comments The comments explain things that should be clear, or basic concepts --- bot/exts/info/doc/_cog.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ed67abb79..eb3de9d46 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -209,16 +209,11 @@ class DocCog(commands.Cog): log.debug("Refreshing documentation inventory...") self.inventory_scheduler.cancel_all() - # Clear the old base URLS and doc symbols to ensure - # that we start from a fresh local dataset. - # Also, reset the cache used for fetching documentation. self.base_urls.clear() self.doc_symbols.clear() self.renamed_symbols.clear() await self.item_fetcher.clear() - # Run all coroutines concurrently - since each of them performs an HTTP - # request, this speeds up fetching the inventory data heavily. 
coros = [ self.update_or_reschedule_inventory( package["package"], package["base_url"], package["inventory_url"] @@ -317,9 +312,6 @@ class DocCog(commands.Cog): else: symbol = symbol_name.strip("`") - # Fetching documentation for a symbol (at least for the first time, since - # caching is used) takes quite some time, so let's send typing to indicate - # that we got the command, but are still working on it. async with ctx.typing(): doc_embed = await self.get_symbol_embed(symbol) @@ -384,8 +376,6 @@ class DocCog(commands.Cog): await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') async with ctx.typing(): - # Rebuild the inventory to ensure that everything - # that was from this package is properly deleted. await self.refresh_inventories() await doc_cache.delete(package_name) await ctx.send(f"Successfully deleted `{package_name}` and refreshed the inventories.") -- cgit v1.2.3 From 30a39e9bed8ea9c50a6851504dead1d9c8ed7539 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 20 Feb 2021 01:27:07 +0100 Subject: cleanup->clean_up --- bot/exts/info/doc/_batch_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index b140843b6..9956878cf 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -93,7 +93,7 @@ class BatchParser: self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = {} self._parse_task = None - self.cleanup_futures_task = bot.instance.loop.create_task(self._cleanup_futures()) + self.cleanup_futures_task = bot.instance.loop.create_task(self._clean_up_futures()) self.stale_inventory_notifier = StaleInventoryNotifier() @@ -189,7 +189,7 @@ class BatchParser: self._page_doc_items.clear() self._item_futures.clear() - async def _cleanup_futures(self) -> None: + async def _clean_up_futures(self) -> None: """ Clear old futures from internal results. 
-- cgit v1.2.3 From 181aa5732b28181800cb663d52be42fd70c3226c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 20 Feb 2021 01:36:09 +0100 Subject: Add deletedoccache alias to the clear cache command --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index eb3de9d46..1e498237a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -402,7 +402,7 @@ class DocCog(commands.Cog): ) await ctx.send(embed=embed) - @docs_group.command(name="cleardoccache") + @docs_group.command(name="cleardoccache", aliases=("deletedoccache",)) @commands.has_any_role(*MODERATION_ROLES) async def clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: """Clear the persistent redis cache for `package`.""" -- cgit v1.2.3 From 70912a98bf29bcd8cc9052adb587380533ab5102 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Feb 2021 01:56:22 +0100 Subject: Add lt and gt angle brackets to bracket pairs --- bot/exts/info/doc/_parsing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 173051650..f6e25937e 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -42,6 +42,7 @@ _BRACKET_PAIRS = { "{": BracketPair("{", "}"), "(": BracketPair("(", ")"), "[": BracketPair("[", "]"), + "<": BracketPair("<", ">"), } -- cgit v1.2.3 From 8f6d11a7694d6dea50d94d7918f686834283c858 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Feb 2021 02:54:09 +0100 Subject: Add unittests for _split_signature --- tests/bot/exts/info/doc/__init__.py | 0 tests/bot/exts/info/doc/test_parsing.py | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tests/bot/exts/info/doc/__init__.py create mode 100644 
tests/bot/exts/info/doc/test_parsing.py diff --git a/tests/bot/exts/info/doc/__init__.py b/tests/bot/exts/info/doc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/bot/exts/info/doc/test_parsing.py b/tests/bot/exts/info/doc/test_parsing.py new file mode 100644 index 000000000..f302b38fc --- /dev/null +++ b/tests/bot/exts/info/doc/test_parsing.py @@ -0,0 +1,59 @@ +from unittest import TestCase + +from bot.exts.info.doc import _parsing as parsing + + +class SignatureSplitter(TestCase): + + def test_basic_split(self): + test_cases = ( + ("0,0,0", ["0", "0", "0"]), + ("0,a=0,a=0", ["0", "a=0", "a=0"]), + ) + self._run_tests(test_cases) + + def test_commas_ignored_in_brackets(self): + test_cases = ( + ("0,[0,0],0,[0,0],0", ["0", "[0,0]", "0", "[0,0]", "0"]), + ("(0,),0,(0,(0,),0),0", ["(0,)", "0", "(0,(0,),0)", "0"]), + ) + self._run_tests(test_cases) + + def test_mixed_brackets(self): + tests_cases = ( + ("[0,{0},0],0,{0:0},0", ["[0,{0},0]", "0", "{0:0}", "0"]), + ("([0],0,0),0,(0,0),0", ["([0],0,0)", "0", "(0,0)", "0"]), + ("([(0,),(0,)],0),0", ["([(0,),(0,)],0)", "0"]), + ) + self._run_tests(tests_cases) + + def test_string_contents_ignored(self): + test_cases = ( + ("'0,0',0,',',0", ["'0,0'", "0", "','", "0"]), + ("0,[']',0],0", ["0", "[']',0]", "0"]), + ("{0,0,'}}',0,'{'},0", ["{0,0,'}}',0,'{'}", "0"]), + ) + self._run_tests(test_cases) + + def test_mixed_quotes(self): + test_cases = ( + ("\"0',0',\",'0,0',0", ["\"0',0',\"", "'0,0'", "0"]), + ("\",',\",'\",',0", ["\",',\"", "'\",'", "0"]), + ) + self._run_tests(test_cases) + + def test_real_signatures(self): + test_cases = ( + ("start, stop[, step]", ["start", " stop[, step]"]), + ("object=b'', encoding='utf-8', errors='strict'", ["object=b''", " encoding='utf-8'", " errors='strict'"]), + ( + "typename, field_names, *, rename=False, defaults=None, module=None", + ["typename", " field_names", " *", " rename=False", " defaults=None", " module=None"] + ), + ) + self._run_tests(test_cases) 
+ + def _run_tests(self, test_cases): + for input_string, expected_output in test_cases: + with self.subTest(input_string=input_string): + self.assertEqual(list(parsing._split_parameters(input_string)), expected_output) -- cgit v1.2.3 From cec78fd1eb3b8da00fe8d2c5057fbbb417ac0255 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Feb 2021 03:04:32 +0100 Subject: Correct length adjustment the placeholder contains a space and is actually 4 chars because of that with a comma that adds up to 5 characters in the signature instead of 4 --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index f6e25937e..c26af8ac3 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -118,7 +118,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec parameters_string = parameters_match[1] running_length = len(signature) - len(parameters_string) for parameter in _split_parameters(parameters_string): - if (len(parameter) + running_length) <= max_signature_length - 4: # account for comma and placeholder + if (len(parameter) + running_length) <= max_signature_length - 5: # account for comma and placeholder truncated_signature.append(parameter) running_length += len(parameter) + 1 else: -- cgit v1.2.3 From 95f6dd89a528be327e9c52b47948e3d8138590ed Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Feb 2021 03:17:57 +0100 Subject: Remove redundant group check As we check for non dt symbol names to be sure in case something pops up , we can skip the initial group check as all the symbols from those groups should point to non dt tags. 
--- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index c26af8ac3..f9b4f9d8a 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -231,7 +231,7 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. - if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt": + if symbol_heading.name != "dt": description = get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: -- cgit v1.2.3 From d620a38bd03b7452474e12dc8a8531a868e7055d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 23 Feb 2021 03:31:39 +0100 Subject: Update docstrings --- bot/exts/info/doc/_cog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 1e498237a..0b0611cbc 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -86,7 +86,7 @@ class DocCog(commands.Cog): * `package_name` is the package name to use, appears in the log * `base_url` is the root documentation URL for the specified package, used to build absolute paths that link to specific symbols - * `inventory_url` is the absolute URL to the intersphinx inventory. + * `package` are the InventoryDict contents of a intersphinx inventory. """ self.base_urls[api_package_name] = base_url @@ -225,7 +225,7 @@ class DocCog(commands.Cog): async def get_symbol_embed(self, symbol_name: str) -> Optional[discord.Embed]: """ - Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. 
+ Attempt to scrape and fetch the data for the given `symbol_name`, and build an embed from its contents. If the symbol is known, an Embed with documentation about it is returned. -- cgit v1.2.3 From cbd6054956cf3bd0646e0577f510faebf077cfe5 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 24 Feb 2021 14:23:59 +0100 Subject: Allow wildcard match for clear cache command --- bot/exts/info/doc/_cog.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0b0611cbc..09ce04dd4 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -7,14 +7,14 @@ import textwrap from collections import defaultdict from contextlib import suppress from types import SimpleNamespace -from typing import Dict, NamedTuple, Optional +from typing import Dict, NamedTuple, Optional, Union import discord from discord.ext import commands from bot.bot import Bot from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import Inventory, PackageName, ValidURL +from bot.converters import Inventory, PackageName, ValidURL, allowed_strings from bot.pagination import LinePaginator from bot.utils.lock import SharedEvent, lock from bot.utils.messages import send_denial, wait_for_deletion @@ -404,7 +404,11 @@ class DocCog(commands.Cog): @docs_group.command(name="cleardoccache", aliases=("deletedoccache",)) @commands.has_any_role(*MODERATION_ROLES) - async def clear_cache_command(self, ctx: commands.Context, package_name: PackageName) -> None: + async def clear_cache_command( + self, + ctx: commands.Context, + package_name: Union[PackageName, allowed_strings("*")] # noqa: F722 + ) -> None: """Clear the persistent redis cache for `package`.""" if await doc_cache.delete(package_name): await ctx.send(f"Successfully cleared the cache for `{package_name}`.") -- cgit v1.2.3 From d65d130f1efb2dc5b4a72f025cd7abb3371bd663 Mon Sep 17 00:00:00 2001 From: 
Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 24 Feb 2021 14:27:08 +0100 Subject: Use 4 spaces for hanging indent --- bot/exts/info/doc/_cog.py | 18 +++++++++--------- bot/exts/info/doc/_html.py | 12 ++++++------ bot/exts/info/doc/_parsing.py | 8 ++++---- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 09ce04dd4..95a772df3 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -120,10 +120,10 @@ class DocCog(commands.Cog): log.trace(f"Fetched inventory for {api_package_name}.") async def update_or_reschedule_inventory( - self, - api_package_name: str, - base_url: str, - inventory_url: str, + self, + api_package_name: str, + base_url: str, + inventory_url: str, ) -> None: """ Update the cog's inventories, or reschedule this method to execute again if the remote inventory is unreachable. @@ -149,11 +149,11 @@ class DocCog(commands.Cog): self.update_single(api_package_name, base_url, package) def ensure_unique_symbol_name( - self, - package_name: str, - group_name: str, - original_item: DocItem, - symbol_name: str, + self, + package_name: str, + group_name: str, + original_item: DocItem, + symbol_name: str, ) -> Optional[str]: """ Ensure `symbol_name` doesn't overwrite an another symbol in `doc_symbols`. 
diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index 2884a3cf1..701684b88 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -46,12 +46,12 @@ class Strainer(SoupStrainer): def _find_elements_until_tag( - start_element: PageElement, - end_tag_filter: Union[Container[str], Callable[[Tag], bool]], - *, - func: Callable, - include_strings: bool = False, - limit: int = None, + start_element: PageElement, + end_tag_filter: Union[Container[str], Callable[[Tag], bool]], + *, + func: Callable, + include_strings: bool = False, + limit: int = None, ) -> List[Union[Tag, NavigableString]]: """ Get all elements up to `limit` or until a tag matching `tag_filter` is found. diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index f9b4f9d8a..6b2d31cdd 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -132,10 +132,10 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec def _get_truncated_description( - elements: Iterable[Union[Tag, NavigableString]], - markdown_converter: DocMarkdownConverter, - max_length: int, - max_lines: int, + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, + max_lines: int, ) -> str: """ Truncate the Markdown from `elements` to be at most `max_length` characters when rendered or `max_lines` newlines. 
-- cgit v1.2.3 From 41cfe3f805e53c43ec18585d203e0b80ed59afda Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 27 Feb 2021 02:52:05 +0100 Subject: Get the last index instead of using max The last index will always be the largest one so there's no need for max to search for it --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 6b2d31cdd..b422b4f24 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -190,7 +190,7 @@ def _get_truncated_description( truncated_result = textwrap.shorten(result, truncate_index) else: # Truncate at the last Markdown element that comes before the truncation index. - markdown_truncate_index = max(possible_truncation_indices) + markdown_truncate_index = possible_truncation_indices[-1] truncated_result = result[:markdown_truncate_index] return truncated_result.strip(_TRUNCATE_STRIP_CHARACTERS) + "..." -- cgit v1.2.3 From a95cbc501a813b18d4e11bacefef0d447578e6fe Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 27 Feb 2021 11:07:47 +0100 Subject: Add digits to package converter --- bot/converters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index 2b383636c..be1f1329f 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -134,13 +134,13 @@ class PackageName(Converter): Package names are used for stats and are restricted to the a-z and _ characters. 
""" - PACKAGE_NAME_RE = re.compile(r"[^a-z_]") + PACKAGE_NAME_RE = re.compile(r"[^a-z0-9_]") @classmethod async def convert(cls, ctx: Context, argument: str) -> str: """Checks whether the given string is a valid package name.""" if cls.PACKAGE_NAME_RE.search(argument): - raise BadArgument("The provided package name is not valid; please only use the _ and a-z characters.") + raise BadArgument("The provided package name is not valid; please only use the _, 0-9 and a-z characters.") return argument -- cgit v1.2.3 From 02c0d1535b46922096d53967e2938bbb3a56ef82 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 28 Feb 2021 00:40:52 +0100 Subject: Add new symbols to front of queue instead of extending the end Using extend caused old items, including the ones that were requested by users and pushed to the front, to be pushed back by all of the items on the new page, possibly significantly delaying their parsing --- bot/exts/info/doc/_batch_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 9956878cf..780fb16d9 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -118,7 +118,7 @@ class BatchParser: "lxml", ) - self._queue.extend(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) + self._queue[:0] = (QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) log.debug(f"Added items from {doc_item.url} to parse queue.") if self._parse_task is None: -- cgit v1.2.3 From 85b1d7751c3cf46c007a4194d984a1921684456b Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 3 Mar 2021 14:21:24 +0100 Subject: Use common check for early exit This introduces a possibly redundant check for the doc_item being None but results in flatter code with less duplication --- bot/exts/info/doc/_cog.py | 19 ++++++++----------- 1 file changed, 8 
insertions(+), 11 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 95a772df3..0c255c449 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -237,18 +237,15 @@ class DocCog(commands.Cog): await self.refresh_event.wait() doc_item = self.doc_symbols.get(symbol_name) + if doc_item is None and " " in symbol_name: + # If an invalid symbol contains a space, check if the command was invoked + # in the format !d + symbol_name = symbol_name.split(" ", maxsplit=1)[0] + doc_item = self.doc_symbols.get(symbol_name) + if doc_item is None: - if symbol_name.count(" "): - # If an invalid symbol contains a space, check if the command was invoked - # in the format !d - symbol_name = symbol_name.split(" ", maxsplit=1)[0] - doc_item = self.doc_symbols.get(symbol_name) - if doc_item is None: - log.debug("Symbol does not exist.") - return None - else: - log.debug("Symbol does not exist.") - return None + log.debug("Symbol does not exist.") + return None self.bot.stats.incr(f"doc_fetches.{doc_item.package}") -- cgit v1.2.3 From d9d637930e486716f5143bcb1b64bc309e3c55eb Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 4 Mar 2021 19:07:13 +0100 Subject: Use deque instead of a list As we extend the left side of a list now, using a deque that provides a direct method for it is more fitting. fixup! 
Use deque instead of a list --- bot/exts/info/doc/_batch_parser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 780fb16d9..95538f364 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -1,12 +1,13 @@ from __future__ import annotations import asyncio +import collections import logging import time from collections import defaultdict from contextlib import suppress from operator import attrgetter -from typing import Dict, List, NamedTuple, Union +from typing import Deque, Dict, List, NamedTuple, Union import discord from bs4 import BeautifulSoup @@ -88,7 +89,7 @@ class BatchParser: """ def __init__(self): - self._queue: List[QueueItem] = [] + self._queue: Deque[QueueItem] = collections.deque() self._page_doc_items: Dict[str, List[_cog.DocItem]] = defaultdict(list) self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = {} self._parse_task = None @@ -118,7 +119,7 @@ class BatchParser: "lxml", ) - self._queue[:0] = (QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) + self._queue.extendleft(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) log.debug(f"Added items from {doc_item.url} to parse queue.") if self._parse_task is None: @@ -126,7 +127,7 @@ class BatchParser: else: self._item_futures[doc_item].user_requested = True with suppress(ValueError): - # If the item is not in the list then the item is already parsed or is being parsed + # If the item is not in the queue then the item is already parsed or is being parsed self._move_to_front(doc_item) return await self._item_futures[doc_item] @@ -166,7 +167,8 @@ class BatchParser: # The parse queue stores soups along with the doc symbols in QueueItem objects, # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. 
item_index = self._queue.index(item) - queue_item = self._queue.pop(item_index) + queue_item = self._queue[item_index] + del self._queue[item_index] self._queue.append(queue_item) log.trace(f"Moved {item} to the front of the queue.") -- cgit v1.2.3 From bd8323501712ed0fc313c502f3e0bd567c111328 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 4 Mar 2021 21:42:28 +0100 Subject: Move the seconds of a week timedelta into a constant --- bot/exts/info/doc/_redis_cache.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index cab51c3f1..7de2f3806 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -7,6 +7,8 @@ from async_rediscache.types.base import RedisObject, namespace_lock if TYPE_CHECKING: from ._cog import DocItem +WEEK_SECONDS = datetime.timedelta(weeks=1).total_seconds() + class DocRedisCache(RedisObject): """Interface for redis functionality needed by the Doc cog.""" @@ -33,7 +35,7 @@ class DocRedisCache(RedisObject): await connection.hset(redis_key, item.symbol_id, value) if needs_expire: - await connection.expire(redis_key, datetime.timedelta(weeks=1).total_seconds()) + await connection.expire(redis_key, WEEK_SECONDS) @namespace_lock async def get(self, item: DocItem) -> Optional[str]: -- cgit v1.2.3 From f993a11c0461e57e853dfc0e296fc32dcfc2b265 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 4 Mar 2021 21:49:38 +0100 Subject: Typo and docstring style changes Co-authored-by: MarkKoz fixup! 
Docstring typos and style changes --- bot/converters.py | 2 +- bot/exts/info/doc/_batch_parser.py | 6 +++--- bot/exts/info/doc/_cog.py | 4 ++-- bot/exts/info/doc/_html.py | 8 ++++---- bot/utils/lock.py | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bot/converters.py b/bot/converters.py index be1f1329f..4fbf3c124 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -140,7 +140,7 @@ class PackageName(Converter): async def convert(cls, ctx: Context, argument: str) -> str: """Checks whether the given string is a valid package name.""" if cls.PACKAGE_NAME_RE.search(argument): - raise BadArgument("The provided package name is not valid; please only use the _, 0-9 and a-z characters.") + raise BadArgument("The provided package name is not valid; please only use the _, 0-9, and a-z characters.") return argument diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 95538f364..45ca17e5e 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -46,7 +46,7 @@ class StaleInventoryNotifier: class QueueItem(NamedTuple): - """Contains a doc_item and the BeautifulSoup object needed to parse it.""" + """Contains a `DocItem` and the `BeautifulSoup` object needed to parse it.""" doc_item: _cog.DocItem soup: BeautifulSoup @@ -120,7 +120,7 @@ class BatchParser: ) self._queue.extendleft(QueueItem(item, soup) for item in self._page_doc_items[doc_item.url]) - log.debug(f"Added items from {doc_item.url} to parse queue.") + log.debug(f"Added items from {doc_item.url} to the parse queue.") if self._parse_task is None: self._parse_task = asyncio.create_task(self._parse_queue()) @@ -181,7 +181,7 @@ class BatchParser: """ Clear all internal symbol data. - All currently requested items are waited to be parsed before clearing. + Wait for all user-requested symbols to be parsed before clearing the parser. 
""" for future in filter(attrgetter("user_requested"), self._item_futures.values()): await future diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0c255c449..8300f11d1 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -83,10 +83,10 @@ class DocCog(commands.Cog): Build the inventory for a single package. Where: - * `package_name` is the package name to use, appears in the log + * `package_name` is the package name to use in logs and when qualifying symbols * `base_url` is the root documentation URL for the specified package, used to build absolute paths that link to specific symbols - * `package` are the InventoryDict contents of a intersphinx inventory. + * `package` is the content of a intersphinx inventory. """ self.base_urls[api_package_name] = base_url diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index 701684b88..334b82e98 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -54,7 +54,7 @@ def _find_elements_until_tag( limit: int = None, ) -> List[Union[Tag, NavigableString]]: """ - Get all elements up to `limit` or until a tag matching `tag_filter` is found. + Get all elements up to `limit` or until a tag matching `end_tag_filter` is found. `end_tag_filter` can be either a container of string names to check against, or a filtering callable that's applied to tags. 
@@ -86,7 +86,7 @@ _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=Beaut def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: - """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" + """Create callable that returns True when the passed in tag's class is in `class_names` or when it's a table.""" def match_tag(tag: Tag) -> bool: for attr in class_names: if attr in tag.get("class", ()): @@ -100,8 +100,8 @@ def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableStri """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - A headerlink tag is attempted to be found to skip repeating the symbol information in the description, - if it's found it's used as the tag to start the search from instead of the `start_element`. + A headerlink tag is attempted to be found to skip repeating the symbol information in the description. + If it's found it's used as the tag to start the search from instead of the `start_element`. """ child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) diff --git a/bot/utils/lock.py b/bot/utils/lock.py index b4c93f063..ec6f92cd4 100644 --- a/bot/utils/lock.py +++ b/bot/utils/lock.py @@ -23,7 +23,7 @@ class SharedEvent: """ Context manager managing an internal event exposed through the wait coro. - While any code is executing in this context manager, the underyling event will not be set; + While any code is executing in this context manager, the underlying event will not be set; when all of the holders finish the event will be set. 
""" -- cgit v1.2.3 From 8c2aa1de81fc55c9e33312a086b98faf0a8cab47 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 00:08:11 +0100 Subject: Do not set redis results in get_symbol_embed The redis results are already being set in the BatchParser for all symbols --- bot/exts/info/doc/_cog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 8300f11d1..5f9366228 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -255,9 +255,7 @@ class DocCog(commands.Cog): if markdown is None: log.debug(f"Redis cache miss for symbol `{symbol_name}`.") markdown = await self.item_fetcher.get_markdown(doc_item) - if markdown is not None: - await doc_cache.set(doc_item, markdown) - else: + if markdown is None: markdown = "Unable to parse the requested symbol." embed = discord.Embed( -- cgit v1.2.3 From ed750b03efa792205b1e624e49dd318cda9d1312 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 00:28:23 +0100 Subject: Set the result of the future instead of an exception when avaialble --- bot/exts/info/doc/_batch_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index 45ca17e5e..f5e16a60b 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -141,6 +141,7 @@ class BatchParser: try: while self._queue: item, soup = self._queue.pop() + markdown = None try: if (future := self._item_futures[item]).done(): # Some items are present in the inventories multiple times under different symbol names, @@ -154,7 +155,10 @@ class BatchParser: scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception as e: log.exception(f"Unexpected error when handling {item}") - future.set_exception(e) + if markdown is not None: + 
future.set_result(markdown) + else: + future.set_exception(e) else: future.set_result(markdown) await asyncio.sleep(0.1) -- cgit v1.2.3 From fdafa7423596d8a11b5c25a7f6a9ab47ed3ce6b6 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:08:43 +0100 Subject: Do _item_futures cleanup in _parse_queue instead of a concurrent task The doc_cache coro was changed to be awaited directly instead of creating a task to ensure the cache is populated before the item is deleted --- bot/exts/info/doc/_batch_parser.py | 36 ++++-------------------------------- bot/exts/info/doc/_cog.py | 1 - 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index f5e16a60b..d80b62d88 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio import collections import logging -import time from collections import defaultdict from contextlib import suppress from operator import attrgetter @@ -63,20 +62,11 @@ class ParseResultFuture(asyncio.Future): `user_requested` is set by the parser when a Future is requested by an user and moved to the front, allowing the futures to only be waited for when clearing if they were user requested. 
- - `result_set_time` provides the time at which the future's result has been set, - or -inf if the result hasn't been set yet """ def __init__(self): super().__init__() self.user_requested = False - self.result_set_time = float("inf") - - def set_result(self, result: str, /) -> None: - """Set `self.result_set_time` to current time when the result is set.""" - self.result_set_time = time.time() - super().set_result(result) class BatchParser: @@ -91,11 +81,9 @@ class BatchParser: def __init__(self): self._queue: Deque[QueueItem] = collections.deque() self._page_doc_items: Dict[str, List[_cog.DocItem]] = defaultdict(list) - self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = {} + self._item_futures: Dict[_cog.DocItem, ParseResultFuture] = defaultdict(ParseResultFuture) self._parse_task = None - self.cleanup_futures_task = bot.instance.loop.create_task(self._clean_up_futures()) - self.stale_inventory_notifier = StaleInventoryNotifier() async def get_markdown(self, doc_item: _cog.DocItem) -> str: @@ -107,8 +95,7 @@ class BatchParser: Not safe to run while `self.clear` is running. 
""" - if doc_item not in self._item_futures: - self._item_futures.update((item, ParseResultFuture()) for item in self._page_doc_items[doc_item.url]) + if doc_item not in self._item_futures and doc_item not in self._queue: self._item_futures[doc_item].user_requested = True async with bot.instance.http_session.get(doc_item.url) as response: @@ -150,7 +137,7 @@ class BatchParser: markdown = await bot.instance.loop.run_in_executor(None, get_symbol_markdown, soup, item) if markdown is not None: - scheduling.create_task(doc_cache.set(item, markdown)) + await doc_cache.set(item, markdown) else: scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception as e: @@ -161,6 +148,7 @@ class BatchParser: future.set_exception(e) else: future.set_result(markdown) + del self._item_futures[item] await asyncio.sleep(0.1) finally: self._parse_task = None @@ -194,19 +182,3 @@ class BatchParser: self._queue.clear() self._page_doc_items.clear() self._item_futures.clear() - - async def _clean_up_futures(self) -> None: - """ - Clear old futures from internal results. - - After a future is set, we only need to wait for old requests to its associated `DocItem` to finish - as all new requests will get the value from the redis cache in the cog first. - Keeping them around for longer than a second is unnecessary and keeps the parsed Markdown strings alive. 
- """ - while True: - if not self._queue: - current_time = time.time() - for key, future in self._item_futures.copy().items(): - if current_time - future.result_set_time > 5: - del self._item_futures[key] - await asyncio.sleep(5) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 5f9366228..80f85d625 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -413,6 +413,5 @@ class DocCog(commands.Cog): def cog_unload(self) -> None: """Clear scheduled inventories, queued symbols and cleanup task on cog unload.""" self.inventory_scheduler.cancel_all() - self.item_fetcher.cleanup_futures_task.cancel() self.init_refresh_task.cancel() asyncio.create_task(self.item_fetcher.clear()) -- cgit v1.2.3 From 0b8dab1840ba4f14b41f18a88d0fd870dfeec7fe Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:19:16 +0100 Subject: Add comments for purpose of DocItem attributes --- bot/exts/info/doc/_cog.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 80f85d625..fd8ed2008 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -42,11 +42,11 @@ COMMAND_LOCK_SINGLETON = "inventory refresh" class DocItem(NamedTuple): """Holds inventory symbol information.""" - package: str - group: str - base_url: str - relative_url_path: str - symbol_id: str + package: str # Name of the package name the symbol is from + group: str # Interpshinx "role" of the symbol, for example `label` or `method` + base_url: str # Absolute path to to which the relative path resolves, same for all items with the same package + relative_url_path: str # Relative path to the page where the symbol is located + symbol_id: str # Fragment id used to locate the symbol on the page @property def url(self) -> str: -- cgit v1.2.3 From 8e556bd52b62881de594b6d73365aa0b0498c766 Mon Sep 17 00:00:00 2001 From: Numerlor 
<25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:23:44 +0100 Subject: Use clearer branching Co-authored-by: MarkKoz --- bot/exts/info/doc/_parsing.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index b422b4f24..7549efeac 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -211,12 +211,10 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] ) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: - formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) + signature = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) + return f"{signature}\n{description}" else: - formatted_markdown = "" - formatted_markdown += f"\n{description}" - - return formatted_markdown + return description def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: -- cgit v1.2.3 From 398bbdd2080934ef643d5fc98db6358f28fec051 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:26:09 +0100 Subject: Remove placeholder in shorten call --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 7549efeac..b1b09ccc7 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -187,7 +187,7 @@ def _get_truncated_description( possible_truncation_indices = [cut for cut in markdown_element_ends if cut < truncate_index] if not possible_truncation_indices: # In case there is no Markdown element ending before the truncation index, use shorten as a fallback. 
- truncated_result = textwrap.shorten(result, truncate_index) + truncated_result = textwrap.shorten(result, truncate_index, placeholder="") else: # Truncate at the last Markdown element that comes before the truncation index. markdown_truncate_index = possible_truncation_indices[-1] -- cgit v1.2.3 From c2c0dc2a8caced134422c005d010e7dd10cf7466 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:32:06 +0100 Subject: Account for ellipses when determining the truncation description index --- bot/exts/info/doc/_parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index b1b09ccc7..43e78ddca 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -170,14 +170,14 @@ def _get_truncated_description( if not markdown_element_ends: return "" - # Determine the "hard" truncation index. + # Determine the "hard" truncation index. Account for the ellipsis placeholder for the max length. newline_truncate_index = find_nth_occurrence(result, "\n", max_lines) - if newline_truncate_index is not None and newline_truncate_index < _MAX_DESCRIPTION_LENGTH: + if newline_truncate_index is not None and newline_truncate_index < _MAX_DESCRIPTION_LENGTH - 3: # Truncate based on maximum lines if there are more than the maximum number of lines. truncate_index = newline_truncate_index else: # There are less than the maximum number of lines; truncate based on the max char length. - truncate_index = _MAX_DESCRIPTION_LENGTH + truncate_index = _MAX_DESCRIPTION_LENGTH - 3 # Nothing needs to be truncated if the last element ends before the truncation index. 
if truncate_index >= markdown_element_ends[-1]: -- cgit v1.2.3 From 9c28041dcfb33b273823ef6d5fec3abbe3f1a4c8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:37:26 +0100 Subject: Add comments to the parsing module Co-authored-by: MarkKoz --- bot/exts/info/doc/_parsing.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 43e78ddca..e7b8b695b 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -49,7 +49,7 @@ _BRACKET_PAIRS = { def _is_closing_quote(search_string: str, index: int) -> bool: """Check whether the quote at `index` inside `search_string` can be a closing quote.""" if search_string[index - 1] != "\\": - return True + return True # The quote is not escaped. elif search_string[index - 2] == "\\": return True return False @@ -69,7 +69,7 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: for index, character in enumerated_string: if character in {"'", '"'}: # Skip everything inside of strings, regardless of the depth. - quote_character = character + quote_character = character # The closing quote must equal the opening quote. for index, character in enumerated_string: if character == quote_character and _is_closing_quote(parameters_string, index): break @@ -103,6 +103,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. """ if sum(len(signature) for signature in signatures) <= _MAX_SIGNATURES_LENGTH: + # Total length of signatures is under the length limit; no truncation needed. 
return signatures max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) @@ -111,6 +112,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec signature = signature.strip() if len(signature) > max_signature_length: if (parameters_match := _PARAMETERS_RE.search(signature)) is None: + # The signature has no parameters or the regex failed; perform a simple truncation of the text. formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) continue @@ -118,14 +120,17 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec parameters_string = parameters_match[1] running_length = len(signature) - len(parameters_string) for parameter in _split_parameters(parameters_string): + # Check if including this parameter would still be within the maximum length. if (len(parameter) + running_length) <= max_signature_length - 5: # account for comma and placeholder truncated_signature.append(parameter) running_length += len(parameter) + 1 else: + # There's no more room for this parameter. Truncate the parameter list and put it in the signature. truncated_signature.append(" ...") formatted_signatures.append(signature.replace(parameters_string, ",".join(truncated_signature))) break else: + # The current signature is under the length limit; no truncation needed. formatted_signatures.append(signature) return formatted_signatures @@ -144,7 +149,7 @@ def _get_truncated_description( with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits. """ result = "" - markdown_element_ends = [] + markdown_element_ends = [] # Stores indices into `result` which point to the end boundary of each Markdown element. 
rendered_length = 0 tag_end_index = 0 -- cgit v1.2.3 From 4c423a8d97035e9b7f67413f63b0241b027cd1fc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:37:58 +0100 Subject: Use placeholder consistent with others in the cog --- bot/exts/info/doc/_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index e7b8b695b..fc38ff82a 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -113,7 +113,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec if len(signature) > max_signature_length: if (parameters_match := _PARAMETERS_RE.search(signature)) is None: # The signature has no parameters or the regex failed; perform a simple truncation of the text. - formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) + formatted_signatures.append(textwrap.shorten(signature, max_signature_length, placeholder="...")) continue truncated_signature = [] -- cgit v1.2.3 From 33d6df2eae9d235db3405966e3f55db970582632 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:47:58 +0100 Subject: Explain use of various containers in the cog Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index fd8ed2008..cedd31f55 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -58,10 +58,13 @@ class DocCog(commands.Cog): """A set of commands for querying & displaying documentation.""" def __init__(self, bot: Bot): + # Contains URLs to documentation home pages. + # Used to calculate inventory diffs on refreshes and to display all currently stored inventories. 
self.base_urls = {} self.bot = bot - self.doc_symbols: Dict[str, DocItem] = {} + self.doc_symbols: Dict[str, DocItem] = {} # Maps symbol names to objects containing their metadata. self.item_fetcher = _batch_parser.BatchParser() + # Maps a conflicting symbol name to a list of the new, disambiguated names created from conflicts with the name. self.renamed_symbols = defaultdict(list) self.inventory_scheduler = Scheduler(self.__class__.__name__) -- cgit v1.2.3 From e811070c0909f596e1767ee955b302b5f60a16d8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 01:48:29 +0100 Subject: Rename params to clearer and more concise alternatives --- bot/exts/info/doc/_cog.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index cedd31f55..8dcc1eff3 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -81,7 +81,7 @@ class DocCog(commands.Cog): await self.bot.wait_until_guild_available() await self.refresh_inventories() - def update_single(self, api_package_name: str, base_url: str, package: InventoryDict) -> None: + def update_single(self, package_name: str, base_url: str, inventory: InventoryDict) -> None: """ Build the inventory for a single package. @@ -91,16 +91,16 @@ class DocCog(commands.Cog): absolute paths that link to specific symbols * `package` is the content of a intersphinx inventory. """ - self.base_urls[api_package_name] = base_url + self.base_urls[package_name] = base_url - for group, items in package.items(): + for group, items in inventory.items(): for symbol_name, relative_doc_url in items: # e.g. 
get 'class' from 'py:class' group_name = group.split(":")[1] if (original_item := self.doc_symbols.get(symbol_name)) is not None: replaced_symbol_name = self.ensure_unique_symbol_name( - api_package_name, + package_name, group_name, original_item, symbol_name, @@ -111,7 +111,7 @@ class DocCog(commands.Cog): relative_url_path, _, symbol_id = relative_doc_url.partition("#") # Intern fields that have shared content so we're not storing unique strings for every object doc_item = DocItem( - api_package_name, + package_name, sys.intern(group_name), base_url, sys.intern(relative_url_path), @@ -120,7 +120,7 @@ class DocCog(commands.Cog): self.doc_symbols[symbol_name] = doc_item self.item_fetcher.add_item(doc_item) - log.trace(f"Fetched inventory for {api_package_name}.") + log.trace(f"Fetched inventory for {package_name}.") async def update_or_reschedule_inventory( self, -- cgit v1.2.3 From 3d4df68eb875a0e7042be387bb50561b917d1e40 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:09:14 +0100 Subject: Move future assignment and check outside of the try No exceptions can be raised from the two lines of code because of the data structures used, moving it out makes for flatter code. --- bot/exts/info/doc/_batch_parser.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index d80b62d88..a626008d2 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -129,12 +129,13 @@ class BatchParser: while self._queue: item, soup = self._queue.pop() markdown = None - try: - if (future := self._item_futures[item]).done(): - # Some items are present in the inventories multiple times under different symbol names, - # if we already parsed an equal item, we can just skip it. 
- continue + if (future := self._item_futures[item]).done(): + # Some items are present in the inventories multiple times under different symbol names, + # if we already parsed an equal item, we can just skip it. + continue + + try: + markdown = await bot.instance.loop.run_in_executor(None, get_symbol_markdown, soup, item) + if markdown is not None: + await doc_cache.set(item, markdown) -- cgit v1.2.3 From 218455259e9d520bcf3b48c3d8d57b1924f31cc9 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:22:17 +0100 Subject: Correct typehint --- bot/exts/info/doc/_batch_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index a626008d2..d88b32208 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -6,7 +6,7 @@ import logging from collections import defaultdict from contextlib import suppress from operator import attrgetter -from typing import Deque, Dict, List, NamedTuple, Union +from typing import Deque, Dict, List, NamedTuple, Optional, Union import discord from bs4 import BeautifulSoup @@ -86,7 +86,7 @@ class BatchParser: self.stale_inventory_notifier = StaleInventoryNotifier() - async def get_markdown(self, doc_item: _cog.DocItem) -> str: + async def get_markdown(self, doc_item: _cog.DocItem) -> Optional[str]: """ Get the result Markdown of `doc_item`. -- cgit v1.2.3 From 64e5ba42675f0940995d75d2a3340791acd260c2 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:27:35 +0100 Subject: Set future result to None on exceptions We can still provide the user with at least the link to the docs, for which we already have handling in the cog with a generic "unable to parse message", using exceptions for that would mean setting it here, immediately catching it and then providing the same or very similar message.
--- bot/exts/info/doc/_batch_parser.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index d88b32208..a809fed78 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -141,14 +141,9 @@ class BatchParser: await doc_cache.set(item, markdown) else: scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) - except Exception as e: + except Exception: log.exception(f"Unexpected error when handling {item}") - if markdown is not None: - future.set_result(markdown) - else: - future.set_exception(e) - else: - future.set_result(markdown) + future.set_result(markdown) del self._item_futures[item] await asyncio.sleep(0.1) finally: -- cgit v1.2.3 From c3a516ce6d69e774c3a0d441b0ca2b4a1af774be Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:31:32 +0100 Subject: Add comment explaining purpose of create_task over await --- bot/exts/info/doc/_batch_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index a809fed78..da0984a91 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -140,6 +140,7 @@ class BatchParser: if markdown is not None: await doc_cache.set(item, markdown) else: + # Don't wait for this coro as the parsing doesn't depend on anything it does. 
scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) except Exception: log.exception(f"Unexpected error when handling {item}") -- cgit v1.2.3 From af3c1140c99058e6681f26e8f72b973935df7ad8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:32:07 +0100 Subject: Use scheduling's create_task --- bot/exts/info/doc/_batch_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index da0984a91..f56f4e283 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -110,7 +110,7 @@ class BatchParser: log.debug(f"Added items from {doc_item.url} to the parse queue.") if self._parse_task is None: - self._parse_task = asyncio.create_task(self._parse_queue()) + self._parse_task = scheduling.create_task(self._parse_queue()) else: self._item_futures[doc_item].user_requested = True with suppress(ValueError): -- cgit v1.2.3 From 7f1f47104eaa7ad7ca38ecad846f32b6567060d0 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:36:40 +0100 Subject: Name tasks --- bot/exts/info/doc/_batch_parser.py | 11 ++++++++--- bot/exts/info/doc/_cog.py | 7 +++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_batch_parser.py b/bot/exts/info/doc/_batch_parser.py index f56f4e283..369bb462c 100644 --- a/bot/exts/info/doc/_batch_parser.py +++ b/bot/exts/info/doc/_batch_parser.py @@ -24,7 +24,10 @@ class StaleInventoryNotifier: """Handle sending notifications about stale inventories through `DocItem`s to dev log.""" def __init__(self): - self._init_task = bot.instance.loop.create_task(self._init_channel()) + self._init_task = bot.instance.loop.create_task( + self._init_channel(), + name="StaleInventoryNotifier channel init" + ) self._warned_urls = set() async def _init_channel(self) -> None: @@ -110,7 +113,7 @@ class 
BatchParser: log.debug(f"Added items from {doc_item.url} to the parse queue.") if self._parse_task is None: - self._parse_task = scheduling.create_task(self._parse_queue()) + self._parse_task = scheduling.create_task(self._parse_queue(), name="Queue parse") else: self._item_futures[doc_item].user_requested = True with suppress(ValueError): @@ -141,7 +144,9 @@ class BatchParser: await doc_cache.set(item, markdown) else: # Don't wait for this coro as the parsing doesn't depend on anything it does. - scheduling.create_task(self.stale_inventory_notifier.send_warning(item)) + scheduling.create_task( + self.stale_inventory_notifier.send_warning(item), name="Stale inventory warning" + ) except Exception: log.exception(f"Unexpected error when handling {item}") future.set_result(markdown) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 8dcc1eff3..60f6d8eea 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -73,7 +73,10 @@ class DocCog(commands.Cog): self.refresh_event.set() self.symbol_get_event = SharedEvent() - self.init_refresh_task = self.bot.loop.create_task(self.init_refresh_inventory()) + self.init_refresh_task = self.bot.loop.create_task( + self.init_refresh_inventory(), + name="Doc inventory init" + ) @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def init_refresh_inventory(self) -> None: @@ -417,4 +420,4 @@ class DocCog(commands.Cog): """Clear scheduled inventories, queued symbols and cleanup task on cog unload.""" self.inventory_scheduler.cancel_all() self.init_refresh_task.cancel() - asyncio.create_task(self.item_fetcher.clear()) + asyncio.create_task(self.item_fetcher.clear(), name="DocCog.item_fetcher unload clear") -- cgit v1.2.3 From 150cb3371040e0fefbe24702ca80ce2808014f6f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:39:14 +0100 Subject: Rename markup_hint to Markup --- bot/exts/info/doc/_html.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index 334b82e98..94efd81b7 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -33,9 +33,9 @@ class Strainer(SoupStrainer): log.warning("`text` is not a supported kwarg in the custom strainer.") super().__init__(**kwargs) - markup_hint = Union[PageElement, List["markup_hint"]] + Markup = Union[PageElement, List["Markup"]] - def search(self, markup: markup_hint) -> Union[PageElement, str]: + def search(self, markup: Markup) -> Union[PageElement, str]: """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" if isinstance(markup, str): # Let everything through the text filter if we're including strings and tags. -- cgit v1.2.3 From f7b56c533df7bf8c520f5cf69df5bf6dd62cf2dc Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 02:44:25 +0100 Subject: Clarify the use of _set_expires and needs_expire --- bot/exts/info/doc/_redis_cache.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py index 7de2f3806..ad764816f 100644 --- a/bot/exts/info/doc/_redis_cache.py +++ b/bot/exts/info/doc/_redis_cache.py @@ -30,6 +30,9 @@ class DocRedisCache(RedisObject): with await self._get_pool_connection() as connection: if redis_key not in self._set_expires: + # An expire is only set if the key didn't exist before. + # If this is the first time setting values for this key check if it exists and add it to + # `_set_expires` to prevent redundant checks for subsequent uses with items from the same page. 
self._set_expires.add(redis_key) needs_expire = not await connection.exists(redis_key) -- cgit v1.2.3 From 74cbe44625a1e6e2e39f77b2663794d3ab5aaf58 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 5 Mar 2021 18:22:46 +0100 Subject: Correct tests cases The tests were not adjusted after the converter was corrected to accept digits --- tests/bot/test_converters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index 231798a92..4af84dde5 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -80,7 +80,7 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase): async def test_package_name_for_valid(self): """PackageName returns valid package names unchanged.""" - test_values = ('foo', 'le_mon') + test_values = ('foo', 'le_mon', 'num83r') for name in test_values: with self.subTest(identifier=name): @@ -89,7 +89,7 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase): async def test_package_name_for_invalid(self): """PackageName raises the proper exception for invalid package names.""" - test_values = ('text_with_a_dot.', 'UpperCaseName', "num83r") + test_values = ('text_with_a_dot.', 'UpperCaseName', 'dashed-name') for name in test_values: with self.subTest(identifier=name): -- cgit v1.2.3 From 4f5f284d3eec46b9209d19142d5c21456c4c403a Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 6 Mar 2021 03:46:25 +0100 Subject: Abstract logic from create_symbol_embed into additional methods The method was also renamed from get_symbol_embed to create_symbol_embed --- bot/exts/info/doc/_cog.py | 54 +++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 60f6d8eea..64e204fad 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -229,19 +229,13 @@ 
class DocCog(commands.Cog): log.debug("Finished inventory refresh.") self.refresh_event.set() - async def get_symbol_embed(self, symbol_name: str) -> Optional[discord.Embed]: + def get_symbol_item(self, symbol_name: str) -> Optional[DocItem]: """ - Attempt to scrape and fetch the data for the given `symbol_name`, and build an embed from its contents. - - If the symbol is known, an Embed with documentation about it is returned. + Get the `DocItem` associated with `symbol_name` from the `doc_symbols` dict. - First check the DocRedisCache before querying the cog's `BatchParser`. + If the doc item is not found directly from the name and the name contains a space, + the first word of the name will be attempted to be used to get the item. """ - log.trace(f"Building embed for symbol `{symbol_name}`") - if not self.refresh_event.is_set(): - log.debug("Waiting for inventories to be refreshed before processing item.") - await self.refresh_event.wait() - doc_item = self.doc_symbols.get(symbol_name) if doc_item is None and " " in symbol_name: # If an invalid symbol contains a space, check if the command was invoked @@ -249,25 +243,49 @@ class DocCog(commands.Cog): symbol_name = symbol_name.split(" ", maxsplit=1)[0] doc_item = self.doc_symbols.get(symbol_name) - if doc_item is None: - log.debug("Symbol does not exist.") - return None + return doc_item - self.bot.stats.incr(f"doc_fetches.{doc_item.package}") + async def get_symbol_markdown(self, doc_item: DocItem) -> str: + """ + Get the Markdown from the symbol `doc_item` refers to. + First a redis lookup is attempted, if that fails the `item_fetcher` + is used to fetch the page and parse the HTML from it into Markdown. 
+ """ with self.symbol_get_event: markdown = await doc_cache.get(doc_item) if markdown is None: - log.debug(f"Redis cache miss for symbol `{symbol_name}`.") + log.debug(f"Redis cache miss with {doc_item}.") markdown = await self.item_fetcher.get_markdown(doc_item) if markdown is None: - markdown = "Unable to parse the requested symbol." + return "Unable to parse the requested symbol." + return markdown + + async def create_symbol_embed(self, symbol_name: str) -> Optional[discord.Embed]: + """ + Attempt to scrape and fetch the data for the given `symbol_name`, and build an embed from its contents. + + If the symbol is known, an Embed with documentation about it is returned. + + First check the DocRedisCache before querying the cog's `BatchParser`. + """ + log.trace(f"Building embed for symbol `{symbol_name}`") + if not self.refresh_event.is_set(): + log.debug("Waiting for inventories to be refreshed before processing item.") + await self.refresh_event.wait() + + doc_item = self.get_symbol_item(symbol_name) + if doc_item is None: + log.debug("Symbol does not exist.") + return None + + self.bot.stats.incr(f"doc_fetches.{doc_item.package}") embed = discord.Embed( title=discord.utils.escape_markdown(symbol_name), url=f"{doc_item.url}#{doc_item.symbol_id}", - description=markdown + description=await self.get_symbol_markdown(doc_item) ) # Show all symbols with the same name that were renamed in the footer, # with a max of 100 chars. 
@@ -314,7 +332,7 @@ class DocCog(commands.Cog): else: symbol = symbol_name.strip("`") async with ctx.typing(): - doc_embed = await self.get_symbol_embed(symbol) + doc_embed = await self.create_symbol_embed(symbol) if doc_embed is None: error_message = await send_denial(ctx, "No documentation found for the requested symbol.") -- cgit v1.2.3 From 7d596f5d8fb454f00288b0a6fbd60789c5dd17be Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 6 Mar 2021 03:47:21 +0100 Subject: Create the footer text before an inventory refresh can occur --- bot/exts/info/doc/_cog.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 64e204fad..c01e0f36a 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -282,11 +282,6 @@ class DocCog(commands.Cog): self.bot.stats.incr(f"doc_fetches.{doc_item.package}") - embed = discord.Embed( - title=discord.utils.escape_markdown(symbol_name), - url=f"{doc_item.url}#{doc_item.symbol_id}", - description=await self.get_symbol_markdown(doc_item) - ) # Show all symbols with the same name that were renamed in the footer, # with a max of 100 chars. 
if symbol_name in self.renamed_symbols: @@ -294,6 +289,12 @@ class DocCog(commands.Cog): footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') else: footer_text = "" + + embed = discord.Embed( + title=discord.utils.escape_markdown(symbol_name), + url=f"{doc_item.url}#{doc_item.symbol_id}", + description=await self.get_symbol_markdown(doc_item) + ) embed.set_footer(text=footer_text) return embed -- cgit v1.2.3 From 51a11cc4b1ff9a4de0dfa33490ae7fceec96423d Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 6 Mar 2021 03:56:18 +0100 Subject: Handle unexpected errors when requesting markdown --- bot/exts/info/doc/_cog.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index c01e0f36a..0334f6001 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -9,6 +9,7 @@ from contextlib import suppress from types import SimpleNamespace from typing import Dict, NamedTuple, Optional, Union +import aiohttp import discord from discord.ext import commands @@ -257,7 +258,17 @@ class DocCog(commands.Cog): if markdown is None: log.debug(f"Redis cache miss with {doc_item}.") - markdown = await self.item_fetcher.get_markdown(doc_item) + try: + markdown = await self.item_fetcher.get_markdown(doc_item) + + except aiohttp.ClientError as e: + log.warning(f"A network error has occurred when requesting parsing of {doc_item}.", exc_info=e) + return "Unable to parse the requested symbol due to a network error." + + except Exception: + log.exception(f"An unexpected error has occurred when requesting parsing of {doc_item}.") + return "Unable to parse the requested symbol due to an error." + if markdown is None: return "Unable to parse the requested symbol." 
return markdown -- cgit v1.2.3 From c53bff5771ded98b4ffc5c50fdd1634056889b07 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sat, 6 Mar 2021 17:22:00 +0100 Subject: Remove superfluous comment After the move to a separate method, the docstring now documents the behaviour so a comment is unnecessary --- bot/exts/info/doc/_cog.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 0334f6001..fb45d0bbb 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -239,8 +239,6 @@ class DocCog(commands.Cog): """ doc_item = self.doc_symbols.get(symbol_name) if doc_item is None and " " in symbol_name: - # If an invalid symbol contains a space, check if the command was invoked - # in the format !d symbol_name = symbol_name.split(" ", maxsplit=1)[0] doc_item = self.doc_symbols.get(symbol_name) -- cgit v1.2.3 From dc7eef432189aaaf0ea8b0d16588852306104957 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 7 Mar 2021 05:18:02 +0100 Subject: Handle arbitrary amount of backslashes preceding the quote char Tests for this were added additionally --- bot/exts/info/doc/_parsing.py | 19 ++++++++----------- tests/bot/exts/info/doc/test_parsing.py | 7 +++++++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index fc38ff82a..57c991ae0 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -46,15 +46,6 @@ _BRACKET_PAIRS = { } -def _is_closing_quote(search_string: str, index: int) -> bool: - """Check whether the quote at `index` inside `search_string` can be a closing quote.""" - if search_string[index - 1] != "\\": - return True # The quote is not escaped. 
- elif search_string[index - 2] == "\\": - return True - return False - - def _split_parameters(parameters_string: str) -> Iterator[str]: """ Split parameters of a signature into individual parameter strings on commas. @@ -70,9 +61,15 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: if character in {"'", '"'}: # Skip everything inside of strings, regardless of the depth. quote_character = character # The closing quote must equal the opening quote. - for index, character in enumerated_string: - if character == quote_character and _is_closing_quote(parameters_string, index): + preceding_backslashes = 0 + for _, character in enumerated_string: + # If an odd number of backslashes precedes the quote, it was escaped. + if character == quote_character and not preceding_backslashes % 2: break + if character == "\\": + preceding_backslashes += 1 + else: + preceding_backslashes = 0 elif current_search is None: if (current_search := _BRACKET_PAIRS.get(character)) is not None: diff --git a/tests/bot/exts/info/doc/test_parsing.py b/tests/bot/exts/info/doc/test_parsing.py index f302b38fc..1663d8491 100644 --- a/tests/bot/exts/info/doc/test_parsing.py +++ b/tests/bot/exts/info/doc/test_parsing.py @@ -42,6 +42,13 @@ class SignatureSplitter(TestCase): ) self._run_tests(test_cases) + def test_quote_escaped(self): + test_cases = ( + (r"'\',','\\',0", [r"'\','", r"'\\'", "0"]), + (r"'0\',0\\\'\\',0", [r"'0\',0\\\'\\'", "0"]), + ) + self._run_tests(test_cases) + def test_real_signatures(self): test_cases = ( ("start, stop[, step]", ["start", " stop[, step]"]), -- cgit v1.2.3 From e61a5216bd19adcbc689fe2f18f969b94ce72e8f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 10 Mar 2021 04:04:14 +0100 Subject: Hold the symbol_get_event in the entire body of create_symbol_embed While the previous code was safe, the synchronization was spread out over different modules and was hard to wrap around. 
Additionally changes could introduce context switches without the author being aware of them causing potential race conditions with the refresh. Moving the whole body into the with block solves both of these issues --- bot/exts/info/doc/_cog.py | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index fb45d0bbb..24b571ddb 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -251,8 +251,7 @@ class DocCog(commands.Cog): First a redis lookup is attempted, if that fails the `item_fetcher` is used to fetch the page and parse the HTML from it into Markdown. """ - with self.symbol_get_event: - markdown = await doc_cache.get(doc_item) + markdown = await doc_cache.get(doc_item) if markdown is None: log.debug(f"Redis cache miss with {doc_item}.") @@ -283,29 +282,30 @@ class DocCog(commands.Cog): if not self.refresh_event.is_set(): log.debug("Waiting for inventories to be refreshed before processing item.") await self.refresh_event.wait() + # Ensure a refresh can't run in case of a context switch until the with block is exited + with self.symbol_get_event: + doc_item = self.get_symbol_item(symbol_name) + if doc_item is None: + log.debug("Symbol does not exist.") + return None + + self.bot.stats.incr(f"doc_fetches.{doc_item.package}") + + # Show all symbols with the same name that were renamed in the footer, + # with a max of 100 chars. + if symbol_name in self.renamed_symbols: + renamed_symbols = ', '.join(self.renamed_symbols[symbol_name]) + footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') + else: + footer_text = "" - doc_item = self.get_symbol_item(symbol_name) - if doc_item is None: - log.debug("Symbol does not exist.") - return None - - self.bot.stats.incr(f"doc_fetches.{doc_item.package}") - - # Show all symbols with the same name that were renamed in the footer, - # with a max of 100 chars. 
- if symbol_name in self.renamed_symbols: - renamed_symbols = ', '.join(self.renamed_symbols[symbol_name]) - footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') - else: - footer_text = "" - - embed = discord.Embed( - title=discord.utils.escape_markdown(symbol_name), - url=f"{doc_item.url}#{doc_item.symbol_id}", - description=await self.get_symbol_markdown(doc_item) - ) - embed.set_footer(text=footer_text) - return embed + embed = discord.Embed( + title=discord.utils.escape_markdown(symbol_name), + url=f"{doc_item.url}#{doc_item.symbol_id}", + description=await self.get_symbol_markdown(doc_item) + ) + embed.set_footer(text=footer_text) + return embed @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol_name: Optional[str]) -> None: -- cgit v1.2.3 From 522ed426f08845f3843aa3f60284205d1e36dfe8 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Wed, 10 Mar 2021 04:35:09 +0100 Subject: Use a clearer approach with less duplicate code Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 81 ++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 24b571ddb..9e41c6f1e 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -102,15 +102,11 @@ class DocCog(commands.Cog): # e.g. 
get 'class' from 'py:class' group_name = group.split(":")[1] - if (original_item := self.doc_symbols.get(symbol_name)) is not None: - replaced_symbol_name = self.ensure_unique_symbol_name( - package_name, - group_name, - original_item, - symbol_name, - ) - if replaced_symbol_name is not None: - symbol_name = replaced_symbol_name + symbol_name = self.ensure_unique_symbol_name( + package_name, + group_name, + symbol_name, + ) relative_url_path, _, symbol_id = relative_doc_url.partition("#") # Intern fields that have shared content so we're not storing unique strings for every object @@ -155,59 +151,52 @@ class DocCog(commands.Cog): else: self.update_single(api_package_name, base_url, package) - def ensure_unique_symbol_name( - self, - package_name: str, - group_name: str, - original_item: DocItem, - symbol_name: str, - ) -> Optional[str]: + def ensure_unique_symbol_name(self, package_name: str, group_name: str, symbol_name: str) -> str: """ Ensure `symbol_name` doesn't overwrite an another symbol in `doc_symbols`. - Should only be called with symbol names that already have a conflict in `doc_symbols`. + For conflicts, rename either the current symbol or the existing symbol with which it conflicts. + Store the new name in `renamed_symbols` and return the name to use for the symbol. - If None is returned, space was created for `symbol_name` in `doc_symbols` instead of - the symbol name being changed. + If the existing symbol was renamed or there was no conflict, the returned name is equivalent to `symbol_name`. """ + if (item := self.doc_symbols.get(symbol_name)) is None: + return symbol_name # There's no conflict so it's fine to simply use the given symbol name. + + def rename(prefix: str, *, rename_extant: bool = False) -> str: + new_name = f"{prefix}.{symbol_name}" + if new_name in self.doc_symbols: + # If there's still a conflict, qualify the name further. 
+ if rename_extant: + new_name = f"{item.package}.{item.group}.{symbol_name}" + else: + new_name = f"{package_name}.{group_name}.{symbol_name}" + + self.renamed_symbols[symbol_name].append(new_name) + + if rename_extant: + # Instead of renaming the current symbol, rename the symbol with which it conflicts. + self.doc_symbols[new_name] = self.doc_symbols[symbol_name] + return symbol_name + else: + return new_name + # Certain groups are added as prefixes to disambiguate the symbols. if group_name in FORCE_PREFIX_GROUPS: - new_symbol_name = f"{group_name}.{symbol_name}" - if new_symbol_name in self.doc_symbols: - # If there's still a conflict, prefix with package name. - new_symbol_name = f"{package_name}.{new_symbol_name}" - self.renamed_symbols[symbol_name].append(new_symbol_name) - return new_symbol_name + return rename(group_name) # The existing symbol with which the current symbol conflicts should have a group prefix. # It currently doesn't have the group prefix because it's only added once there's a conflict. - elif (original_symbol_group := original_item.group) in FORCE_PREFIX_GROUPS: - overridden_symbol_name = f"{original_symbol_group}.{symbol_name}" - if overridden_symbol_name in self.doc_symbols: - # If there's still a conflict, prefix with package name. - overridden_symbol_name = f"{original_item.package}.{overridden_symbol_name}" - - self.doc_symbols[overridden_symbol_name] = original_item - self.renamed_symbols[symbol_name].append(overridden_symbol_name) + elif item.group in FORCE_PREFIX_GROUPS: + return rename(item.group, rename_extant=True) elif package_name in PRIORITY_PACKAGES: - overridden_symbol_name = f"{original_item.package}.{symbol_name}" - if overridden_symbol_name in self.doc_symbols: - # If there's still a conflict, add the symbol's group in the middle. 
- overridden_symbol_name = f"{original_item.package}.{original_item.group}.{symbol_name}" - - self.doc_symbols[overridden_symbol_name] = original_item - self.renamed_symbols[symbol_name].append(overridden_symbol_name) + return rename(item.package, rename_extant=True) # If we can't specially handle the symbol through its group or package, # fall back to prepending its package name to the front. else: - new_symbol_name = f"{package_name}.{symbol_name}" - if new_symbol_name in self.doc_symbols: - # If there's still a conflict, add the symbol's group in the middle. - new_symbol_name = f"{package_name}.{group_name}.{symbol_name}" - self.renamed_symbols[symbol_name].append(new_symbol_name) - return new_symbol_name + return rename(package_name) async def refresh_inventories(self) -> None: """Refresh internal documentation inventories.""" -- cgit v1.2.3 From 8d93afa4047d3e87fdd1bff6f003e1cfb44bd01c Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 12 Mar 2021 04:48:20 +0100 Subject: Correct length limits to embed limits Previously the code used limits that apply to raw messages, not embeds. Both the description and footer limits are separate, while their individual limits are 2048 chars instead of 2000. The footer overhead was removed from the max description length and the footer is now truncated to 200 chars which is roughly 2 lines --- bot/exts/info/doc/_cog.py | 2 +- bot/exts/info/doc/_parsing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 9e41c6f1e..bf49e0aee 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -284,7 +284,7 @@ class DocCog(commands.Cog): # with a max of 100 chars. 
if symbol_name in self.renamed_symbols: renamed_symbols = ', '.join(self.renamed_symbols[symbol_name]) - footer_text = textwrap.shorten("Moved: " + renamed_symbols, 100, placeholder=' ...') + footer_text = textwrap.shorten("Moved: " + renamed_symbols, 200, placeholder=' ...') else: footer_text = "" diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 57c991ae0..b06aebd45 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -33,8 +33,8 @@ _NO_SIGNATURE_GROUPS = { _EMBED_CODE_BLOCK_LINE_LENGTH = 61 # _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight _MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT -# Maximum discord message length - signatures on top - space for footer -_MAX_DESCRIPTION_LENGTH = 1900 - _MAX_SIGNATURES_LENGTH +# Maximum embed description length - signatures on top +_MAX_DESCRIPTION_LENGTH = 2048 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace BracketPair = namedtuple("BracketPair", ["opening_bracket", "closing_bracket"]) -- cgit v1.2.3 From 3beebb973f3cceb5281d1901535185276c9f4714 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 16 Mar 2021 02:07:28 +0100 Subject: Update the symbol_name when fetching the DocItem from get_symbol_item Moving the block handling the fetching into a separate method meant that symbol_name was no longer updated inside the create_symbol_embed method, causing the whole message to be included in the embed title in case the space shortcut was used --- bot/exts/info/doc/_cog.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index bf49e0aee..5af95717b 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -7,7 +7,7 @@ import textwrap from collections import defaultdict from contextlib import suppress from types import SimpleNamespace 
-from typing import Dict, NamedTuple, Optional, Union +from typing import Dict, NamedTuple, Optional, Tuple, Union import aiohttp import discord @@ -219,11 +219,11 @@ class DocCog(commands.Cog): log.debug("Finished inventory refresh.") self.refresh_event.set() - def get_symbol_item(self, symbol_name: str) -> Optional[DocItem]: + def get_symbol_item(self, symbol_name: str) -> Tuple[str, Optional[DocItem]]: """ - Get the `DocItem` associated with `symbol_name` from the `doc_symbols` dict. + Get the `DocItem` and the symbol name used to fetch it from the `doc_symbols` dict. - If the doc item is not found directly from the name and the name contains a space, + If the doc item is not found directly from the passed in name and the name contains a space, the first word of the name will be attempted to be used to get the item. """ doc_item = self.doc_symbols.get(symbol_name) @@ -231,7 +231,7 @@ class DocCog(commands.Cog): symbol_name = symbol_name.split(" ", maxsplit=1)[0] doc_item = self.doc_symbols.get(symbol_name) - return doc_item + return symbol_name, doc_item async def get_symbol_markdown(self, doc_item: DocItem) -> str: """ @@ -273,7 +273,7 @@ class DocCog(commands.Cog): await self.refresh_event.wait() # Ensure a refresh can't run in case of a context switch until the with block is exited with self.symbol_get_event: - doc_item = self.get_symbol_item(symbol_name) + symbol_name, doc_item = self.get_symbol_item(symbol_name) if doc_item is None: log.debug("Symbol does not exist.") return None -- cgit v1.2.3 From ba91d5a530aa9958b7549cc03fecfb95112d52ca Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Tue, 16 Mar 2021 02:35:20 +0100 Subject: Replace shorten with custom algo to find good cutoff points shorten collapses the whitespace, causing issues with codeblocks --- bot/exts/info/doc/_parsing.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bot/exts/info/doc/_parsing.py 
b/bot/exts/info/doc/_parsing.py index b06aebd45..b3402f655 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -188,8 +188,21 @@ def _get_truncated_description( # Determine the actual truncation index. possible_truncation_indices = [cut for cut in markdown_element_ends if cut < truncate_index] if not possible_truncation_indices: - # In case there is no Markdown element ending before the truncation index, use shorten as a fallback. - truncated_result = textwrap.shorten(result, truncate_index, placeholder="") + # In case there is no Markdown element ending before the truncation index, try to find a good cutoff point. + force_truncated = result[:truncate_index] + # If there is an incomplete codeblock, cut it out. + if force_truncated.count("```") % 2: + force_truncated = force_truncated[:force_truncated.rfind("```")] + # Search for substrings to truncate at, with decreasing desirability. + for string_ in ("\n\n", "\n", ". ", ", ", ",", " "): + cutoff = force_truncated.rfind(string_) + + if cutoff != -1: + truncated_result = force_truncated[:cutoff] + break + else: + truncated_result = force_truncated + else: # Truncate at the last Markdown element that comes before the truncation index. 
markdown_truncate_index = possible_truncation_indices[-1] -- cgit v1.2.3 From bb5054c1aa8abcbd91a524bb532d2677f2029d97 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 25 Mar 2021 15:03:53 +0100 Subject: swap single quotes to double quotes where they were unnecessary --- bot/exts/info/doc/_cog.py | 24 ++++++++++++------------ bot/exts/info/doc/_inventory_parser.py | 12 ++++++------ bot/exts/info/doc/_parsing.py | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 5af95717b..a06bfcbaf 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -213,7 +213,7 @@ class DocCog(commands.Cog): coros = [ self.update_or_reschedule_inventory( package["package"], package["base_url"], package["inventory_url"] - ) for package in await self.bot.api_client.get('bot/documentation-links') + ) for package in await self.bot.api_client.get("bot/documentation-links") ] await asyncio.gather(*coros) log.debug("Finished inventory refresh.") @@ -283,8 +283,8 @@ class DocCog(commands.Cog): # Show all symbols with the same name that were renamed in the footer, # with a max of 100 chars. 
if symbol_name in self.renamed_symbols: - renamed_symbols = ', '.join(self.renamed_symbols[symbol_name]) - footer_text = textwrap.shorten("Moved: " + renamed_symbols, 200, placeholder=' ...') + renamed_symbols = ", ".join(self.renamed_symbols[symbol_name]) + footer_text = textwrap.shorten("Moved: " + renamed_symbols, 200, placeholder=" ...") else: footer_text = "" @@ -296,12 +296,12 @@ class DocCog(commands.Cog): embed.set_footer(text=footer_text) return embed - @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) + @commands.group(name="docs", aliases=("doc", "d"), invoke_without_command=True) async def docs_group(self, ctx: commands.Context, *, symbol_name: Optional[str]) -> None: """Look up documentation for Python symbols.""" await self.get_command(ctx, symbol_name=symbol_name) - @docs_group.command(name='getdoc', aliases=('g',)) + @docs_group.command(name="getdoc", aliases=("g",)) async def get_command(self, ctx: commands.Context, *, symbol_name: Optional[str]) -> None: """ Return a documentation embed for a given symbol. 
@@ -344,7 +344,7 @@ class DocCog(commands.Cog): msg = await ctx.send(embed=doc_embed) await wait_for_deletion(msg, (ctx.author.id,)) - @docs_group.command(name='setdoc', aliases=('s',)) + @docs_group.command(name="setdoc", aliases=("s",)) @commands.has_any_role(*MODERATION_ROLES) @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def set_command( @@ -367,11 +367,11 @@ class DocCog(commands.Cog): """ inventory_url, inventory_dict = inventory body = { - 'package': package_name, - 'base_url': base_url, - 'inventory_url': inventory_url + "package": package_name, + "base_url": base_url, + "inventory_url": inventory_url } - await self.bot.api_client.post('bot/documentation-links', json=body) + await self.bot.api_client.post("bot/documentation-links", json=body) log.info( f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" @@ -381,7 +381,7 @@ class DocCog(commands.Cog): self.update_single(package_name, base_url, inventory_dict) await ctx.send(f"Added the package `{package_name}` to the database and updated the inventories.") - @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) + @docs_group.command(name="deletedoc", aliases=("removedoc", "rm", "d")) @commands.has_any_role(*MODERATION_ROLES) @lock(NAMESPACE, COMMAND_LOCK_SINGLETON, raise_error=True) async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: @@ -391,7 +391,7 @@ class DocCog(commands.Cog): Example: !docs deletedoc aiohttp """ - await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + await self.bot.api_client.delete(f"bot/documentation-links/{package_name}") async with ctx.typing(): await self.refresh_inventories() diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py index 1615f15bd..80d5841a0 100644 --- a/bot/exts/info/doc/_inventory_parser.py +++ b/bot/exts/info/doc/_inventory_parser.py @@ -50,12 +50,12 @@ async def _load_v1(stream: aiohttp.StreamReader) -> 
InventoryDict: async for line in stream: name, type_, location = line.decode().rstrip().split(maxsplit=2) # version 1 did not add anchors to the location - if type_ == 'mod': - type_ = 'py:module' - location += '#module-' + name + if type_ == "mod": + type_ = "py:module" + location += "#module-" + name else: - type_ = 'py:' + type_ - location += '#' + name + type_ = "py:" + type_ + location += "#" + name invdata[type_].append((name, location)) return invdata @@ -66,7 +66,7 @@ async def _load_v2(stream: aiohttp.StreamReader) -> InventoryDict: async for line in ZlibStreamReader(stream): m = _V2_LINE_RE.match(line.rstrip()) name, type_, _prio, location, _dispname = m.groups() # ignore the parsed items we don't need - if location.endswith('$'): + if location.endswith("$"): location = location[:-1] + name invdata[type_].append((name, location)) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index b3402f655..bf840b96f 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -224,7 +224,7 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] max_length=750, max_lines=13 ) - description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + description = _WHITESPACE_AFTER_NEWLINES_RE.sub("", description) if signatures is not None: signature = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) return f"{signature}\n{description}" @@ -253,4 +253,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s else: signature = get_signatures(symbol_heading) description = get_dd_description(symbol_heading) - return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip() + return _create_markdown(signature, description, symbol_data.url).replace("¶", "").strip() -- cgit v1.2.3 From bc25bfdf42cdaaba924a7ad6de1dc06a9b381285 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Fri, 
26 Mar 2021 13:37:21 +0100 Subject: Ensure the base url ends with a slash A base url without a trailing slash won't join properly with the relative paths, raising an error may prevent some mistakes when a new inventory is added --- bot/exts/info/doc/_cog.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index a06bfcbaf..ff67b0e61 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -365,6 +365,8 @@ class DocCog(commands.Cog): https://docs.python.org/3/ \ https://docs.python.org/3/objects.inv """ + if not base_url.endswith("/"): + raise commands.BadArgument("The base url must end with a slash.") inventory_url, inventory_dict = inventory body = { "package": package_name, -- cgit v1.2.3 From e2f80e6914adefee712992fa56540872aef45468 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 1 Apr 2021 20:04:12 +0200 Subject: Add missing 'attempts' Co-authored-by: Kieran Siek --- bot/converters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bot/converters.py b/bot/converters.py index 6ea2d887b..3bf05cfb3 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -192,7 +192,9 @@ class Inventory(Converter): """Convert url to Intersphinx inventory URL.""" await ctx.trigger_typing() if (inventory := await _inventory_parser.fetch_inventory(url)) is None: - raise BadArgument(f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS}.") + raise BadArgument( + f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS} attempts." 
+ ) return url, inventory -- cgit v1.2.3 From b436dcf9aa11d188f5646fe795e428f99be30b6f Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 8 Apr 2021 10:11:42 +0200 Subject: Use 'Similar names' instead of 'moved' in footer The meaning of 'moved' may not have been clear for people that weren't familiar with how the system works Co-authored-by: MarkKoz --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index ff67b0e61..7352deb8c 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -284,7 +284,7 @@ class DocCog(commands.Cog): # with a max of 100 chars. if symbol_name in self.renamed_symbols: renamed_symbols = ", ".join(self.renamed_symbols[symbol_name]) - footer_text = textwrap.shorten("Moved: " + renamed_symbols, 200, placeholder=" ...") + footer_text = textwrap.shorten("Similar names: " + renamed_symbols, 200, placeholder=" ...") else: footer_text = "" -- cgit v1.2.3 From 417c6d321b0e384fe4c689b931c899f4f043d38e Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Thu, 8 Apr 2021 10:14:37 +0200 Subject: update comment --- bot/exts/info/doc/_cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 7352deb8c..2a8016fb8 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -281,7 +281,7 @@ class DocCog(commands.Cog): self.bot.stats.incr(f"doc_fetches.{doc_item.package}") # Show all symbols with the same name that were renamed in the footer, - # with a max of 100 chars. + # with a max of 200 chars. 
if symbol_name in self.renamed_symbols: renamed_symbols = ", ".join(self.renamed_symbols[symbol_name]) footer_text = textwrap.shorten("Similar names: " + renamed_symbols, 200, placeholder=" ...") -- cgit v1.2.3 From fcfa287e96bf0eedcb2fe0bc2e004794324beeb2 Mon Sep 17 00:00:00 2001 From: ToxicKidz <78174417+ToxicKidz@users.noreply.github.com> Date: Mon, 12 Apr 2021 11:54:13 -0400 Subject: Remove reactions from everyone when paginating and waiting for trashcan reaction. (#1471) * Remove reactions from everyone * Make flake8 happy * Make flake8 happy again * Remove reactions in check functions * Make flake8 happy for the last time * Update bot/pagination.py Co-authored-by: Kieran Siek * Make create_task one line and return False in checks * Fix return so it returns either True or False * Use scheduling.create_task and suppress HTTPException * Suppress HTTPException in scheduling.create_task * Remove double if-statements Co-authored-by: Mark * change suppress_exceptions to suppressed_exceptions * Make suppressed_exceptions a kwargs for _log_task_exception * Update scheduling.create_task call to correspond with *args * Fix NameError: reaction, user -> reaction_, user_ * Update scheduling.create_task call to correspond with *args in messages.wait_for_deletion * reaction -> reaction_ * Ignore reactions from the bot * Fix type annotations for create_task * Refactor add_reaction check to a separate function * Name the remove_reaction task Co-authored-by: Kieran Siek Co-authored-by: Mark --- bot/pagination.py | 36 ++++++++------------------- bot/utils/messages.py | 66 +++++++++++++++++++++++++++++++++++++++---------- bot/utils/scheduling.py | 10 ++++---- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/bot/pagination.py b/bot/pagination.py index 3b16cc9ff..c5c84afd9 100644 --- a/bot/pagination.py +++ b/bot/pagination.py @@ -2,14 +2,14 @@ import asyncio import logging import typing as t from contextlib import suppress +from functools import partial import 
discord -from discord import Member from discord.abc import User from discord.ext.commands import Context, Paginator from bot import constants -from bot.constants import MODERATION_ROLES +from bot.utils import messages FIRST_EMOJI = "\u23EE" # [:track_previous:] LEFT_EMOJI = "\u2B05" # [:arrow_left:] @@ -220,29 +220,6 @@ class LinePaginator(Paginator): >>> embed.set_author(name="Some Operation", url=url, icon_url=icon) >>> await LinePaginator.paginate([line for line in lines], ctx, embed) """ - def event_check(reaction_: discord.Reaction, user_: discord.Member) -> bool: - """Make sure that this reaction is what we want to operate on.""" - no_restrictions = ( - # The reaction was by a whitelisted user - user_.id == restrict_to_user.id - # The reaction was by a moderator - or isinstance(user_, Member) and any(role.id in MODERATION_ROLES for role in user_.roles) - ) - - return ( - # Conditions for a successful pagination: - all(( - # Reaction is on this message - reaction_.message.id == message.id, - # Reaction is one of the pagination emotes - str(reaction_.emoji) in PAGINATION_EMOJI, - # Reaction was not made by the Bot - user_.id != ctx.bot.user.id, - # There were no restrictions - no_restrictions - )) - ) - paginator = cls(prefix=prefix, suffix=suffix, max_size=max_size, max_lines=max_lines, scale_to_size=scale_to_size) current_page = 0 @@ -303,9 +280,16 @@ class LinePaginator(Paginator): log.trace(f"Adding reaction: {repr(emoji)}") await message.add_reaction(emoji) + check = partial( + messages.reaction_check, + message_id=message.id, + allowed_emoji=PAGINATION_EMOJI, + allowed_users=(restrict_to_user.id,), + ) + while True: try: - reaction, user = await ctx.bot.wait_for("reaction_add", timeout=timeout, check=event_check) + reaction, user = await ctx.bot.wait_for("reaction_add", timeout=timeout, check=check) log.trace(f"Got reaction: {reaction}") except asyncio.TimeoutError: log.debug("Timed out waiting for a reaction") diff --git a/bot/utils/messages.py 
b/bot/utils/messages.py index 0bcaed43d..2beead6af 100644 --- a/bot/utils/messages.py +++ b/bot/utils/messages.py @@ -3,6 +3,7 @@ import contextlib import logging import random import re +from functools import partial from io import BytesIO from typing import List, Optional, Sequence, Union @@ -12,24 +13,66 @@ from discord.ext.commands import Context import bot from bot.constants import Emojis, MODERATION_ROLES, NEGATIVE_REPLIES +from bot.utils import scheduling log = logging.getLogger(__name__) +def reaction_check( + reaction: discord.Reaction, + user: discord.abc.User, + *, + message_id: int, + allowed_emoji: Sequence[str], + allowed_users: Sequence[int], + allow_mods: bool = True, +) -> bool: + """ + Check if a reaction's emoji and author are allowed and the message is `message_id`. + + If the user is not allowed, remove the reaction. Ignore reactions made by the bot. + If `allow_mods` is True, allow users with moderator roles even if they're not in `allowed_users`. + """ + right_reaction = ( + user != bot.instance.user + and reaction.message.id == message_id + and str(reaction.emoji) in allowed_emoji + ) + if not right_reaction: + return False + + is_moderator = ( + allow_mods + and any(role.id in MODERATION_ROLES for role in getattr(user, "roles", [])) + ) + + if user.id in allowed_users or is_moderator: + log.trace(f"Allowed reaction {reaction} by {user} on {reaction.message.id}.") + return True + else: + log.trace(f"Removing reaction {reaction} by {user} on {reaction.message.id}: disallowed user.") + scheduling.create_task( + reaction.message.remove_reaction(reaction.emoji, user), + HTTPException, # Suppress the HTTPException if adding the reaction fails + name=f"remove_reaction-{reaction}-{reaction.message.id}-{user}" + ) + return False + + async def wait_for_deletion( message: discord.Message, - user_ids: Sequence[discord.abc.Snowflake], + user_ids: Sequence[int], deletion_emojis: Sequence[str] = (Emojis.trashcan,), timeout: float = 60 * 5, attach_emojis: 
bool = True, - allow_moderation_roles: bool = True + allow_mods: bool = True ) -> None: """ Wait for up to `timeout` seconds for a reaction by any of the specified `user_ids` to delete the message. An `attach_emojis` bool may be specified to determine whether to attach the given `deletion_emojis` to the message in the given `context`. - An `allow_moderation_roles` bool may also be specified to allow anyone with a role in `MODERATION_ROLES` to delete + An `allow_mods` bool may also be specified to allow anyone with a role in `MODERATION_ROLES` to delete the message. """ if message.guild is None: @@ -43,16 +86,13 @@ async def wait_for_deletion( log.trace(f"Aborting wait_for_deletion: message {message.id} deleted prematurely.") return - def check(reaction: discord.Reaction, user: discord.Member) -> bool: - """Check that the deletion emoji is reacted by the appropriate user.""" - return ( - reaction.message.id == message.id - and str(reaction.emoji) in deletion_emojis - and ( - user.id in user_ids - or allow_moderation_roles and any(role.id in MODERATION_ROLES for role in user.roles) - ) - ) + check = partial( + reaction_check, + message_id=message.id, + allowed_emoji=deletion_emojis, + allowed_users=user_ids, + allow_mods=allow_mods, + ) with contextlib.suppress(asyncio.TimeoutError): await bot.instance.wait_for('reaction_add', check=check, timeout=timeout) diff --git a/bot/utils/scheduling.py b/bot/utils/scheduling.py index 6843bae88..2dc485f24 100644 --- a/bot/utils/scheduling.py +++ b/bot/utils/scheduling.py @@ -161,18 +161,18 @@ class Scheduler: self._log.error(f"Error in task #{task_id} {id(done_task)}!", exc_info=exception) -def create_task(*args, **kwargs) -> asyncio.Task: +def create_task(coro: t.Awaitable, *suppressed_exceptions: t.Type[Exception], **kwargs) -> asyncio.Task: """Wrapper for `asyncio.create_task` which logs exceptions raised in the task.""" - task = asyncio.create_task(*args, **kwargs) - task.add_done_callback(_log_task_exception) + task = 
asyncio.create_task(coro, **kwargs) + task.add_done_callback(partial(_log_task_exception, suppressed_exceptions=suppressed_exceptions)) return task -def _log_task_exception(task: asyncio.Task) -> None: +def _log_task_exception(task: asyncio.Task, *, suppressed_exceptions: t.Tuple[t.Type[Exception]]) -> None: """Retrieve and log the exception raised in `task` if one exists.""" with contextlib.suppress(asyncio.CancelledError): exception = task.exception() # Log the exception if one exists. - if exception: + if exception and not isinstance(exception, suppressed_exceptions): log = logging.getLogger(__name__) log.error(f"Error in task {task.get_name()} {id(task)}!", exc_info=exception) -- cgit v1.2.3