From 8756c741035d007a5d3f3309b877f56b9ccd0ef1 Mon Sep 17 00:00:00 2001 From: Numerlor <25886452+Numerlor@users.noreply.github.com> Date: Sun, 21 Jun 2020 00:59:32 +0200 Subject: Account for `NavigableString`s when gathering text. `find_next()` only goes to tags, leaving out text outside of them when parsing. --- bot/cogs/doc.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bot/cogs/doc.py b/bot/cogs/doc.py index 51323e64f..d64e6692f 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc.py @@ -11,7 +11,7 @@ from urllib.parse import urljoin import discord from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag +from bs4.element import NavigableString, PageElement, Tag from discord.errors import NotFound from discord.ext import commands from markdownify import MarkdownConverter @@ -377,7 +377,9 @@ class Doc(commands.Cog): tag_filter: Union[Tuple[str], Callable[[Tag], bool]] ) -> Optional[str]: """ - Get all text from

<p> elements until a tag matching `tag_filter` is found, max 1000 elements searched. + Get all text from <p>

elements and strings until a tag matching `tag_filter` is found. + + Max 1000 elements are searched to avoid going through whole pages when no matching tag is found. `tag_filter` can be either a tuple of string names to check against, or a filtering callable that's applied to the tags. @@ -389,7 +391,11 @@ class Doc(commands.Cog): if element is None: break - element = element.find_next() + element = element.next + while isinstance(element, NavigableString): + text += element + element = element.next + if element.name == "p": text += str(element) -- cgit v1.2.3