diff options
author | 2025-01-29 18:51:18 -0500 | |
---|---|---|
committer | 2025-01-29 18:51:18 -0500 | |
commit | 7d424b8d2d5b1445e753cb9fbdffb91346a641e9 (patch) | |
tree | e1d863f0d4a983c8e678dd748cc5ec0299516705 | |
parent | Apply token filters to text attachment content. (diff) |
Add helper function for extracting attachment text.
Implements a somewhat arbitrary limit (the first 30 lines, truncated to 2,000 characters) on how much text content is passed along for filtering, to avoid wasting compute time on large attachments that aren't intended to be read (such as CSVs).
-rw-r--r-- | bot/exts/filtering/filtering.py | 9 |
1 files changed, 8 insertions, 1 deletions
diff --git a/bot/exts/filtering/filtering.py b/bot/exts/filtering/filtering.py index a281aff79..f902ee9ec 100644 --- a/bot/exts/filtering/filtering.py +++ b/bot/exts/filtering/filtering.py @@ -67,6 +67,13 @@ OFFENSIVE_MSG_DELETE_TIME = datetime.timedelta(days=7) WEEKLY_REPORT_ISO_DAY = 3 # 1=Monday, 7=Sunday +async def _extract_text_file_content(att: discord.Attachment) -> str: + """Extract up to the first 30 lines and first 2000 characters (whichever is shorter) of an attachment.""" + file_lines: list[str] = (await att.read()).decode().splitlines() + first_n_lines = "\n".join(file_lines[:30])[:2_000] + return f"{att.filename}: {first_n_lines}" + + class Filtering(Cog): """Filtering and alerting for content posted on the server.""" @@ -226,7 +233,7 @@ class Filtering(Cog): ctx = FilterContext.from_message(Event.MESSAGE, msg, None, self.message_cache) text_contents = [ - f"{a.filename}: " + (await a.read()).decode() + await _extract_text_file_content(a) for a in msg.attachments if a.content_type.startswith("text") ] if text_contents: |