Add helper function for extracting attachment text.

Implement a somewhat arbitrary limit on how much attachment text is passed along for filtering, to avoid wasting compute time on large attachments that aren't intended to be read (such as CSVs).
author: Steele Farnsworth <[email protected]> 2025-01-29 18:51:18 -0500
committer: Steele Farnsworth <[email protected]> 2025-01-29 18:51:18 -0500
commit: 7d424b8d2d5b1445e753cb9fbdffb91346a641e9 (patch)
tree: e1d863f0d4a983c8e678dd748cc5ec0299516705
parent: Apply token filters to text attachment content. (diff)
1 files changed, 8 insertions, 1 deletions
diff --git a/bot/exts/filtering/filtering.py b/bot/exts/filtering/filtering.py
index a281aff79..f902ee9ec 100644
--- a/bot/exts/filtering/filtering.py
+++ b/bot/exts/filtering/filtering.py
@@ -67,6 +67,13 @@ OFFENSIVE_MSG_DELETE_TIME = datetime.timedelta(days=7)
 WEEKLY_REPORT_ISO_DAY = 3  # 1=Monday, 7=Sunday
 
 
+async def _extract_text_file_content(att: discord.Attachment) -> str:
+    """Return "<filename>: <text>", with the text limited to the first 30 lines and then to 2000 characters."""
+    file_lines: list[str] = (await att.read()).decode().splitlines()  # whole attachment is read; decode() uses UTF-8
+    first_n_lines = "\n".join(file_lines[:30])[:2_000]  # cap the line count first, then the character count
+    return f"{att.filename}: {first_n_lines}"
+
+
 class Filtering(Cog):
     """Filtering and alerting for content posted on the server."""
 
@@ -226,7 +233,7 @@ class Filtering(Cog):
         ctx = FilterContext.from_message(Event.MESSAGE, msg, None, self.message_cache)
 
         text_contents = [
-            f"{a.filename}: " + (await a.read()).decode()
+            await _extract_text_file_content(a)
             for a in msg.attachments if a.content_type.startswith("text")
         ]
         if text_contents:
author	Steele Farnsworth <[email protected]>	2025-01-29 18:51:18 -0500
committer	Steele Farnsworth <[email protected]>	2025-01-29 18:51:18 -0500
commit	7d424b8d2d5b1445e753cb9fbdffb91346a641e9 (patch)
tree	e1d863f0d4a983c8e678dd748cc5ec0299516705
parent	Apply token filters to text attachment content. (diff)