diff options
| author | 2025-01-29 18:51:18 -0500 | |
|---|---|---|
| committer | 2025-01-29 18:51:18 -0500 | |
| commit | 7d424b8d2d5b1445e753cb9fbdffb91346a641e9 (patch) | |
| tree | e1d863f0d4a983c8e678dd748cc5ec0299516705 | |
| parent | Apply token filters to text attachment content. (diff) | |
Add helper function for extracting attachment text.
Implements a somewhat arbitrary limit on how much text content is passed along for filtering, to avoid wasting compute time on large attachments that aren't intended to be read (such as CSVs).
Diffstat (limited to '')
| -rw-r--r-- | bot/exts/filtering/filtering.py | 9 |
1 files changed, 8 insertions, 1 deletions
```diff
diff --git a/bot/exts/filtering/filtering.py b/bot/exts/filtering/filtering.py
index a281aff79..f902ee9ec 100644
--- a/bot/exts/filtering/filtering.py
+++ b/bot/exts/filtering/filtering.py
@@ -67,6 +67,13 @@ OFFENSIVE_MSG_DELETE_TIME = datetime.timedelta(days=7)
 WEEKLY_REPORT_ISO_DAY = 3 # 1=Monday, 7=Sunday
 
 
+async def _extract_text_file_content(att: discord.Attachment) -> str:
+    """Extract up to the first 30 lines and first 2000 characters (whichever is shorter) of an attachment."""
+    file_lines: list[str] = (await att.read()).decode().splitlines()
+    first_n_lines = "\n".join(file_lines[:30])[:2_000]
+    return f"{att.filename}: {first_n_lines}"
+
+
 class Filtering(Cog):
     """Filtering and alerting for content posted on the server."""
 
@@ -226,7 +233,7 @@ class Filtering(Cog):
 
         ctx = FilterContext.from_message(Event.MESSAGE, msg, None, self.message_cache)
         text_contents = [
-            f"{a.filename}: " + (await a.read()).decode()
+            await _extract_text_file_content(a)
             for a in msg.attachments if a.content_type.startswith("text")
         ]
         if text_contents:
```