Limit document size for website scraper (#354)

Limits the size of scraped HTML documents to prevent out-of-memory errors. The scraper stops reading from the response as soon as it encounters the closing head tag, or once the amount of content read exceeds a maximum limit.

Fixes #345
Sascha Ißbrücker
2022-10-07 21:18:18 +02:00
committed by GitHub
parent 277c1c76e3
commit 2fd7704816
2 changed files with 80 additions and 2 deletions


@@ -1,9 +1,12 @@
import logging
from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_bytes

logger = logging.getLogger(__name__)


@dataclass
class WebsiteMetadata:
@@ -33,15 +36,37 @@ def load_website_metadata(url: str):
    return WebsiteMetadata(url=url, title=title, description=description)


+CHUNK_SIZE = 50 * 1024
+MAX_CONTENT_LIMIT = 5000 * 1024


 def load_page(url: str):
     headers = fake_request_headers()
-    r = requests.get(url, timeout=10, headers=headers)
+    size = 0
+    content = None
+    # Use with to ensure request gets closed even if it's only read partially
+    with requests.get(url, timeout=10, headers=headers, stream=True) as r:
+        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
+            size += len(chunk)
+
+            if content is None:
+                content = chunk
+            else:
+                content = content + chunk
+
+            # Stop reading if we have parsed end of head tag
+            if '</head>'.encode('utf-8') in content:
+                logger.debug(f'Found closing head tag after {size} bytes')
+                break
+            # Stop reading if we exceed limit
+            if size > MAX_CONTENT_LIMIT:
+                logger.debug(f'Cancel reading document after {size} bytes')
+                break

     # Use charset_normalizer to determine encoding that best matches the response content
     # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
     # This is different from Response.text which does respect the encoding specified in the response first,
     # before trying to determine one
-    results = from_bytes(r.content)
+    results = from_bytes(content or '')
     return str(results.best())
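
For illustration, here is a minimal, self-contained sketch of the same cutoff behavior run against a fake streaming response. It is not the test file added in this commit; FakeStreamingResponse and read_until_head_or_limit are hypothetical names used only for this example, and the loop mirrors the accumulation logic in load_page above.

from io import BytesIO

CHUNK_SIZE = 50 * 1024
MAX_CONTENT_LIMIT = 5000 * 1024


class FakeStreamingResponse:
    # Hypothetical stand-in for a streamed requests.Response; yields the body in fixed-size chunks
    def __init__(self, body: bytes):
        self.body = body

    def iter_content(self, chunk_size):
        stream = BytesIO(self.body)
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            yield chunk


def read_until_head_or_limit(response, chunk_size=CHUNK_SIZE, limit=MAX_CONTENT_LIMIT):
    # Mirrors the accumulation loop in load_page: stop at the closing head tag or at the size limit
    size = 0
    content = b''
    for chunk in response.iter_content(chunk_size=chunk_size):
        size += len(chunk)
        content += chunk
        if b'</head>' in content:
            break
        if size > limit:
            break
    return content


# A document with a small head and a huge body is cut off shortly after
# the closing head tag instead of being read in full.
html = b'<html><head><title>Example</title></head>' + b'x' * (10 * 1024 * 1024)
content = read_until_head_or_limit(FakeStreamingResponse(html), chunk_size=1024)
assert b'</head>' in content
assert len(content) < len(html)

With a 1 KB chunk size the closing head tag already appears in the first chunk, so only a small fraction of the 10 MB document is ever held in memory; without the early break the whole body would be accumulated.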