Limit document size for website scraper (#354)

Limits the size of scraped HTML documents to prevent out of memory errors. The scraper will stop reading from the response when it encounters the closing head tag, or if the read content's size exceeds a max limit.

Fixes #345
This commit is contained in:
Sascha Ißbrücker
2022-10-07 21:18:18 +02:00
committed by GitHub
parent 277c1c76e3
commit 2fd7704816
2 changed files with 80 additions and 2 deletions

View File

@@ -0,0 +1,53 @@
from unittest import mock
from bookmarks.services import website_loader
from django.test import TestCase
class MockStreamingResponse:
def __init__(self, num_chunks, chunk_size, insert_head_after_chunk=None):
self.chunks = []
for index in range(num_chunks):
chunk = ''.zfill(chunk_size)
self.chunks.append(chunk.encode('utf-8'))
if index == insert_head_after_chunk:
self.chunks.append('</head>'.encode('utf-8'))
def iter_content(self, **kwargs):
return self.chunks
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
pass
class WebsiteLoaderTestCase(TestCase):
def test_load_page_returns_content(self):
with mock.patch('requests.get') as mock_get:
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024)
content = website_loader.load_page('https://example.com')
expected_content_size = 10 * 1024
self.assertEqual(expected_content_size, len(content))
def test_load_page_limits_large_documents(self):
with mock.patch('requests.get') as mock_get:
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000)
content = website_loader.load_page('https://example.com')
# Should have read six chunks, after which content exceeds the max of 5MB
expected_content_size = 6 * 1024 * 1000
self.assertEqual(expected_content_size, len(content))
def test_load_page_stops_reading_at_closing_head_tag(self):
with mock.patch('requests.get') as mock_get:
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000,
insert_head_after_chunk=0)
content = website_loader.load_page('https://example.com')
# Should have read first chunk, and second chunk containing closing head tag
expected_content_size = 1 * 1024 * 1000 + len('</head>')
self.assertEqual(expected_content_size, len(content))