Cut the content returned by load_page right after the closing head tag, so
bytes that follow it in the last chunk are dropped. Adds a test with a
trailing byte that is not valid UTF-8, and renames the existing head-tag test.

NOTE(review): line structure, leading indentation and the '</head>' /
'<head>...</head>' string literals were reconstructed from a flattened copy
of this patch -- confirm against the repository before applying.

diff --git a/bookmarks/services/website_loader.py b/bookmarks/services/website_loader.py
index 3a89b8b..bd37c26 100644
--- a/bookmarks/services/website_loader.py
+++ b/bookmarks/services/website_loader.py
@@ -71,8 +71,10 @@ def load_page(url: str):
             logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
 
             # Stop reading if we have parsed end of head tag
-            if '</head>'.encode('utf-8') in content:
+            end_of_head = '</head>'.encode('utf-8')
+            if end_of_head in content:
                 logger.debug(f'Found closing head tag after {size} bytes')
+                content = content.split(end_of_head)[0] + end_of_head
                 break
             # Stop reading if we exceed limit
             if size > MAX_CONTENT_LIMIT:
diff --git a/bookmarks/tests/test_website_loader.py b/bookmarks/tests/test_website_loader.py
index eb10a7c..f775d0d 100644
--- a/bookmarks/tests/test_website_loader.py
+++ b/bookmarks/tests/test_website_loader.py
@@ -59,7 +59,7 @@ class WebsiteLoaderTestCase(TestCase):
             expected_content_size = 6 * 1024 * 1000
             self.assertEqual(expected_content_size, len(content))
 
-    def test_load_page_stops_reading_at_closing_head_tag(self):
+    def test_load_page_stops_reading_at_end_of_head(self):
         with mock.patch('requests.get') as mock_get:
             mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000,
                                                           insert_head_after_chunk=0)
@@ -69,6 +69,18 @@ class WebsiteLoaderTestCase(TestCase):
             expected_content_size = 1 * 1024 * 1000 + len('</head>')
             self.assertEqual(expected_content_size, len(content))
 
+    def test_load_page_removes_bytes_after_end_of_head(self):
+        with mock.patch('requests.get') as mock_get:
+            mock_response = MockStreamingResponse(num_chunks=1, chunk_size=0)
+            mock_response.chunks[0] = '<head>人</head>'.encode('utf-8')
+            # add a single byte that can't be decoded to utf-8
+            mock_response.chunks[0] += 0xff.to_bytes(1, 'big')
+            mock_get.return_value = mock_response
+            content = website_loader.load_page('https://example.com')
+
+            # verify that byte after head was removed, content parsed as utf-8
+            self.assertEqual(content, '<head>人</head>')
+
     def test_load_website_metadata(self):
         with mock.patch('bookmarks.services.website_loader.load_page') as mock_load_page:
             mock_load_page.return_value = self.render_html_document('test title', 'test description')