From 749bc1ef6331864709f05c577f76c04d820b5bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sascha=20I=C3=9Fbr=C3=BCcker?= Date: Thu, 29 Aug 2024 22:45:10 +0200 Subject: [PATCH] Generate fallback URLs for web archive links (#804) * generate fallback web archive URL if none exists * remove fallback web archive snapshot creation * fix test --- bookmarks/services/importer.py | 2 - bookmarks/services/tasks.py | 63 +------ bookmarks/services/wayback.py | 50 ++--- bookmarks/signals.py | 8 - bookmarks/tests/test_bookmark_index_view.py | 6 +- .../tests/test_bookmarks_list_template.py | 46 +++-- bookmarks/tests/test_bookmarks_tasks.py | 177 ------------------ bookmarks/tests/test_importer.py | 11 -- bookmarks/tests/test_signals.py | 17 -- bookmarks/views/partials/contexts.py | 9 +- 10 files changed, 53 insertions(+), 336 deletions(-) delete mode 100644 bookmarks/tests/test_signals.py diff --git a/bookmarks/services/importer.py b/bookmarks/services/importer.py index 5a35427..d4bbd5a 100644 --- a/bookmarks/services/importer.py +++ b/bookmarks/services/importer.py @@ -79,8 +79,6 @@ def import_netscape_html( for batch in batches: _import_batch(batch, user, options, tag_cache, result) - # Create snapshots for newly imported bookmarks - tasks.schedule_bookmarks_without_snapshots(user) # Load favicons for newly imported bookmarks tasks.schedule_bookmarks_without_favicons(user) # Load previews for newly imported bookmarks diff --git a/bookmarks/services/tasks.py b/bookmarks/services/tasks.py index a6de13f..a0655fe 100644 --- a/bookmarks/services/tasks.py +++ b/bookmarks/services/tasks.py @@ -12,9 +12,8 @@ from django.utils import timezone, formats from huey import crontab from huey.contrib.djhuey import HUEY as huey from huey.exceptions import TaskLockedException -from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound +from waybackpy.exceptions import WaybackError, TooManyRequestsError -import bookmarks.services.wayback from bookmarks.models import Bookmark, BookmarkAsset, UserProfile from bookmarks.services import favicon_loader, singlefile, preview_image_loader from bookmarks.services.website_loader import DEFAULT_USER_AGENT @@ -66,29 +65,6 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo _create_web_archive_snapshot_task(bookmark.id, force_update) -def _load_newest_snapshot(bookmark: Bookmark): - try: - logger.info(f"Load existing snapshot for bookmark. url={bookmark.url}") - cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI( - bookmark.url - ) - existing_snapshot = cdx_api.newest() - - if existing_snapshot: - bookmark.web_archive_snapshot_url = existing_snapshot.archive_url - bookmark.save(update_fields=["web_archive_snapshot_url"]) - logger.info( - f"Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}" - ) - - except NoCDXRecordFound: - logger.info(f"Could not find any snapshots for bookmark. url={bookmark.url}") - except WaybackError as error: - logger.error( - f"Failed to load existing snapshot. url={bookmark.url}", exc_info=error - ) - - def _create_snapshot(bookmark: Bookmark): logger.info(f"Create new snapshot for bookmark. url={bookmark.url}...") archive = waybackpy.WaybackMachineSaveAPI( @@ -117,48 +93,27 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool): return except TooManyRequestsError: logger.error( - f"Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}" + f"Failed to create snapshot due to rate limiting. url={bookmark.url}" ) except WaybackError as error: logger.error( - f"Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}", + f"Failed to create snapshot. url={bookmark.url}", exc_info=error, ) - # Load the newest snapshot as fallback - _load_newest_snapshot(bookmark) - @task() def _load_web_archive_snapshot_task(bookmark_id: int): - try: - bookmark = Bookmark.objects.get(id=bookmark_id) - except Bookmark.DoesNotExist: - return - # Skip if snapshot exists - if bookmark.web_archive_snapshot_url: - return - # Load the newest snapshot - _load_newest_snapshot(bookmark) - - -def schedule_bookmarks_without_snapshots(user: User): - if is_web_archive_integration_active(user): - _schedule_bookmarks_without_snapshots_task(user.id) + # Loading snapshots from CDX API has been removed, keeping the task function + # for now to prevent errors when huey tries to run the task + pass @task() def _schedule_bookmarks_without_snapshots_task(user_id: int): - user = get_user_model().objects.get(id=user_id) - bookmarks_without_snapshots = Bookmark.objects.filter( - web_archive_snapshot_url__exact="", owner=user - ) - - # TODO: Implement bulk task creation - for bookmark in bookmarks_without_snapshots: - # To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating - # new ones when processing bookmarks in bulk - _load_web_archive_snapshot_task(bookmark.id) + # Loading snapshots from CDX API has been removed, keeping the task function + # for now to prevent errors when huey tries to run the task + pass def is_favicon_feature_active(user: User) -> bool: diff --git a/bookmarks/services/wayback.py b/bookmarks/services/wayback.py index b527b07..aaeb99d 100644 --- a/bookmarks/services/wayback.py +++ b/bookmarks/services/wayback.py @@ -1,42 +1,20 @@ -import time -from typing import Dict +import datetime -import waybackpy -import waybackpy.utils -from waybackpy.exceptions import NoCDXRecordFound +from django.utils import timezone -class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI): +def generate_fallback_webarchive_url( + url: str, timestamp: datetime.datetime +) -> str | None: """ - Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot. - See https://github.com/akamhy/waybackpy/issues/176 + Generate a URL to the web archive for the given URL and timestamp. + A snapshot for the specific timestamp might not exist, in which case the + web archive will show the closest snapshot to the given timestamp. + If there is no snapshot at all the URL will be invalid. """ + if not url: + return None + if not timestamp: + timestamp = timezone.now() - def newest(self): - unix_timestamp = int(time.time()) - self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp( - unix_timestamp - ) - self.sort = "closest" - self.limit = -5 - - newest_snapshot = None - for snapshot in self.snapshots(): - newest_snapshot = snapshot - break - - if not newest_snapshot: - raise NoCDXRecordFound( - "Wayback Machine's CDX server did not return any records " - + "for the query. The URL may not have any archives " - + " on the Wayback Machine or the URL may have been recently " - + "archived and is still not available on the CDX server." - ) - - return newest_snapshot - - def add_payload(self, payload: Dict[str, str]) -> None: - super().add_payload(payload) - # Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest - # makes searching for latest snapshots faster - payload["fastLatest"] = "true" + return f"https://web.archive.org/web/{timestamp.strftime('%Y%m%d%H%M%S')}/{url}" diff --git a/bookmarks/signals.py b/bookmarks/signals.py index 8507c84..e8ca82a 100644 --- a/bookmarks/signals.py +++ b/bookmarks/signals.py @@ -1,15 +1,7 @@ from django.conf import settings -from django.contrib.auth import user_logged_in from django.db.backends.signals import connection_created from django.dispatch import receiver -from bookmarks.services import tasks - - -@receiver(user_logged_in) -def user_logged_in(sender, request, user, **kwargs): - tasks.schedule_bookmarks_without_snapshots(user) - @receiver(connection_created) def extend_sqlite(connection=None, **kwargs): diff --git a/bookmarks/tests/test_bookmark_index_view.py b/bookmarks/tests/test_bookmark_index_view.py index f7be4a1..608f281 100644 --- a/bookmarks/tests/test_bookmark_index_view.py +++ b/bookmarks/tests/test_bookmark_index_view.py @@ -6,11 +6,7 @@ from django.test import TestCase from django.urls import reverse from bookmarks.models import Bookmark, BookmarkSearch, Tag, UserProfile -from bookmarks.tests.helpers import ( - BookmarkFactoryMixin, - HtmlTestMixin, - collapse_whitespace, -) +from bookmarks.tests.helpers import BookmarkFactoryMixin, HtmlTestMixin class BookmarkIndexViewTestCase(TestCase, BookmarkFactoryMixin, HtmlTestMixin): diff --git a/bookmarks/tests/test_bookmarks_list_template.py b/bookmarks/tests/test_bookmarks_list_template.py index dc7522e..a8658f4 100644 --- a/bookmarks/tests/test_bookmarks_list_template.py +++ b/bookmarks/tests/test_bookmarks_list_template.py @@ -1,3 +1,4 @@ +import datetime from typing import Type from dateutil.relativedelta import relativedelta @@ -36,15 +37,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin): html, ) - def assertDateLabel(self, html: str, label_content: str): - self.assertInHTML( - f""" - {label_content} - | - """, - html, - ) - def assertWebArchiveLink( self, html: str, label_content: str, url: str, link_target: str = "_blank" ): @@ -465,15 +457,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin): style = bookmark_list["style"] self.assertIn("--ld-bookmark-description-max-lines:3;", style) - def test_should_respect_absolute_date_setting(self): - bookmark = self.setup_date_format_test( - UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE - ) - html = self.render_template() - formatted_date = formats.date_format(bookmark.date_added, "SHORT_DATE_FORMAT") - - self.assertDateLabel(html, formatted_date) - def test_should_render_web_archive_link_with_absolute_date_setting(self): bookmark = self.setup_date_format_test( UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE, @@ -486,12 +469,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin): html, formatted_date, bookmark.web_archive_snapshot_url ) - def test_should_respect_relative_date_setting(self): - self.setup_date_format_test(UserProfile.BOOKMARK_DATE_DISPLAY_RELATIVE) - html = self.render_template() - - self.assertDateLabel(html, "1 week ago") - def test_should_render_web_archive_link_with_relative_date_setting(self): bookmark = self.setup_date_format_test( UserProfile.BOOKMARK_DATE_DISPLAY_RELATIVE, @@ -501,6 +478,27 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin): self.assertWebArchiveLink(html, "1 week ago", bookmark.web_archive_snapshot_url) + def test_should_render_generated_web_archive_link_without_saved_snapshot_url(self): + user = self.get_or_create_test_user() + user.profile.bookmark_date_display = UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE + user.profile.save() + + date_added = timezone.datetime( + 2023, 8, 11, 21, 45, 11, tzinfo=datetime.timezone.utc + ) + bookmark = self.setup_bookmark( + url="https://example.com/article", added=date_added + ) + + html = self.render_template() + formatted_date = formats.date_format(bookmark.date_added, "SHORT_DATE_FORMAT") + + self.assertWebArchiveLink( + html, + formatted_date, + "https://web.archive.org/web/20230811214511/https://example.com/article", + ) + def test_bookmark_link_target_should_be_blank_by_default(self): bookmark = self.setup_bookmark() html = self.render_template() diff --git a/bookmarks/tests/test_bookmarks_tasks.py b/bookmarks/tests/test_bookmarks_tasks.py index ade2532..25a42c9 100644 --- a/bookmarks/tests/test_bookmarks_tasks.py +++ b/bookmarks/tests/test_bookmarks_tasks.py @@ -1,6 +1,4 @@ -import datetime import os.path -from dataclasses import dataclass from unittest import mock import waybackpy @@ -10,8 +8,6 @@ from django.test import TestCase, override_settings from huey.contrib.djhuey import HUEY as huey from waybackpy.exceptions import WaybackError -import bookmarks.services.favicon_loader -import bookmarks.services.wayback from bookmarks.models import BookmarkAsset, UserProfile from bookmarks.services import tasks, singlefile from bookmarks.tests.helpers import BookmarkFactoryMixin @@ -29,12 +25,6 @@ def create_wayback_machine_save_api_mock( return mock_api -@dataclass -class MockCdxSnapshot: - archive_url: str - datetime_timestamp: datetime.datetime - - class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): def setUp(self): @@ -50,17 +40,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): ) self.mock_save_api_patcher.start() - self.mock_cdx_api = mock.Mock() - self.mock_cdx_api.newest.return_value = MockCdxSnapshot( - "https://example.com/newest_snapshot", datetime.datetime.now() - ) - self.mock_cdx_api_patcher = mock.patch.object( - bookmarks.services.wayback, - "CustomWaybackMachineCDXServerAPI", - return_value=self.mock_cdx_api, - ) - self.mock_cdx_api_patcher.start() - self.mock_load_favicon_patcher = mock.patch( "bookmarks.services.favicon_loader.load_favicon" ) @@ -90,7 +69,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): def tearDown(self): self.mock_save_api_patcher.stop() - self.mock_cdx_api_patcher.stop() self.mock_load_favicon_patcher.stop() self.mock_singlefile_create_snapshot_patcher.stop() self.mock_load_preview_image_patcher.stop() @@ -142,45 +120,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): self.assertEqual(bookmark.web_archive_snapshot_url, "https://other.com") - def test_create_web_archive_snapshot_should_use_newest_snapshot_as_fallback(self): - bookmark = self.setup_bookmark() - self.mock_save_api.save.side_effect = WaybackError - - tasks.create_web_archive_snapshot( - self.get_or_create_test_user(), bookmark, False - ) - - bookmark.refresh_from_db() - self.mock_cdx_api.newest.assert_called_once() - self.assertEqual( - "https://example.com/newest_snapshot", - bookmark.web_archive_snapshot_url, - ) - - def test_create_web_archive_snapshot_should_ignore_missing_newest_snapshot(self): - bookmark = self.setup_bookmark() - self.mock_save_api.save.side_effect = WaybackError - self.mock_cdx_api.newest.return_value = None - - tasks.create_web_archive_snapshot( - self.get_or_create_test_user(), bookmark, False - ) - - bookmark.refresh_from_db() - self.assertEqual("", bookmark.web_archive_snapshot_url) - - def test_create_web_archive_snapshot_should_ignore_newest_snapshot_errors(self): - bookmark = self.setup_bookmark() - self.mock_save_api.save.side_effect = WaybackError - self.mock_cdx_api.newest.side_effect = WaybackError - - tasks.create_web_archive_snapshot( - self.get_or_create_test_user(), bookmark, False - ) - - bookmark.refresh_from_db() - self.assertEqual("", bookmark.web_archive_snapshot_url) - def test_create_web_archive_snapshot_should_not_save_stale_bookmark_data(self): bookmark = self.setup_bookmark() @@ -203,69 +142,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): bookmark.web_archive_snapshot_url, ) - def test_load_web_archive_snapshot_should_update_snapshot_url(self): - bookmark = self.setup_bookmark() - - tasks._load_web_archive_snapshot_task(bookmark.id) - - bookmark.refresh_from_db() - - self.assertEqual(self.executed_count(), 1) - self.mock_cdx_api.newest.assert_called_once() - self.assertEqual( - "https://example.com/newest_snapshot", bookmark.web_archive_snapshot_url - ) - - def test_load_web_archive_snapshot_should_handle_missing_bookmark_id(self): - tasks._load_web_archive_snapshot_task(123) - - self.assertEqual(self.executed_count(), 1) - self.mock_cdx_api.newest.assert_not_called() - - def test_load_web_archive_snapshot_should_skip_if_snapshot_exists(self): - bookmark = self.setup_bookmark(web_archive_snapshot_url="https://example.com") - - tasks._load_web_archive_snapshot_task(bookmark.id) - - self.assertEqual(self.executed_count(), 1) - self.mock_cdx_api.newest.assert_not_called() - - def test_load_web_archive_snapshot_should_handle_missing_snapshot(self): - bookmark = self.setup_bookmark() - self.mock_cdx_api.newest.return_value = None - - tasks._load_web_archive_snapshot_task(bookmark.id) - - self.assertEqual("", bookmark.web_archive_snapshot_url) - - def test_load_web_archive_snapshot_should_handle_wayback_errors(self): - bookmark = self.setup_bookmark() - self.mock_cdx_api.newest.side_effect = WaybackError - - tasks._load_web_archive_snapshot_task(bookmark.id) - - self.assertEqual("", bookmark.web_archive_snapshot_url) - - def test_load_web_archive_snapshot_should_not_save_stale_bookmark_data(self): - bookmark = self.setup_bookmark() - - # update bookmark during API call to check that saving - # the snapshot does not overwrite updated bookmark data - def mock_newest_impl(): - bookmark.title = "Updated title" - bookmark.save() - return mock.DEFAULT - - self.mock_cdx_api.newest.side_effect = mock_newest_impl - - tasks._load_web_archive_snapshot_task(bookmark.id) - bookmark.refresh_from_db() - - self.assertEqual("Updated title", bookmark.title) - self.assertEqual( - "https://example.com/newest_snapshot", bookmark.web_archive_snapshot_url - ) - @override_settings(LD_DISABLE_BACKGROUND_TASKS=True) def test_create_web_archive_snapshot_should_not_run_when_background_tasks_are_disabled( self, @@ -292,59 +168,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): self.assertEqual(self.executed_count(), 0) - def test_schedule_bookmarks_without_snapshots_should_load_snapshot_for_all_bookmarks_without_snapshot( - self, - ): - user = self.get_or_create_test_user() - self.setup_bookmark() - self.setup_bookmark() - self.setup_bookmark() - self.setup_bookmark(web_archive_snapshot_url="https://example.com") - self.setup_bookmark(web_archive_snapshot_url="https://example.com") - self.setup_bookmark(web_archive_snapshot_url="https://example.com") - - tasks.schedule_bookmarks_without_snapshots(user) - - self.assertEqual(self.executed_count(), 4) - self.assertEqual(self.mock_cdx_api.newest.call_count, 3) - - def test_schedule_bookmarks_without_snapshots_should_only_update_user_owned_bookmarks( - self, - ): - user = self.get_or_create_test_user() - other_user = User.objects.create_user( - "otheruser", "otheruser@example.com", "password123" - ) - self.setup_bookmark() - self.setup_bookmark() - self.setup_bookmark() - self.setup_bookmark(user=other_user) - self.setup_bookmark(user=other_user) - self.setup_bookmark(user=other_user) - - tasks.schedule_bookmarks_without_snapshots(user) - - self.assertEqual(self.mock_cdx_api.newest.call_count, 3) - - @override_settings(LD_DISABLE_BACKGROUND_TASKS=True) - def test_schedule_bookmarks_without_snapshots_should_not_run_when_background_tasks_are_disabled( - self, - ): - tasks.schedule_bookmarks_without_snapshots(self.user) - - self.assertEqual(self.executed_count(), 0) - - def test_schedule_bookmarks_without_snapshots_should_not_run_when_web_archive_integration_is_disabled( - self, - ): - self.user.profile.web_archive_integration = ( - UserProfile.WEB_ARCHIVE_INTEGRATION_DISABLED - ) - self.user.profile.save() - tasks.schedule_bookmarks_without_snapshots(self.user) - - self.assertEqual(self.executed_count(), 0) - def test_load_favicon_should_create_favicon_file(self): bookmark = self.setup_bookmark() diff --git a/bookmarks/tests/test_importer.py b/bookmarks/tests/test_importer.py index 163b9ad..56b97ce 100644 --- a/bookmarks/tests/test_importer.py +++ b/bookmarks/tests/test_importer.py @@ -444,17 +444,6 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin, ImportTestMixin): self.assertEqual(Bookmark.objects.all()[0].description, "Example description") self.assertEqual(Bookmark.objects.all()[0].notes, "Updated notes") - def test_schedule_snapshot_creation(self): - user = self.get_or_create_test_user() - test_html = self.render_html(tags_html="") - - with patch.object( - tasks, "schedule_bookmarks_without_snapshots" - ) as mock_schedule_bookmarks_without_snapshots: - import_netscape_html(test_html, user) - - mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user) - def test_schedule_favicon_loading(self): user = self.get_or_create_test_user() test_html = self.render_html(tags_html="") diff --git a/bookmarks/tests/test_signals.py b/bookmarks/tests/test_signals.py deleted file mode 100644 index d86a717..0000000 --- a/bookmarks/tests/test_signals.py +++ /dev/null @@ -1,17 +0,0 @@ -from unittest.mock import patch - -from django.test import TestCase - -from bookmarks.services import tasks -from bookmarks.tests.helpers import BookmarkFactoryMixin - - -class SignalsTestCase(TestCase, BookmarkFactoryMixin): - def test_login_should_schedule_snapshot_creation(self): - user = self.get_or_create_test_user() - - with patch.object( - tasks, "schedule_bookmarks_without_snapshots" - ) as mock_schedule_bookmarks_without_snapshots: - self.client.force_login(user) - mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user) diff --git a/bookmarks/views/partials/contexts.py b/bookmarks/views/partials/contexts.py index f1412b9..871faab 100644 --- a/bookmarks/views/partials/contexts.py +++ b/bookmarks/views/partials/contexts.py @@ -1,12 +1,12 @@ +import re import urllib.parse from typing import Set, List -import re +from django.conf import settings from django.core.handlers.wsgi import WSGIRequest from django.core.paginator import Paginator from django.db import models from django.urls import reverse -from django.conf import settings from bookmarks import queries from bookmarks import utils @@ -18,6 +18,7 @@ from bookmarks.models import ( UserProfile, Tag, ) +from bookmarks.services.wayback import generate_fallback_webarchive_url DEFAULT_PAGE_SIZE = 30 CJK_RE = re.compile(r"[\u4e00-\u9fff]+") @@ -144,6 +145,10 @@ class BookmarkItem: self.notes = bookmark.notes self.tag_names = bookmark.tag_names self.web_archive_snapshot_url = bookmark.web_archive_snapshot_url + if not self.web_archive_snapshot_url: + self.web_archive_snapshot_url = generate_fallback_webarchive_url( + bookmark.url, bookmark.date_added + ) self.favicon_file = bookmark.favicon_file self.preview_image_file = bookmark.preview_image_file self.is_archived = bookmark.is_archived