Generate fallback URLs for web archive links (#804)

* generate fallback web archive URL if none exists
* remove fallback web archive snapshot creation
* fix test
2025-08-14 14:09:26 +02:00 · 2024-08-29 22:45:10 +02:00
parent 36a84276a2
commit 749bc1ef63
10 changed files with 53 additions and 336 deletions
--- a/bookmarks/services/importer.py
+++ b/bookmarks/services/importer.py
@@ -79,8 +79,6 @@ def import_netscape_html(
    for batch in batches:
        _import_batch(batch, user, options, tag_cache, result)

-    # Create snapshots for newly imported bookmarks
-    tasks.schedule_bookmarks_without_snapshots(user)
    # Load favicons for newly imported bookmarks
    tasks.schedule_bookmarks_without_favicons(user)
    # Load previews for newly imported bookmarks
--- a/bookmarks/services/tasks.py
+++ b/bookmarks/services/tasks.py
@@ -12,9 +12,8 @@ from django.utils import timezone, formats
 from huey import crontab
 from huey.contrib.djhuey import HUEY as huey
 from huey.exceptions import TaskLockedException
-from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
+from waybackpy.exceptions import WaybackError, TooManyRequestsError

-import bookmarks.services.wayback
 from bookmarks.models import Bookmark, BookmarkAsset, UserProfile
 from bookmarks.services import favicon_loader, singlefile, preview_image_loader
 from bookmarks.services.website_loader import DEFAULT_USER_AGENT
@@ -66,29 +65,6 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
        _create_web_archive_snapshot_task(bookmark.id, force_update)


-def _load_newest_snapshot(bookmark: Bookmark):
-    try:
-        logger.info(f"Load existing snapshot for bookmark. url={bookmark.url}")
-        cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(
-            bookmark.url
-        )
-        existing_snapshot = cdx_api.newest()
-
-        if existing_snapshot:
-            bookmark.web_archive_snapshot_url = existing_snapshot.archive_url
-            bookmark.save(update_fields=["web_archive_snapshot_url"])
-            logger.info(
-                f"Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}"
-            )
-
-    except NoCDXRecordFound:
-        logger.info(f"Could not find any snapshots for bookmark. url={bookmark.url}")
-    except WaybackError as error:
-        logger.error(
-            f"Failed to load existing snapshot. url={bookmark.url}", exc_info=error
-        )
-
-
 def _create_snapshot(bookmark: Bookmark):
    logger.info(f"Create new snapshot for bookmark. url={bookmark.url}...")
    archive = waybackpy.WaybackMachineSaveAPI(
@@ -117,48 +93,27 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
        return
    except TooManyRequestsError:
        logger.error(
-            f"Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}"
+            f"Failed to create snapshot due to rate limiting. url={bookmark.url}"
        )
    except WaybackError as error:
        logger.error(
-            f"Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}",
+            f"Failed to create snapshot. url={bookmark.url}",
            exc_info=error,
        )

-    # Load the newest snapshot as fallback
-    _load_newest_snapshot(bookmark)
-

@task()
 def _load_web_archive_snapshot_task(bookmark_id: int):
-    try:
-        bookmark = Bookmark.objects.get(id=bookmark_id)
-    except Bookmark.DoesNotExist:
-        return
-    # Skip if snapshot exists
-    if bookmark.web_archive_snapshot_url:
-        return
-    # Load the newest snapshot
-    _load_newest_snapshot(bookmark)
-
-
-def schedule_bookmarks_without_snapshots(user: User):
-    if is_web_archive_integration_active(user):
-        _schedule_bookmarks_without_snapshots_task(user.id)
+    # Loading snapshots from CDX API has been removed, keeping the task function
+    # for now to prevent errors when huey tries to run the task
+    pass


@task()
 def _schedule_bookmarks_without_snapshots_task(user_id: int):
-    user = get_user_model().objects.get(id=user_id)
-    bookmarks_without_snapshots = Bookmark.objects.filter(
-        web_archive_snapshot_url__exact="", owner=user
-    )
-
-    # TODO: Implement bulk task creation
-    for bookmark in bookmarks_without_snapshots:
-        # To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
-        # new ones when processing bookmarks in bulk
-        _load_web_archive_snapshot_task(bookmark.id)
+    # Loading snapshots from CDX API has been removed, keeping the task function
+    # for now to prevent errors when huey tries to run the task
+    pass


 def is_favicon_feature_active(user: User) -> bool:
--- a/bookmarks/services/wayback.py
+++ b/bookmarks/services/wayback.py
@@ -1,42 +1,20 @@
-import time
-from typing import Dict
+import datetime

-import waybackpy
-import waybackpy.utils
-from waybackpy.exceptions import NoCDXRecordFound
+from django.utils import timezone


-class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
+def generate_fallback_webarchive_url(
+    url: str, timestamp: datetime.datetime
+) -> str | None:
    """
-    Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot.
-    See https://github.com/akamhy/waybackpy/issues/176
+    Generate a URL to the web archive for the given URL and timestamp.
+    A snapshot for the specific timestamp might not exist, in which case the
+    web archive will show the closest snapshot to the given timestamp.
+    If there is no snapshot at all the URL will be invalid.
    """
+    if not url:
+        return None
+    if not timestamp:
+        timestamp = timezone.now()

-    def newest(self):
-        unix_timestamp = int(time.time())
-        self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(
-            unix_timestamp
-        )
-        self.sort = "closest"
-        self.limit = -5
-
-        newest_snapshot = None
-        for snapshot in self.snapshots():
-            newest_snapshot = snapshot
-            break
-
-        if not newest_snapshot:
-            raise NoCDXRecordFound(
-                "Wayback Machine's CDX server did not return any records "
-                + "for the query. The URL may not have any archives "
-                + " on the Wayback Machine or the URL may have been recently "
-                + "archived and is still not available on the CDX server."
-            )
-
-        return newest_snapshot
-
-    def add_payload(self, payload: Dict[str, str]) -> None:
-        super().add_payload(payload)
-        # Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest
-        # makes searching for latest snapshots faster
-        payload["fastLatest"] = "true"
+    return f"https://web.archive.org/web/{timestamp.strftime('%Y%m%d%H%M%S')}/{url}"