Generate fallback URLs for web archive links (#804)

* generate fallback web archive URL if none exists

* remove fallback web archive snapshot creation

* fix test
This commit is contained in:
Sascha Ißbrücker
2024-08-29 22:45:10 +02:00
committed by GitHub
parent 36a84276a2
commit 749bc1ef63
10 changed files with 53 additions and 336 deletions

View File

@@ -79,8 +79,6 @@ def import_netscape_html(
for batch in batches:
_import_batch(batch, user, options, tag_cache, result)
# Create snapshots for newly imported bookmarks
tasks.schedule_bookmarks_without_snapshots(user)
# Load favicons for newly imported bookmarks
tasks.schedule_bookmarks_without_favicons(user)
# Load previews for newly imported bookmarks

View File

@@ -12,9 +12,8 @@ from django.utils import timezone, formats
from huey import crontab
from huey.contrib.djhuey import HUEY as huey
from huey.exceptions import TaskLockedException
from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
from waybackpy.exceptions import WaybackError, TooManyRequestsError
import bookmarks.services.wayback
from bookmarks.models import Bookmark, BookmarkAsset, UserProfile
from bookmarks.services import favicon_loader, singlefile, preview_image_loader
from bookmarks.services.website_loader import DEFAULT_USER_AGENT
@@ -66,29 +65,6 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
_create_web_archive_snapshot_task(bookmark.id, force_update)
def _load_newest_snapshot(bookmark: Bookmark):
try:
logger.info(f"Load existing snapshot for bookmark. url={bookmark.url}")
cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(
bookmark.url
)
existing_snapshot = cdx_api.newest()
if existing_snapshot:
bookmark.web_archive_snapshot_url = existing_snapshot.archive_url
bookmark.save(update_fields=["web_archive_snapshot_url"])
logger.info(
f"Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}"
)
except NoCDXRecordFound:
logger.info(f"Could not find any snapshots for bookmark. url={bookmark.url}")
except WaybackError as error:
logger.error(
f"Failed to load existing snapshot. url={bookmark.url}", exc_info=error
)
def _create_snapshot(bookmark: Bookmark):
logger.info(f"Create new snapshot for bookmark. url={bookmark.url}...")
archive = waybackpy.WaybackMachineSaveAPI(
@@ -117,48 +93,27 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
return
except TooManyRequestsError:
logger.error(
f"Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}"
f"Failed to create snapshot due to rate limiting. url={bookmark.url}"
)
except WaybackError as error:
logger.error(
f"Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}",
f"Failed to create snapshot. url={bookmark.url}",
exc_info=error,
)
# Load the newest snapshot as fallback
_load_newest_snapshot(bookmark)
@task()
def _load_web_archive_snapshot_task(bookmark_id: int):
try:
bookmark = Bookmark.objects.get(id=bookmark_id)
except Bookmark.DoesNotExist:
return
# Skip if snapshot exists
if bookmark.web_archive_snapshot_url:
return
# Load the newest snapshot
_load_newest_snapshot(bookmark)
def schedule_bookmarks_without_snapshots(user: User):
if is_web_archive_integration_active(user):
_schedule_bookmarks_without_snapshots_task(user.id)
# Loading snapshots from CDX API has been removed, keeping the task function
# for now to prevent errors when huey tries to run the task
pass
@task()
def _schedule_bookmarks_without_snapshots_task(user_id: int):
user = get_user_model().objects.get(id=user_id)
bookmarks_without_snapshots = Bookmark.objects.filter(
web_archive_snapshot_url__exact="", owner=user
)
# TODO: Implement bulk task creation
for bookmark in bookmarks_without_snapshots:
# To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
# new ones when processing bookmarks in bulk
_load_web_archive_snapshot_task(bookmark.id)
# Loading snapshots from CDX API has been removed, keeping the task function
# for now to prevent errors when huey tries to run the task
pass
def is_favicon_feature_active(user: User) -> bool:

View File

@@ -1,42 +1,20 @@
import time
from typing import Dict
import datetime
import waybackpy
import waybackpy.utils
from waybackpy.exceptions import NoCDXRecordFound
from django.utils import timezone
class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
def generate_fallback_webarchive_url(
url: str, timestamp: datetime.datetime
) -> str | None:
"""
Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot.
See https://github.com/akamhy/waybackpy/issues/176
Generate a URL to the web archive for the given URL and timestamp.
A snapshot for the specific timestamp might not exist, in which case the
web archive will show the closest snapshot to the given timestamp.
If there is no snapshot at all the URL will be invalid.
"""
if not url:
return None
if not timestamp:
timestamp = timezone.now()
def newest(self):
unix_timestamp = int(time.time())
self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(
unix_timestamp
)
self.sort = "closest"
self.limit = -5
newest_snapshot = None
for snapshot in self.snapshots():
newest_snapshot = snapshot
break
if not newest_snapshot:
raise NoCDXRecordFound(
"Wayback Machine's CDX server did not return any records "
+ "for the query. The URL may not have any archives "
+ " on the Wayback Machine or the URL may have been recently "
+ "archived and is still not available on the CDX server."
)
return newest_snapshot
def add_payload(self, payload: Dict[str, str]) -> None:
super().add_payload(payload)
# Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest
# makes searching for latest snapshots faster
payload["fastLatest"] = "true"
return f"https://web.archive.org/web/{timestamp.strftime('%Y%m%d%H%M%S')}/{url}"