Generate fallback URLs for web archive links (#804)

* generate fallback web archive URL if none exists

* remove fallback web archive snapshot creation

* fix test
This commit is contained in:
Sascha Ißbrücker
2024-08-29 22:45:10 +02:00
committed by GitHub
parent 36a84276a2
commit 749bc1ef63
10 changed files with 53 additions and 336 deletions

View File

@@ -1,42 +1,20 @@
import time
from typing import Dict
import datetime
import waybackpy
import waybackpy.utils
from waybackpy.exceptions import NoCDXRecordFound
from django.utils import timezone
class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
def generate_fallback_webarchive_url(
url: str, timestamp: datetime.datetime
) -> str | None:
"""
Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot.
See https://github.com/akamhy/waybackpy/issues/176
Generate a URL to the web archive for the given URL and timestamp.
A snapshot for the specific timestamp might not exist, in which case the
web archive will show the closest snapshot to the given timestamp.
If there is no snapshot at all the URL will be invalid.
"""
if not url:
return None
if not timestamp:
timestamp = timezone.now()
def newest(self):
unix_timestamp = int(time.time())
self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(
unix_timestamp
)
self.sort = "closest"
self.limit = -5
newest_snapshot = None
for snapshot in self.snapshots():
newest_snapshot = snapshot
break
if not newest_snapshot:
raise NoCDXRecordFound(
"Wayback Machine's CDX server did not return any records "
+ "for the query. The URL may not have any archives "
+ " on the Wayback Machine or the URL may have been recently "
+ "archived and is still not available on the CDX server."
)
return newest_snapshot
def add_payload(self, payload: Dict[str, str]) -> None:
super().add_payload(payload)
# Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest
# makes searching for latest snapshots faster
payload["fastLatest"] = "true"
return f"https://web.archive.org/web/{timestamp.strftime('%Y%m%d%H%M%S')}/{url}"