Archive snapshots of websites locally (#672)

* Add basic HTML snapshots

* Implement asset list

* Add snapshot creation tests

* Add deletion tests

* Show file size

* Remove snapshots

* Create new snapshots

* Switch to single-file

* CSS tweak

* Remove auto refresh

* Show delete link when there is no file yet

* Add current date to display name

* Add flag for snapshot support

* Add option for disabling automatic snapshots

* Make snapshots sharable

* Document image variants

* Update README.md

* Add migrations

* Fix tests
This commit is contained in:
Sascha Ißbrücker
2024-04-01 15:19:38 +02:00
committed by GitHub
parent db1906942a
commit 4280ab40c6
46 changed files with 1603 additions and 240 deletions

View File

@@ -34,6 +34,9 @@ def create_bookmark(bookmark: Bookmark, tag_string: str, current_user: User):
tasks.create_web_archive_snapshot(current_user, bookmark, False)
# Load favicon
tasks.load_favicon(current_user, bookmark)
# Create HTML snapshot
if current_user.profile.enable_automatic_html_snapshots:
tasks.create_html_snapshot(bookmark)
return bookmark

View File

@@ -0,0 +1,32 @@
import gzip
import shutil
import subprocess
import os
from django.conf import settings
class MonolithError(Exception):
pass
# Monolith isn't used at the moment, as the local snapshot implementation
# switched to single-file after the prototype. Keeping this around in case
# it turns out to be useful in the future.
def create_snapshot(url: str, filepath: str):
monolith_path = settings.LD_MONOLITH_PATH
monolith_options = settings.LD_MONOLITH_OPTIONS
temp_filepath = filepath + ".tmp"
try:
command = f"{monolith_path} '{url}' {monolith_options} -o {temp_filepath}"
subprocess.run(command, check=True, shell=True)
with open(temp_filepath, "rb") as raw_file, gzip.open(
filepath, "wb"
) as gz_file:
shutil.copyfileobj(raw_file, gz_file)
os.remove(temp_filepath)
except subprocess.CalledProcessError as error:
raise MonolithError(f"Failed to create snapshot: {error.stderr}")

View File

@@ -0,0 +1,33 @@
import gzip
import os
import shutil
import subprocess
from django.conf import settings
class SingeFileError(Exception):
pass
def create_snapshot(url: str, filepath: str):
singlefile_path = settings.LD_SINGLEFILE_PATH
singlefile_options = settings.LD_SINGLEFILE_OPTIONS
temp_filepath = filepath + ".tmp"
try:
command = f"{singlefile_path} '{url}' {singlefile_options} {temp_filepath}"
subprocess.run(command, check=True, shell=True)
# single-file doesn't return exit codes apparently, so check if the file was created
if not os.path.exists(temp_filepath):
raise SingeFileError("Failed to create snapshot")
with open(temp_filepath, "rb") as raw_file, gzip.open(
filepath, "wb"
) as gz_file:
shutil.copyfileobj(raw_file, gz_file)
os.remove(temp_filepath)
except subprocess.CalledProcessError as error:
raise SingeFileError(f"Failed to create snapshot: {error.stderr}")

View File

@@ -1,4 +1,5 @@
import logging
import os
import waybackpy
from background_task import background
@@ -7,10 +8,11 @@ from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
from django.utils import timezone, formats
import bookmarks.services.wayback
from bookmarks.models import Bookmark, UserProfile
from bookmarks.services import favicon_loader
from bookmarks.models import Bookmark, BookmarkAsset, UserProfile
from bookmarks.services import favicon_loader, singlefile
from bookmarks.services.website_loader import DEFAULT_USER_AGENT
logger = logging.getLogger(__name__)
@@ -193,3 +195,64 @@ def _schedule_refresh_favicons_task(user_id: int):
tasks.append(task)
Task.objects.bulk_create(tasks)
def is_html_snapshot_feature_active() -> bool:
return settings.LD_ENABLE_SNAPSHOTS and not settings.LD_DISABLE_BACKGROUND_TASKS
def create_html_snapshot(bookmark: Bookmark):
if not is_html_snapshot_feature_active():
return
timestamp = formats.date_format(timezone.now(), "SHORT_DATE_FORMAT")
asset = BookmarkAsset(
bookmark=bookmark,
asset_type=BookmarkAsset.TYPE_SNAPSHOT,
content_type="text/html",
display_name=f"HTML snapshot from {timestamp}",
status=BookmarkAsset.STATUS_PENDING,
)
asset.save()
_create_html_snapshot_task(asset.id)
def _generate_snapshot_filename(asset: BookmarkAsset) -> str:
def sanitize_char(char):
if char.isalnum() or char in ("-", "_", "."):
return char
else:
return "_"
formatted_datetime = asset.date_created.strftime("%Y-%m-%d_%H%M%S")
sanitized_url = "".join(sanitize_char(char) for char in asset.bookmark.url)
return f"{asset.asset_type}_{formatted_datetime}_{sanitized_url}.html.gz"
@background()
def _create_html_snapshot_task(asset_id: int):
try:
asset = BookmarkAsset.objects.get(id=asset_id)
except BookmarkAsset.DoesNotExist:
return
logger.info(f"Create HTML snapshot for bookmark. url={asset.bookmark.url}")
try:
filename = _generate_snapshot_filename(asset)
filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
singlefile.create_snapshot(asset.bookmark.url, filepath)
asset.status = BookmarkAsset.STATUS_COMPLETE
asset.file = filename
asset.gzip = True
logger.info(
f"Successfully created HTML snapshot for bookmark. url={asset.bookmark.url}"
)
except singlefile.SingeFileError as error:
asset.status = BookmarkAsset.STATUS_FAILURE
logger.error(
f"Failed to create HTML snapshot for bookmark. url={asset.bookmark.url}",
exc_info=error,
)
asset.save()