Create snapshots on web.archive.org for bookmarks (#150)

* Implement initial background tasks concept

* fix property reference

* update requirements.txt

* simplify bookmark null check

* improve web archive url display

* add background tasks test

* add basic supervisor setup

* schedule missing snapshot creation on login

* remove task locks and clear task history before starting background task processor

* batch create snapshots after import

* fix script reference in supervisord.conf

* add option to disable background tasks

* restructure feature overview
This commit is contained in:
Sascha Ißbrücker
2021-09-04 22:31:04 +02:00
committed by GitHub
parent 8d214649b7
commit d87dde6bae
27 changed files with 470 additions and 19 deletions

View File

@@ -3,3 +3,7 @@ from django.apps import AppConfig
class BookmarksConfig(AppConfig):
name = 'bookmarks'
def ready(self):
# Register signal handlers
import bookmarks.signals

View File

@@ -0,0 +1,15 @@
from background_task.models import Task, CompletedTask
from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Remove task locks and clear completed task history"
def handle(self, *args, **options):
# Remove task locks
# If the background task processor exited while executing tasks, these tasks would still be marked as locked,
# even though no process is working on them, and would prevent the task processor from picking the next task in
# the queue
Task.objects.all().update(locked_by=None, locked_at=None)
# Clear task history to prevent them from bloating the DB
CompletedTask.objects.all().delete()

View File

@@ -0,0 +1,18 @@
# Generated by Django 2.2.20 on 2021-05-16 14:35
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('bookmarks', '0008_userprofile_bookmark_date_display'),
]
operations = [
migrations.AddField(
model_name='bookmark',
name='web_archive_snapshot_url',
field=models.CharField(blank=True, max_length=2048),
),
]

View File

@@ -41,6 +41,7 @@ class Bookmark(models.Model):
description = models.TextField(blank=True)
website_title = models.CharField(max_length=512, blank=True, null=True)
website_description = models.TextField(blank=True, null=True)
web_archive_snapshot_url = models.CharField(max_length=2048, blank=True)
unread = models.BooleanField(default=True)
is_archived = models.BooleanField(default=False)
date_added = models.DateTimeField()

View File

@@ -6,6 +6,7 @@ from django.utils import timezone
from bookmarks.models import Bookmark, parse_tag_string
from bookmarks.services.tags import get_or_create_tags
from bookmarks.services.website_loader import load_website_metadata
from bookmarks.services import tasks
def create_bookmark(bookmark: Bookmark, tag_string: str, current_user: User):
@@ -27,10 +28,16 @@ def create_bookmark(bookmark: Bookmark, tag_string: str, current_user: User):
# Update tag list
_update_bookmark_tags(bookmark, tag_string, current_user)
bookmark.save()
# Create snapshot on web archive
tasks.create_web_archive_snapshot(bookmark.id, False)
return bookmark
def update_bookmark(bookmark: Bookmark, tag_string, current_user: User):
# Detect URL change
original_bookmark = Bookmark.objects.get(id=bookmark.id)
has_url_changed = original_bookmark.url != bookmark.url
# Update website info
_update_website_metadata(bookmark)
# Update tag list
@@ -38,6 +45,10 @@ def update_bookmark(bookmark: Bookmark, tag_string, current_user: User):
# Update dates
bookmark.date_modified = timezone.now()
bookmark.save()
# Update web archive snapshot, if URL changed
if has_url_changed:
tasks.create_web_archive_snapshot(bookmark.id, True)
return bookmark

View File

@@ -5,6 +5,7 @@ from django.contrib.auth.models import User
from django.utils import timezone
from bookmarks.models import Bookmark, parse_tag_string
from bookmarks.services import tasks
from bookmarks.services.parser import parse, NetscapeBookmark
from bookmarks.services.tags import get_or_create_tags
from bookmarks.utils import parse_timestamp
@@ -38,6 +39,9 @@ def import_netscape_html(html: str, user: User):
logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
result.failed = result.failed + 1
# Create snapshots for newly imported bookmarks
tasks.schedule_bookmarks_without_snapshots(user.id)
return result

View File

@@ -0,0 +1,62 @@
import logging
import waybackpy
from background_task import background
from django.conf import settings
from django.contrib.auth import get_user_model
from waybackpy.exceptions import WaybackError
from bookmarks.models import Bookmark
logger = logging.getLogger(__name__)
def when_background_tasks_enabled(fn):
def wrapper(*args, **kwargs):
if settings.LD_DISABLE_BACKGROUND_TASKS:
return
return fn(*args, **kwargs)
# Expose attributes from wrapped TaskProxy function
attrs = vars(fn)
for key, value in attrs.items():
setattr(wrapper, key, value)
return wrapper
@when_background_tasks_enabled
@background()
def create_web_archive_snapshot(bookmark_id: int, force_update: bool):
try:
bookmark = Bookmark.objects.get(id=bookmark_id)
except Bookmark.DoesNotExist:
return
# Skip if snapshot exists and update is not explicitly requested
if bookmark.web_archive_snapshot_url and not force_update:
return
logger.debug(f'Create web archive link for bookmark: {bookmark}...')
wayback = waybackpy.Url(bookmark.url)
try:
archive = wayback.save()
except WaybackError as error:
logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error)
raise
bookmark.web_archive_snapshot_url = archive.archive_url
bookmark.save()
logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...')
@when_background_tasks_enabled
@background()
def schedule_bookmarks_without_snapshots(user_id: int):
user = get_user_model().objects.get(id=user_id)
bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user)
for bookmark in bookmarks_without_snapshots:
create_web_archive_snapshot(bookmark.id, False)

8
bookmarks/signals.py Normal file
View File

@@ -0,0 +1,8 @@
from django.contrib.auth import user_logged_in
from django.dispatch import receiver
from bookmarks.services import tasks
@receiver(user_logged_in)
def user_logged_in(sender, request, user, **kwargs):
tasks.schedule_bookmarks_without_snapshots(user.id)

View File

@@ -54,6 +54,10 @@ ul.bookmark-list {
margin-right: 0.1rem;
}
.actions .date-label a {
color: $gray-color;
}
.actions .btn-link {
color: $gray-color;
padding: 0;

View File

@@ -27,11 +27,31 @@
</div>
<div class="actions">
{% if request.user.profile.bookmark_date_display == 'relative' %}
<span class="text-gray text-sm">{{ bookmark.date_added|humanize_relative_date }}</span>
<span class="date-label text-gray text-sm">
{% if bookmark.web_archive_snapshot_url %}
<a href="{{ bookmark.web_archive_snapshot_url }}"
title="Show snapshot on web archive" target="_blank" rel="noopener">
{% endif %}
<span>{{ bookmark.date_added|humanize_relative_date }}</span>
{% if bookmark.web_archive_snapshot_url %}
<span></span>
</a>
{% endif %}
</span>
<span class="text-gray text-sm">|</span>
{% endif %}
{% if request.user.profile.bookmark_date_display == 'absolute' %}
<span class="text-gray text-sm">{{ bookmark.date_added|humanize_absolute_date }}</span>
<span class="date-label text-gray text-sm">
{% if bookmark.web_archive_snapshot_url %}
<a href="{{ bookmark.web_archive_snapshot_url }}"
title="Show snapshot on web archive" target="_blank" rel="noopener">
{% endif %}
<span>{{ bookmark.date_added|humanize_absolute_date }}</span>
{% if bookmark.web_archive_snapshot_url %}
<span></span>
</a>
{% endif %}
</span>
<span class="text-gray text-sm">|</span>
{% endif %}
<a href="{% url 'bookmarks:edit' bookmark.id %}?return_url={{ return_url }}"

View File

@@ -28,6 +28,7 @@ class BookmarkFactoryMixin:
description: str = '',
website_title: str = '',
website_description: str = '',
web_archive_snapshot_url: str = '',
):
if tags is None:
tags = []
@@ -45,7 +46,8 @@ class BookmarkFactoryMixin:
date_added=timezone.now(),
date_modified=timezone.now(),
owner=user,
is_archived=is_archived
is_archived=is_archived,
web_archive_snapshot_url=web_archive_snapshot_url,
)
bookmark.save()
for tag in tags:

View File

@@ -24,9 +24,10 @@ class BookmarkListTagTest(TestCase, BookmarkFactoryMixin):
)
return template_to_render.render(context)
def setup_date_format_test(self, date_display_setting):
def setup_date_format_test(self, date_display_setting: str, web_archive_url: str = ''):
bookmark = self.setup_bookmark()
bookmark.date_added = timezone.now() - relativedelta(days=8)
bookmark.web_archive_snapshot_url = web_archive_url
bookmark.save()
user = self.get_or_create_test_user()
user.profile.bookmark_date_display = date_display_setting
@@ -39,7 +40,27 @@ class BookmarkListTagTest(TestCase, BookmarkFactoryMixin):
formatted_date = formats.date_format(bookmark.date_added, 'SHORT_DATE_FORMAT')
self.assertInHTML(f'''
<span class="text-gray text-sm">{formatted_date}</span>
<span class="date-label text-gray text-sm">
<span>{formatted_date}</span>
</span>
<span class="text-gray text-sm">|</span>
''', html)
def test_should_render_web_archive_link_with_absolute_date_setting(self):
bookmark = self.setup_date_format_test(UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE,
'https://web.archive.org/web/20210811214511/https://wanikani.com/')
html = self.render_template([bookmark])
formatted_date = formats.date_format(bookmark.date_added, 'SHORT_DATE_FORMAT')
self.assertInHTML(f'''
<span class="date-label text-gray text-sm">
<a href="{bookmark.web_archive_snapshot_url}"
title="Show snapshot on web archive" target="_blank" rel="noopener">
<span>{formatted_date}</span>
<span>∞</span>
</a>
</span>
<span class="text-gray text-sm">|</span>
''', html)
def test_should_respect_relative_date_setting(self):
@@ -47,5 +68,23 @@ class BookmarkListTagTest(TestCase, BookmarkFactoryMixin):
html = self.render_template([bookmark])
self.assertInHTML('''
<span class="text-gray text-sm">1 week ago</span>
<span class="date-label text-gray text-sm">
<span>1 week ago</span>
</span>
<span class="text-gray text-sm">|</span>
''', html)
def test_should_render_web_archive_link_with_relative_date_setting(self):
bookmark = self.setup_date_format_test(UserProfile.BOOKMARK_DATE_DISPLAY_RELATIVE,
'https://web.archive.org/web/20210811214511/https://wanikani.com/')
html = self.render_template([bookmark])
self.assertInHTML(f'''
<span class="date-label text-gray text-sm">
<a href="{bookmark.web_archive_snapshot_url}"
title="Show snapshot on web archive" target="_blank" rel="noopener">
<span>1 week ago</span>
<span>∞</span>
</a>
</span>
<span class="text-gray text-sm">|</span>
''', html)

View File

@@ -1,11 +1,14 @@
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.test import TestCase
from django.utils import timezone
from bookmarks.models import Bookmark, Tag
from bookmarks.services.bookmarks import archive_bookmark, archive_bookmarks, unarchive_bookmark, unarchive_bookmarks, \
delete_bookmarks, tag_bookmarks, untag_bookmarks
from bookmarks.services.bookmarks import create_bookmark, update_bookmark, archive_bookmark, archive_bookmarks, \
unarchive_bookmark, unarchive_bookmarks, delete_bookmarks, tag_bookmarks, untag_bookmarks
from bookmarks.tests.helpers import BookmarkFactoryMixin
from bookmarks.services import tasks
User = get_user_model()
@@ -13,7 +16,30 @@ User = get_user_model()
class BookmarkServiceTestCase(TestCase, BookmarkFactoryMixin):
def setUp(self) -> None:
self.user = User.objects.create_user('testuser', 'test@example.com', 'password123')
self.get_or_create_test_user()
def test_create_should_create_web_archive_snapshot(self):
with patch.object(tasks, 'create_web_archive_snapshot') as mock_create_web_archive_snapshot:
bookmark_data = Bookmark(url='https://example.com')
bookmark = create_bookmark(bookmark_data, 'tag1 tag2', self.user)
mock_create_web_archive_snapshot.assert_called_once_with(bookmark.id, False)
def test_update_should_create_web_archive_snapshot_if_url_did_change(self):
with patch.object(tasks, 'create_web_archive_snapshot') as mock_create_web_archive_snapshot:
bookmark = self.setup_bookmark()
bookmark.url = 'https://example.com/updated'
update_bookmark(bookmark, 'tag1 tag2', self.user)
mock_create_web_archive_snapshot.assert_called_once_with(bookmark.id, True)
def test_update_should_not_create_web_archive_snapshot_if_url_did_not_change(self):
with patch.object(tasks, 'create_web_archive_snapshot') as mock_create_web_archive_snapshot:
bookmark = self.setup_bookmark()
bookmark.title = 'updated title'
update_bookmark(bookmark, 'tag1 tag2', self.user)
mock_create_web_archive_snapshot.assert_not_called()
def test_archive_bookmark(self):
bookmark = Bookmark(

View File

@@ -0,0 +1,154 @@
from unittest.mock import patch
import waybackpy
from background_task.models import Task
from django.contrib.auth.models import User
from django.test import TestCase, override_settings
from bookmarks.models import Bookmark
from bookmarks.services.tasks import create_web_archive_snapshot, schedule_bookmarks_without_snapshots
from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging
class MockWaybackUrl:
def __init__(self, archive_url: str):
self.archive_url = archive_url
def save(self):
return self
class MockWaybackUrlWithSaveError:
def save(self):
raise NotImplementedError
class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
@disable_logging
def run_pending_task(self, task_function):
func = getattr(task_function, 'task_function', None)
task = Task.objects.all()[0]
args, kwargs = task.params()
func(*args, **kwargs)
task.delete()
@disable_logging
def run_all_pending_tasks(self, task_function):
func = getattr(task_function, 'task_function', None)
tasks = Task.objects.all()
for task in tasks:
args, kwargs = task.params()
func(*args, **kwargs)
task.delete()
def test_create_web_archive_snapshot_should_update_snapshot_url(self):
bookmark = self.setup_bookmark()
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://example.com')):
create_web_archive_snapshot(bookmark.id, False)
self.run_pending_task(create_web_archive_snapshot)
bookmark.refresh_from_db()
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
def test_create_web_archive_snapshot_should_handle_missing_bookmark_id(self):
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://example.com')) as mock_wayback_url:
create_web_archive_snapshot(123, False)
self.run_pending_task(create_web_archive_snapshot)
mock_wayback_url.assert_not_called()
def test_create_web_archive_snapshot_should_handle_wayback_save_error(self):
bookmark = self.setup_bookmark()
with patch.object(waybackpy, 'Url',
return_value=MockWaybackUrlWithSaveError()):
with self.assertRaises(NotImplementedError):
create_web_archive_snapshot(bookmark.id, False)
self.run_pending_task(create_web_archive_snapshot)
def test_create_web_archive_snapshot_should_skip_if_snapshot_exists(self):
bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com')
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://other.com')):
create_web_archive_snapshot(bookmark.id, False)
self.run_pending_task(create_web_archive_snapshot)
bookmark.refresh_from_db()
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
def test_create_web_archive_snapshot_should_force_update_snapshot(self):
bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com')
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://other.com')):
create_web_archive_snapshot(bookmark.id, True)
self.run_pending_task(create_web_archive_snapshot)
bookmark.refresh_from_db()
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://other.com')
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_create_web_archive_snapshot_should_not_run_when_background_tasks_are_disabled(self):
bookmark = self.setup_bookmark()
create_web_archive_snapshot(bookmark.id, False)
self.assertEqual(Task.objects.count(), 0)
def test_schedule_bookmarks_without_snapshots_should_create_snapshot_task_for_all_bookmarks_without_snapshot(self):
user = self.get_or_create_test_user()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://example.com')):
schedule_bookmarks_without_snapshots(user.id)
self.run_pending_task(schedule_bookmarks_without_snapshots)
self.run_all_pending_tasks(create_web_archive_snapshot)
for bookmark in Bookmark.objects.all():
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
def test_schedule_bookmarks_without_snapshots_should_not_update_bookmarks_with_existing_snapshot(self):
user = self.get_or_create_test_user()
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://other.com')):
schedule_bookmarks_without_snapshots(user.id)
self.run_pending_task(schedule_bookmarks_without_snapshots)
self.run_all_pending_tasks(create_web_archive_snapshot)
for bookmark in Bookmark.objects.all():
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
def test_schedule_bookmarks_without_snapshots_should_only_update_user_owned_bookmarks(self):
user = self.get_or_create_test_user()
other_user = User.objects.create_user('otheruser', 'otheruser@example.com', 'password123')
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
with patch.object(waybackpy, 'Url', return_value=MockWaybackUrl('https://example.com')):
schedule_bookmarks_without_snapshots(user.id)
self.run_pending_task(schedule_bookmarks_without_snapshots)
self.run_all_pending_tasks(create_web_archive_snapshot)
for bookmark in Bookmark.objects.all().filter(owner=user):
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
for bookmark in Bookmark.objects.all().filter(owner=other_user):
self.assertEqual(bookmark.web_archive_snapshot_url, '')
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_schedule_bookmarks_without_snapshots_should_not_run_when_background_tasks_are_disabled(self):
user = self.get_or_create_test_user()
schedule_bookmarks_without_snapshots(user.id)
self.assertEqual(Task.objects.count(), 0)

View File

@@ -1,5 +1,8 @@
from unittest.mock import patch
from django.test import TestCase
from bookmarks.services import tasks
from bookmarks.services.importer import import_netscape_html
from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging
@@ -31,3 +34,12 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin):
import_result = import_netscape_html(test_html, self.get_or_create_test_user())
self.assertEqual(import_result.success, 0)
def test_schedule_snapshot_creation(self):
user = self.get_or_create_test_user()
test_html = self.create_import_html('')
with patch.object(tasks, 'schedule_bookmarks_without_snapshots') as mock_schedule_bookmarks_without_snapshots:
import_netscape_html(test_html, user)
mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user.id)

View File

@@ -0,0 +1,15 @@
from unittest.mock import patch
from django.test import TestCase
from bookmarks.services import tasks
from bookmarks.tests.helpers import BookmarkFactoryMixin
class SignalsTestCase(TestCase, BookmarkFactoryMixin):
def test_login_should_schedule_snapshot_creation(self):
user = self.get_or_create_test_user()
with patch.object(tasks, 'schedule_bookmarks_without_snapshots') as mock_schedule_bookmarks_without_snapshots:
self.client.force_login(user)
mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user.id)