Improve import performance (#261)

* Run import in batches, cache tags

* Use bulk operations for bookmarks and assigning tags

* Improve naming

* Restore bookmark validation

* Add logging

* Bulk create tags

* Use HTMLParser for parsing bookmarks

* Add parser tests

* Add more importer tests

* Add more importer tests

* Remove pyparsing dependency

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
Sascha Ißbrücker
2022-05-21 09:27:30 +02:00
committed by GitHub
parent 117160ea87
commit f4e3d724f0
8 changed files with 571 additions and 103 deletions

View File

@@ -1,5 +1,7 @@
import random
import logging
from dataclasses import dataclass
from typing import Optional, List
from django.contrib.auth.models import User
from django.utils import timezone
@@ -87,6 +89,42 @@ class LinkdingApiTestCase(APITestCase):
return response
# eq=False keeps the identity-based __eq__/__hash__ that the previous
# hand-written class had; the decorator only generates __init__ and __repr__.
@dataclass(eq=False)
class BookmarkHtmlTag:
    """Raw string values describing one bookmark entry used to build import HTML.

    Every field defaults to the empty string. The generated ``__repr__`` makes
    failing test assertions show the field values instead of an object address.
    """

    href: str = ''
    title: str = ''
    description: str = ''
    add_date: str = ''
    tags: str = ''
class ImportTestMixin:
    """Mixin with helpers that render Netscape bookmark HTML for import tests."""

    def render_tag(self, tag: BookmarkHtmlTag):
        """Render a single ``<DT><A>`` entry (plus optional ``<DD>``) for ``tag``.

        Attributes, title and description whose value is falsy are left out of
        the markup; the surrounding line breaks are always emitted.
        """
        href_attr = f'HREF="{tag.href}"' if tag.href else ''
        add_date_attr = f'ADD_DATE="{tag.add_date}"' if tag.add_date else ''
        tags_attr = f'TAGS="{tag.tags}"' if tag.tags else ''
        title_text = tag.title or ''
        description_html = f'<DD>{tag.description}' if tag.description else ''
        return f'''
<DT>
<A {href_attr}
{add_date_attr}
{tags_attr}>
{title_text}
</A>
{description_html}
'''

    def render_html(self, tags: List[BookmarkHtmlTag] = None, tags_html: str = ''):
        """Wrap bookmark markup in a Netscape bookmark file skeleton.

        :param tags: when non-empty, each item is rendered with ``render_tag``
            and the results (joined by newlines) replace ``tags_html``.
        :param tags_html: pre-rendered markup used when ``tags`` is empty or omitted.
        :return: the complete document as a string.
        """
        if tags:
            tags_html = '\n'.join(map(self.render_tag, tags))
        return f'''
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
{tags_html}
</DL><p>
'''
_words = [
'quasi',
'consequatur',

View File

@@ -1,29 +1,204 @@
from typing import List
from unittest.mock import patch
from django.test import TestCase
from django.test import TestCase, override_settings
from django.utils import timezone
from bookmarks.models import Tag
from bookmarks.models import Bookmark, Tag, parse_tag_string
from bookmarks.services import tasks
from bookmarks.services.importer import import_netscape_html
from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging
from bookmarks.tests.helpers import BookmarkFactoryMixin, ImportTestMixin, BookmarkHtmlTag, disable_logging
from bookmarks.utils import parse_timestamp
class ImporterTestCase(TestCase, BookmarkFactoryMixin):
class ImporterTestCase(TestCase, BookmarkFactoryMixin, ImportTestMixin):
def create_import_html(self, bookmark_tags_string: str):
    """Return a Netscape bookmark document whose ``<DL>`` body is ``bookmark_tags_string``."""
    document = f'''
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
{bookmark_tags_string}
</DL><p>
'''
    return document
def assertBookmarksImported(self, html_tags: List[BookmarkHtmlTag]):
    """Assert that each entry of ``html_tags`` is stored as a matching bookmark.

    For every tag the bookmark is looked up by URL and its title, description
    and ``date_added`` are compared with the tag's values (``add_date`` goes
    through ``parse_timestamp``). Every name produced by ``parse_tag_string``
    must be among the tags assigned to the bookmark.
    """
    for html_tag in html_tags:
        # QuerySet.get() raises instead of returning None when nothing matches,
        # so a missing bookmark already fails the test on this line.
        bookmark = Bookmark.objects.get(url=html_tag.href)
        self.assertEqual(bookmark.title, html_tag.title)
        self.assertEqual(bookmark.description, html_tag.description)
        self.assertEqual(bookmark.date_added, parse_timestamp(html_tag.add_date))

        # Check assigned tags: load the assigned names once rather than
        # re-evaluating the relation for every expected tag name.
        assigned_tag_names = {tag.name for tag in bookmark.tags.all()}
        for tag_name in parse_tag_string(html_tag.tags):
            self.assertIn(tag_name, assigned_tag_names)
def test_import(self):
    """Importing three valid entries stores all three and reports no failures."""
    example = BookmarkHtmlTag(href='https://example.com', title='Example title',
                              description='Example description', add_date='1', tags='example-tag')
    foo = BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                          add_date='2', tags='')
    bar = BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
                          add_date='3', tags='bar-tag, other-tag')
    html_tags = [example, foo, bar]

    import_html = self.render_html(tags=html_tags)
    result = import_netscape_html(import_html, self.get_or_create_test_user())

    # Reported counters: total / success / failed
    self.assertEqual((result.total, result.success, result.failed), (3, 3, 0))

    # Stored bookmarks
    self.assertEqual(len(Bookmark.objects.all()), 3)
    self.assertBookmarksImported(html_tags)
def test_synchronize(self):
    """A second import with changed values updates stored bookmarks and adds the new one."""
    # First pass: three bookmarks
    initial_tags = [
        BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                        add_date='1', tags='example-tag'),
        BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                        add_date='2', tags=''),
        BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
                        add_date='3', tags='bar-tag, other-tag'),
    ]
    initial_html = self.render_html(tags=initial_tags)
    import_netscape_html(initial_html, self.get_or_create_test_user())

    # Second pass: same URLs with new values, plus one additional bookmark
    updated_tags = [
        BookmarkHtmlTag(href='https://example.com', title='Updated Example title',
                        description='Updated Example description', add_date='111', tags='updated-example-tag'),
        BookmarkHtmlTag(href='https://foo.com', title='Updated Foo title', description='Updated Foo description',
                        add_date='222', tags='new-tag'),
        BookmarkHtmlTag(href='https://bar.com', title='Updated Bar title', description='Updated Bar description',
                        add_date='333', tags='updated-bar-tag, updated-other-tag'),
        BookmarkHtmlTag(href='https://baz.com', add_date='444', tags='baz-tag'),
    ]
    updated_html = self.render_html(tags=updated_tags)
    result = import_netscape_html(updated_html, self.get_or_create_test_user())

    # Reported counters: total / success / failed
    self.assertEqual((result.total, result.success, result.failed), (4, 4, 0))

    # Stored bookmarks reflect the second import
    self.assertEqual(len(Bookmark.objects.all()), 4)
    self.assertBookmarksImported(updated_tags)
def test_import_with_some_invalid_bookmarks(self):
    """Entries with an invalid or missing URL are counted as failed; the valid one is stored."""
    html_tags = [
        BookmarkHtmlTag(href='https://example.com'),
        # Invalid URL
        BookmarkHtmlTag(href='foo.com'),
        # No URL
        BookmarkHtmlTag(),
    ]
    import_html = self.render_html(tags=html_tags)
    result = import_netscape_html(import_html, self.get_or_create_test_user())

    # Check result
    self.assertEqual(result.total, 3)
    self.assertEqual(result.success, 1)
    self.assertEqual(result.failed, 2)

    # Check bookmarks
    bookmarks = Bookmark.objects.all()
    self.assertEqual(len(bookmarks), 1)
    # The previous check passed html_tags[1:1] -- an empty slice -- to
    # assertBookmarksImported, so nothing was verified. Compare the URL of the
    # single stored bookmark directly; the valid entry has no add_date, so it
    # is not routed through the timestamp comparison of assertBookmarksImported.
    self.assertEqual(bookmarks[0].url, html_tags[0].href)
def test_import_tags(self):
    """Three bookmarks with one distinct tag each result in three stored tags."""
    entries = [
        ('https://example.com', 'tag1'),
        ('https://foo.com', 'tag2'),
        ('https://bar.com', 'tag3'),
    ]
    html_tags = [BookmarkHtmlTag(href=url, tags=tag_name) for url, tag_name in entries]

    import_html = self.render_html(tags=html_tags)
    import_netscape_html(import_html, self.get_or_create_test_user())

    self.assertEqual(Tag.objects.count(), 3)
def test_create_missing_tags(self):
    """Re-importing with one additional tagged bookmark leaves four stored tags."""
    entries = [
        ('https://example.com', 'tag1'),
        ('https://foo.com', 'tag2'),
        ('https://bar.com', 'tag3'),
    ]
    html_tags = [BookmarkHtmlTag(href=url, tags=tag_name) for url, tag_name in entries]
    first_html = self.render_html(tags=html_tags)
    import_netscape_html(first_html, self.get_or_create_test_user())

    # Second import: the same three entries plus one with a tag not seen before
    html_tags.append(BookmarkHtmlTag(href='https://baz.com', tags='tag4'))
    second_html = self.render_html(tags=html_tags)
    import_netscape_html(second_html, self.get_or_create_test_user())

    self.assertEqual(Tag.objects.count(), 4)
def test_should_append_tags_to_bookmark_when_reimporting_with_different_tags(self):
    """Re-importing the same URL with other tags keeps one bookmark carrying all three tags."""
    url = 'https://example.com'
    html_tags = [BookmarkHtmlTag(href=url, tags='tag1')]
    first_html = self.render_html(tags=html_tags)
    import_netscape_html(first_html, self.get_or_create_test_user())

    # The second document contains the original entry and a second entry for the same URL
    html_tags.append(BookmarkHtmlTag(href=url, tags='tag2, tag3'))
    second_html = self.render_html(tags=html_tags)
    import_netscape_html(second_html, self.get_or_create_test_user())

    self.assertEqual(Bookmark.objects.count(), 1)
    stored_bookmark = Bookmark.objects.all()[0]
    self.assertEqual(stored_bookmark.tags.all().count(), 3)
@override_settings(USE_TZ=False)
def test_use_current_date_when_no_add_date(self):
    """An entry without ADD_DATE gets the (patched) current time as ``date_added``."""
    import_html = self.render_html(tags_html=f'''
<DT><A HREF="https://example.com">Example.com</A>
<DD>Example.com
''')

    # Freeze timezone.now so the stored date can be compared exactly
    frozen_now = timezone.datetime(2021, 1, 1)
    with patch.object(timezone, 'now', return_value=frozen_now):
        import_netscape_html(import_html, self.get_or_create_test_user())

        self.assertEqual(Bookmark.objects.count(), 1)
        stored_bookmark = Bookmark.objects.all()[0]
        self.assertEqual(stored_bookmark.date_added, timezone.datetime(2021, 1, 1))
def test_keep_title_if_imported_bookmark_has_empty_title(self):
    """Re-importing a URL without a title leaves the previously stored title in place."""
    with_title = [BookmarkHtmlTag(href='https://example.com', title='Example.com')]
    first_html = self.render_html(tags=with_title)
    import_netscape_html(first_html, self.get_or_create_test_user())

    without_title = [BookmarkHtmlTag(href='https://example.com')]
    second_html = self.render_html(tags=without_title)
    import_netscape_html(second_html, self.get_or_create_test_user())

    self.assertEqual(Bookmark.objects.count(), 1)
    stored_bookmark = Bookmark.objects.all()[0]
    self.assertEqual(stored_bookmark.title, 'Example.com')
def test_keep_description_if_imported_bookmark_has_empty_description(self):
    """Re-importing a URL without a description leaves the stored description in place."""
    with_description = [BookmarkHtmlTag(href='https://example.com', description='Example.com')]
    first_html = self.render_html(tags=with_description)
    import_netscape_html(first_html, self.get_or_create_test_user())

    without_description = [BookmarkHtmlTag(href='https://example.com')]
    second_html = self.render_html(tags=without_description)
    import_netscape_html(second_html, self.get_or_create_test_user())

    self.assertEqual(Bookmark.objects.count(), 1)
    stored_bookmark = Bookmark.objects.all()[0]
    self.assertEqual(stored_bookmark.description, 'Example.com')
def test_replace_whitespace_in_tag_names(self):
test_html = self.create_import_html(f'''
<DT><A HREF="https://example.com" ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag 1, tag 2, tag 3">Example.com</A>
test_html = self.render_html(tags_html=f'''
<DT><A HREF="https://example.com" TAGS="tag 1, tag 2, tag 3">Example.com</A>
<DD>Example.com
''')
import_netscape_html(test_html, self.get_or_create_test_user())
@@ -35,22 +210,22 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin):
@disable_logging
def test_validate_empty_or_missing_bookmark_url(self):
test_html = self.create_import_html(f'''
<!-- Empty URL -->
<DT><A HREF="" ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag3">Empty URL</A>
test_html = self.render_html(tags_html=f'''
<DT><A HREF="">Empty URL</A>
<DD>Empty URL
<!-- Missing URL -->
<DT><A ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag3">Missing URL</A>
<DT><A>Missing URL</A>
<DD>Missing URL
''')
import_result = import_netscape_html(test_html, self.get_or_create_test_user())
self.assertEqual(Bookmark.objects.count(), 0)
self.assertEqual(import_result.success, 0)
self.assertEqual(import_result.failed, 2)
def test_schedule_snapshot_creation(self):
user = self.get_or_create_test_user()
test_html = self.create_import_html('')
test_html = self.render_html(tags_html='')
with patch.object(tasks, 'schedule_bookmarks_without_snapshots') as mock_schedule_bookmarks_without_snapshots:
import_netscape_html(test_html, user)

View File

@@ -0,0 +1,122 @@
from typing import List
from django.test import TestCase
from bookmarks.services.parser import NetscapeBookmark
from bookmarks.services.parser import parse
from bookmarks.tests.helpers import ImportTestMixin, BookmarkHtmlTag
class ParserTestCase(TestCase, ImportTestMixin):
    """Tests that ``parse`` turns Netscape bookmark HTML into ``NetscapeBookmark`` items."""

    def assertTagsEqual(self, bookmarks: List[NetscapeBookmark], html_tags: List[BookmarkHtmlTag]):
        """Assert that ``bookmarks`` matches ``html_tags`` item by item, in order.

        Compares href, title, date_added/add_date, description and
        tag_string/tags of each positional pair.
        """
        self.assertEqual(len(bookmarks), len(html_tags))
        # Pair by position. Looking the partner up with bookmarks.index() is an
        # equality-based linear search: it is quadratic and selects the first
        # equal item, so two bookmarks that compare equal would both be checked
        # against the same html tag.
        for bookmark, html_tag in zip(bookmarks, html_tags):
            self.assertEqual(bookmark.href, html_tag.href)
            self.assertEqual(bookmark.title, html_tag.title)
            self.assertEqual(bookmark.date_added, html_tag.add_date)
            self.assertEqual(bookmark.description, html_tag.description)
            self.assertEqual(bookmark.tag_string, html_tag.tags)

    def test_parse_bookmarks(self):
        """Three rendered entries are parsed back with all their values."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
            BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
                            add_date='3', tags='bar-tag, other-tag'),
        ]
        html = self.render_html(html_tags)
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_no_bookmarks(self):
        """A document without entries parses to an empty list."""
        html = self.render_html()
        bookmarks = parse(html)

        self.assertEqual(bookmarks, [])

    def test_reset_properties_after_adding_bookmark(self):
        """Values of one entry do not carry over into a following empty entry."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='', title='', description='',
                            add_date='', tags=''),
        ]
        html = self.render_html(html_tags)
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_empty_title(self):
        """An anchor without text is parsed with an empty title."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='', description='Example description',
                            add_date='1', tags='example-tag'),
        ]
        html = self.render_html(tags_html='''
<DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag"></A>
<DD>Example description
''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_with_closing_description_tag(self):
        """Descriptions are parsed when ``<DD>`` has an explicit closing tag."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
<DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
<DD>Example description</DD>
<DT><A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
<DD></DD>
''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_description_tag_before_anchor_tag(self):
        """A ``<DD>`` placed before its ``<A>`` is still attached to that bookmark."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
<DT><DD>Example description</DD>
<A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
<DT><DD></DD>
<A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_with_folders(self):
        """Bookmarks nested inside folder lists are parsed in document order."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
<DL><p>
<DT><H3>Folder 1</H3>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
<DD>Example description
</DL><p>
<DT><H3>Folder 2</H3>
<DL><p>
<DT><A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
</DL><p>
</DL><p>
''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)