mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-08-07 02:48:27 +02:00
Improve import performance (#261)
* Run import in batches, cache tags * Use bulk operations for bookmarks and assigning tags * Improve naming * Restore bookmark validation * Add logging * Bulk create tags * Use HTMLParser for parsing bookmarks * add parser tests * Add more importer tests * Add more importer tests * Remove pyparsing dependency Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
@@ -1,13 +1,13 @@
|
|||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from bookmarks.models import Bookmark, parse_tag_string
|
from bookmarks.models import Bookmark, Tag, parse_tag_string
|
||||||
from bookmarks.services import tasks
|
from bookmarks.services import tasks
|
||||||
from bookmarks.services.parser import parse, NetscapeBookmark
|
from bookmarks.services.parser import parse, NetscapeBookmark
|
||||||
from bookmarks.services.tags import get_or_create_tags
|
|
||||||
from bookmarks.utils import parse_timestamp
|
from bookmarks.utils import parse_timestamp
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -20,8 +20,39 @@ class ImportResult:
|
|||||||
failed: int = 0
|
failed: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class TagCache:
    """Case-insensitive, in-memory lookup of a user's tags by name.

    The cache is filled once on construction with all tags owned by the
    given user, so that later lookups during an import do not hit the
    database. Keys are the lower-cased tag names.
    """

    def __init__(self, user: User):
        """Load all existing tags of ``user`` into the cache."""
        self.user = user
        self.cache = dict()
        # Init cache with all existing tags for that user
        tags = Tag.objects.filter(owner=user)
        for tag in tags:
            self.put(tag)

    def get(self, tag_name: str):
        """Return the cached tag for ``tag_name`` (ignoring case), or ``None``."""
        return self.cache.get(tag_name.lower())

    def get_all(self, tag_names: List[str]):
        """Return the cached tags for ``tag_names``, in order of first occurrence.

        Names that do not resolve to a cached tag are skipped, and a tag that
        is referenced more than once (for example with different casing) is
        returned only once.
        """
        result = []
        for tag_name in tag_names:
            tag = self.get(tag_name)
            # Skip unknown names instead of returning None entries, which
            # callers would otherwise turn into relations without a tag.
            # Also prevent returning duplicates.
            if tag is not None and tag not in result:
                result.append(tag)

        return result

    def put(self, tag: Tag):
        """Store ``tag`` under its lower-cased name, replacing any previous entry."""
        self.cache[tag.name.lower()] = tag
|
||||||
|
|
||||||
|
|
||||||
def import_netscape_html(html: str, user: User):
|
def import_netscape_html(html: str, user: User):
|
||||||
result = ImportResult()
|
result = ImportResult()
|
||||||
|
import_start = timezone.now()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
netscape_bookmarks = parse(html)
|
netscape_bookmarks = parse(html)
|
||||||
@@ -29,26 +60,130 @@ def import_netscape_html(html: str, user: User):
|
|||||||
logging.exception('Could not read bookmarks file.')
|
logging.exception('Could not read bookmarks file.')
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
parse_end = timezone.now()
|
||||||
|
logger.debug(f'Parse duration: {parse_end - import_start}')
|
||||||
|
|
||||||
|
# Create and cache all tags beforehand
|
||||||
|
_create_missing_tags(netscape_bookmarks, user)
|
||||||
|
tag_cache = TagCache(user)
|
||||||
|
|
||||||
|
# Split bookmarks to import into batches, to keep memory usage for bulk operations manageable
|
||||||
|
batches = _get_batches(netscape_bookmarks, 200)
|
||||||
|
for batch in batches:
|
||||||
|
_import_batch(batch, user, tag_cache, result)
|
||||||
|
|
||||||
|
# Create snapshots for newly imported bookmarks
|
||||||
|
tasks.schedule_bookmarks_without_snapshots(user)
|
||||||
|
|
||||||
|
end = timezone.now()
|
||||||
|
logger.debug(f'Import duration: {end - import_start}')
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _create_missing_tags(netscape_bookmarks: List[NetscapeBookmark], user: User):
    """Bulk-create all tags referenced by the parsed bookmarks that do not exist yet.

    Tag names are looked up case-insensitively through a ``TagCache`` that is
    initialised with the user's existing tags. Every missing tag is scheduled
    exactly once, and all scheduled tags are inserted with a single
    ``bulk_create`` call.
    """
    tag_cache = TagCache(user)
    tags_to_create = []

    for netscape_bookmark in netscape_bookmarks:
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        for tag_name in tag_names:
            if tag_cache.get(tag_name):
                continue
            tag = Tag(name=tag_name, owner=user)
            tag.date_added = timezone.now()
            tags_to_create.append(tag)
            # Remember the pending tag, otherwise the same new tag name used
            # by several bookmarks would be scheduled for creation repeatedly
            tag_cache.put(tag)

    Tag.objects.bulk_create(tags_to_create)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_batches(items: List, batch_size: int):
|
||||||
|
batches = []
|
||||||
|
offset = 0
|
||||||
|
num_items = len(items)
|
||||||
|
|
||||||
|
while offset < num_items:
|
||||||
|
batch = items[offset:min(offset + batch_size, num_items)]
|
||||||
|
if len(batch) > 0:
|
||||||
|
batches.append(batch)
|
||||||
|
offset = offset + batch_size
|
||||||
|
|
||||||
|
return batches
|
||||||
|
|
||||||
|
|
||||||
|
def _import_batch(netscape_bookmarks: List[NetscapeBookmark], user: User, tag_cache: TagCache, result: ImportResult):
    """Import one batch of parsed bookmarks using bulk database operations.

    For every parsed bookmark the existing bookmark with the same URL is
    updated, or a new one is created. Bookmarks that fail field validation are
    logged and counted in ``result.failed``; they do not abort the batch.
    Afterwards the bookmark -> tag associations for the batch are inserted in
    one go, using the tags provided by ``tag_cache``.

    ``result`` is updated in place (``total``, ``success``, ``failed``).
    """
    # Query existing bookmarks
    batch_urls = [bookmark.href for bookmark in netscape_bookmarks]
    existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)

    # Create or update bookmarks from parsed Netscape bookmarks
    bookmarks_to_create = []
    bookmarks_to_update = []

    for netscape_bookmark in netscape_bookmarks:
        result.total += 1
        try:
            # Lookup existing bookmark by URL, or create new bookmark if there is no bookmark for that URL yet
            bookmark = next(
                (bookmark for bookmark in existing_bookmarks if bookmark.url == netscape_bookmark.href), None)
            if not bookmark:
                bookmark = Bookmark(owner=user)
                is_update = False
            else:
                is_update = True
            # Copy data from parsed bookmark
            _copy_bookmark_data(netscape_bookmark, bookmark)
            # Validate bookmark fields, exclude owner to prevent n+1 database query,
            # also there is no specific validation on owner
            bookmark.clean_fields(exclude=['owner'])
            # Schedule for update or insert
            if is_update:
                bookmarks_to_update.append(bookmark)
            else:
                bookmarks_to_create.append(bookmark)

            result.success += 1
        except Exception:
            # Only handle regular errors, so that KeyboardInterrupt / SystemExit still propagate
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
            result.failed += 1

    # Bulk update bookmarks in DB
    Bookmark.objects.bulk_update(bookmarks_to_update,
                                 ['url', 'date_added', 'date_modified', 'unread', 'title', 'description', 'owner'])
    # Bulk insert new bookmarks into DB
    Bookmark.objects.bulk_create(bookmarks_to_create)

    # Bulk assign tags
    # In Django 3, bulk_create does not return the auto-generated IDs when bulk inserting,
    # so we have to reload the inserted bookmarks, and match them to the parsed bookmarks by URL
    existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)

    BookmarkToTagRelationShip = Bookmark.tags.through
    relationships = []

    for netscape_bookmark in netscape_bookmarks:
        # Lookup bookmark by URL again
        bookmark = next(
            (bookmark for bookmark in existing_bookmarks if bookmark.url == netscape_bookmark.href), None)

        if not bookmark:
            # No stored bookmark for this URL, e.g. because it failed validation above and was never saved
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.warning(
                f'Failed to assign tags to the bookmark: {shortened_bookmark_tag_str}. Could not find bookmark by URL.')
            # Skip it, otherwise associations with bookmark=None would be scheduled for insertion
            continue

        # Get tag models by string, schedule inserts for bookmark -> tag associations
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        tags = tag_cache.get_all(tag_names)
        for tag in tags:
            relationships.append(BookmarkToTagRelationShip(bookmark=bookmark, tag=tag))

    # Insert all bookmark -> tag associations at once, should ignore errors if association already exists
    BookmarkToTagRelationShip.objects.bulk_create(relationships, ignore_conflicts=True)
|
||||||
|
|
||||||
|
|
||||||
def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
def _copy_bookmark_data(netscape_bookmark: NetscapeBookmark, bookmark: Bookmark):
|
||||||
# Either modify existing bookmark for the URL or create new one
|
|
||||||
bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)
|
|
||||||
|
|
||||||
bookmark.url = netscape_bookmark.href
|
bookmark.url = netscape_bookmark.href
|
||||||
if netscape_bookmark.date_added:
|
if netscape_bookmark.date_added:
|
||||||
bookmark.date_added = parse_timestamp(netscape_bookmark.date_added)
|
bookmark.date_added = parse_timestamp(netscape_bookmark.date_added)
|
||||||
@@ -56,24 +191,7 @@ def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
|||||||
bookmark.date_added = timezone.now()
|
bookmark.date_added = timezone.now()
|
||||||
bookmark.date_modified = bookmark.date_added
|
bookmark.date_modified = bookmark.date_added
|
||||||
bookmark.unread = False
|
bookmark.unread = False
|
||||||
bookmark.title = netscape_bookmark.title
|
if netscape_bookmark.title:
|
||||||
|
bookmark.title = netscape_bookmark.title
|
||||||
if netscape_bookmark.description:
|
if netscape_bookmark.description:
|
||||||
bookmark.description = netscape_bookmark.description
|
bookmark.description = netscape_bookmark.description
|
||||||
bookmark.owner = user
|
|
||||||
|
|
||||||
bookmark.full_clean()
|
|
||||||
bookmark.save()
|
|
||||||
|
|
||||||
# Set tags
|
|
||||||
tag_names = parse_tag_string(netscape_bookmark.tag_string)
|
|
||||||
tags = get_or_create_tags(tag_names, user)
|
|
||||||
|
|
||||||
bookmark.tags.set(tags)
|
|
||||||
bookmark.save()
|
|
||||||
|
|
||||||
|
|
||||||
def _get_or_create_bookmark(url: str, user: User):
|
|
||||||
try:
|
|
||||||
return Bookmark.objects.get(url=url, owner=user)
|
|
||||||
except Bookmark.DoesNotExist:
|
|
||||||
return Bookmark()
|
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from html.parser import HTMLParser
|
||||||
import pyparsing as pp
|
from typing import Dict, List
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -12,60 +12,72 @@ class NetscapeBookmark:
|
|||||||
tag_string: str
|
tag_string: str
|
||||||
|
|
||||||
|
|
||||||
def extract_bookmark_link(tag):
|
class BookmarkParser(HTMLParser):
    """Event-driven parser that collects bookmarks from Netscape bookmark HTML.

    Tag and text events are dispatched by name to optional
    ``handle_start_<tag>``, ``handle_end_<tag>`` and ``handle_<tag>_data``
    methods. An ``<A>`` tag starts a bookmark; the bookmark is completed with
    the collected title and description and appended to ``self.bookmarks`` when
    the next ``<DT>`` starts or a ``</DL>`` is reached.
    """

    def __init__(self):
        super().__init__()
        # Completed bookmarks, in document order
        self.bookmarks = []

        # Name of the most recently opened tag, None right after an end tag
        self.current_tag = None
        # Bookmark that is currently being collected, if any
        self.bookmark = None
        self.href = ''
        self.add_date = ''
        self.tags = ''
        self.title = ''
        self.description = ''

    def handle_starttag(self, tag: str, attrs: list):
        """Dispatch to ``handle_start_<tag>`` if defined, passing attributes as a dict."""
        handler = getattr(self, 'handle_start_' + tag.lower(), None)
        if handler:
            handler({k.lower(): v for k, v in attrs})
        self.current_tag = tag

    def handle_endtag(self, tag: str):
        """Dispatch to ``handle_end_<tag>`` if defined."""
        handler = getattr(self, 'handle_end_' + tag.lower(), None)
        if handler:
            handler()
        self.current_tag = None

    def handle_data(self, data):
        """Dispatch text to ``handle_<current tag>_data`` if defined."""
        handler = getattr(self, f'handle_{self.current_tag}_data', None)
        if handler:
            handler(data)

    def handle_end_dl(self):
        # End of a list, store the bookmark that is still pending
        self.add_bookmark()

    def handle_start_dt(self, attrs: Dict[str, str]):
        # A new entry starts, store the bookmark of the previous entry
        self.add_bookmark()

    def handle_start_a(self, attrs: Dict[str, str]):
        """Start a new bookmark from the attributes of an ``<A>`` tag."""
        # Read only the attributes we know about. Copying all attributes onto
        # the instance would allow markup to overwrite parser state such as
        # ``bookmarks`` or ``title``. Attributes without a value are reported
        # as None by HTMLParser and are normalized to an empty string.
        self.href = attrs.get('href') or ''
        self.add_date = attrs.get('add_date') or ''
        self.tags = attrs.get('tags') or ''
        self.bookmark = NetscapeBookmark(
            href=self.href,
            title='',
            description='',
            date_added=self.add_date,
            tag_string=self.tags,
        )

    def handle_a_data(self, data):
        self.title = data.strip()

    def handle_dd_data(self, data):
        self.description = data.strip()

    def add_bookmark(self):
        """Store the pending bookmark, if any, and reset all collected values."""
        if self.bookmark:
            self.bookmark.title = self.title
            self.bookmark.description = self.description
            self.bookmarks.append(self.bookmark)
        self.bookmark = None
        self.href = ''
        self.add_date = ''
        self.tags = ''
        self.title = ''
        self.description = ''
|
||||||
|
|
||||||
|
|
||||||
|
def parse(html: str) -> List[NetscapeBookmark]:
    """Parse Netscape bookmark HTML and return the bookmarks found, in document order."""
    parser = BookmarkParser()
    parser.feed(html)
    # Force processing of any text that is still buffered at the end of the input
    parser.close()
    # A bookmark is only stored once the next <DT> or a closing </DL> is seen.
    # Flush a still pending one, so that input without a final </DL> does not
    # lose its last bookmark. This is a no-op if nothing is pending.
    parser.add_bookmark()
    return parser.bookmarks
|
||||||
|
@@ -1,5 +1,7 @@
|
|||||||
import random
|
import random
|
||||||
import logging
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
@@ -87,6 +89,42 @@ class LinkdingApiTestCase(APITestCase):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class BookmarkHtmlTag:
    """Plain value holder describing one bookmark entry to render in test HTML.

    All values are kept as strings and default to an empty string.
    """

    def __init__(self, href: str = '', title: str = '', description: str = '', add_date: str = '', tags: str = ''):
        # Link target and visible text
        self.href, self.title = href, title
        # Text of the optional description element
        self.description = description
        # Remaining values, stored unchanged as passed in
        self.add_date, self.tags = add_date, tags
|
||||||
|
|
||||||
|
|
||||||
|
class ImportTestMixin:
|
||||||
|
def render_tag(self, tag: BookmarkHtmlTag):
|
||||||
|
return f'''
|
||||||
|
<DT>
|
||||||
|
<A {f'HREF="{tag.href}"' if tag.href else ''}
|
||||||
|
{f'ADD_DATE="{tag.add_date}"' if tag.add_date else ''}
|
||||||
|
{f'TAGS="{tag.tags}"' if tag.tags else ''}>
|
||||||
|
{tag.title if tag.title else ''}
|
||||||
|
</A>
|
||||||
|
{f'<DD>{tag.description}' if tag.description else ''}
|
||||||
|
'''
|
||||||
|
|
||||||
|
def render_html(self, tags: List[BookmarkHtmlTag] = None, tags_html: str = ''):
|
||||||
|
if tags:
|
||||||
|
rendered_tags = [self.render_tag(tag) for tag in tags]
|
||||||
|
tags_html = '\n'.join(rendered_tags)
|
||||||
|
return f'''
|
||||||
|
<!DOCTYPE NETSCAPE-Bookmark-file-1>
|
||||||
|
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
|
||||||
|
<TITLE>Bookmarks</TITLE>
|
||||||
|
<H1>Bookmarks</H1>
|
||||||
|
<DL><p>
|
||||||
|
{tags_html}
|
||||||
|
</DL><p>
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
_words = [
|
_words = [
|
||||||
'quasi',
|
'quasi',
|
||||||
'consequatur',
|
'consequatur',
|
||||||
|
@@ -1,29 +1,204 @@
|
|||||||
|
from typing import List
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase, override_settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
from bookmarks.models import Tag
|
from bookmarks.models import Bookmark, Tag, parse_tag_string
|
||||||
from bookmarks.services import tasks
|
from bookmarks.services import tasks
|
||||||
from bookmarks.services.importer import import_netscape_html
|
from bookmarks.services.importer import import_netscape_html
|
||||||
from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging
|
from bookmarks.tests.helpers import BookmarkFactoryMixin, ImportTestMixin, BookmarkHtmlTag, disable_logging
|
||||||
|
from bookmarks.utils import parse_timestamp
|
||||||
|
|
||||||
|
|
||||||
class ImporterTestCase(TestCase, BookmarkFactoryMixin):
|
class ImporterTestCase(TestCase, BookmarkFactoryMixin, ImportTestMixin):
|
||||||
|
|
||||||
def create_import_html(self, bookmark_tags_string: str):
|
def assertBookmarksImported(self, html_tags: List[BookmarkHtmlTag]):
|
||||||
return f'''
|
for html_tag in html_tags:
|
||||||
<!DOCTYPE NETSCAPE-Bookmark-file-1>
|
bookmark = Bookmark.objects.get(url=html_tag.href)
|
||||||
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
|
self.assertIsNotNone(bookmark)
|
||||||
<TITLE>Bookmarks</TITLE>
|
|
||||||
<H1>Bookmarks</H1>
|
self.assertEqual(bookmark.title, html_tag.title)
|
||||||
<DL><p>
|
self.assertEqual(bookmark.description, html_tag.description)
|
||||||
{bookmark_tags_string}
|
self.assertEqual(bookmark.date_added, parse_timestamp(html_tag.add_date))
|
||||||
</DL><p>
|
|
||||||
'''
|
tag_names = parse_tag_string(html_tag.tags)
|
||||||
|
|
||||||
|
# Check assigned tags
|
||||||
|
for tag_name in tag_names:
|
||||||
|
tag = next(
|
||||||
|
(tag for tag in bookmark.tags.all() if tag.name == tag_name), None)
|
||||||
|
self.assertIsNotNone(tag)
|
||||||
|
|
||||||
|
def test_import(self):
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
|
||||||
|
add_date='1', tags='example-tag'),
|
||||||
|
BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
|
||||||
|
add_date='2', tags=''),
|
||||||
|
BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
|
||||||
|
add_date='3', tags='bar-tag, other-tag'),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
result = import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
# Check result
|
||||||
|
self.assertEqual(result.total, 3)
|
||||||
|
self.assertEqual(result.success, 3)
|
||||||
|
self.assertEqual(result.failed, 0)
|
||||||
|
|
||||||
|
# Check bookmarks
|
||||||
|
bookmarks = Bookmark.objects.all()
|
||||||
|
self.assertEqual(len(bookmarks), 3)
|
||||||
|
self.assertBookmarksImported(html_tags)
|
||||||
|
|
||||||
|
def test_synchronize(self):
|
||||||
|
# Initial import
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
|
||||||
|
add_date='1', tags='example-tag'),
|
||||||
|
BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
|
||||||
|
add_date='2', tags=''),
|
||||||
|
BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
|
||||||
|
add_date='3', tags='bar-tag, other-tag'),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
# Change data, add some new data
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', title='Updated Example title',
|
||||||
|
description='Updated Example description', add_date='111', tags='updated-example-tag'),
|
||||||
|
BookmarkHtmlTag(href='https://foo.com', title='Updated Foo title', description='Updated Foo description',
|
||||||
|
add_date='222', tags='new-tag'),
|
||||||
|
BookmarkHtmlTag(href='https://bar.com', title='Updated Bar title', description='Updated Bar description',
|
||||||
|
add_date='333', tags='updated-bar-tag, updated-other-tag'),
|
||||||
|
BookmarkHtmlTag(href='https://baz.com', add_date='444', tags='baz-tag')
|
||||||
|
]
|
||||||
|
|
||||||
|
# Import updated data
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
result = import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
# Check result
|
||||||
|
self.assertEqual(result.total, 4)
|
||||||
|
self.assertEqual(result.success, 4)
|
||||||
|
self.assertEqual(result.failed, 0)
|
||||||
|
|
||||||
|
# Check bookmarks
|
||||||
|
bookmarks = Bookmark.objects.all()
|
||||||
|
self.assertEqual(len(bookmarks), 4)
|
||||||
|
self.assertBookmarksImported(html_tags)
|
||||||
|
|
||||||
|
def test_import_with_some_invalid_bookmarks(self):
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com'),
|
||||||
|
# Invalid URL
|
||||||
|
BookmarkHtmlTag(href='foo.com'),
|
||||||
|
# No URL
|
||||||
|
BookmarkHtmlTag(),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
result = import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
# Check result
|
||||||
|
self.assertEqual(result.total, 3)
|
||||||
|
self.assertEqual(result.success, 1)
|
||||||
|
self.assertEqual(result.failed, 2)
|
||||||
|
|
||||||
|
# Check bookmarks
|
||||||
|
bookmarks = Bookmark.objects.all()
|
||||||
|
self.assertEqual(len(bookmarks), 1)
|
||||||
|
self.assertBookmarksImported(html_tags[1:1])
|
||||||
|
|
||||||
|
def test_import_tags(self):
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', tags='tag1'),
|
||||||
|
BookmarkHtmlTag(href='https://foo.com', tags='tag2'),
|
||||||
|
BookmarkHtmlTag(href='https://bar.com', tags='tag3'),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Tag.objects.count(), 3)
|
||||||
|
|
||||||
|
def test_create_missing_tags(self):
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', tags='tag1'),
|
||||||
|
BookmarkHtmlTag(href='https://foo.com', tags='tag2'),
|
||||||
|
BookmarkHtmlTag(href='https://bar.com', tags='tag3'),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
html_tags.append(
|
||||||
|
BookmarkHtmlTag(href='https://baz.com', tags='tag4')
|
||||||
|
)
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Tag.objects.count(), 4)
|
||||||
|
|
||||||
|
def test_should_append_tags_to_bookmark_when_reimporting_with_different_tags(self):
|
||||||
|
html_tags = [
|
||||||
|
BookmarkHtmlTag(href='https://example.com', tags='tag1'),
|
||||||
|
]
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
html_tags.append(
|
||||||
|
BookmarkHtmlTag(href='https://example.com', tags='tag2, tag3')
|
||||||
|
)
|
||||||
|
import_html = self.render_html(tags=html_tags)
|
||||||
|
import_netscape_html(import_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Bookmark.objects.count(), 1)
|
||||||
|
self.assertEqual(Bookmark.objects.all()[0].tags.all().count(), 3)
|
||||||
|
|
||||||
|
@override_settings(USE_TZ=False)
|
||||||
|
def test_use_current_date_when_no_add_date(self):
|
||||||
|
test_html = self.render_html(tags_html=f'''
|
||||||
|
<DT><A HREF="https://example.com">Example.com</A>
|
||||||
|
<DD>Example.com
|
||||||
|
''')
|
||||||
|
|
||||||
|
with patch.object(timezone, 'now', return_value=timezone.datetime(2021, 1, 1)):
|
||||||
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Bookmark.objects.count(), 1)
|
||||||
|
self.assertEqual(Bookmark.objects.all()[0].date_added, timezone.datetime(2021, 1, 1))
|
||||||
|
|
||||||
|
def test_keep_title_if_imported_bookmark_has_empty_title(self):
|
||||||
|
test_html = self.render_html(tags=[
|
||||||
|
BookmarkHtmlTag(href='https://example.com', title='Example.com')
|
||||||
|
])
|
||||||
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
test_html = self.render_html(tags=[
|
||||||
|
BookmarkHtmlTag(href='https://example.com')
|
||||||
|
])
|
||||||
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Bookmark.objects.count(), 1)
|
||||||
|
self.assertEqual(Bookmark.objects.all()[0].title, 'Example.com')
|
||||||
|
|
||||||
|
def test_keep_description_if_imported_bookmark_has_empty_description(self):
|
||||||
|
test_html = self.render_html(tags=[
|
||||||
|
BookmarkHtmlTag(href='https://example.com', description='Example.com')
|
||||||
|
])
|
||||||
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
test_html = self.render_html(tags=[
|
||||||
|
BookmarkHtmlTag(href='https://example.com')
|
||||||
|
])
|
||||||
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Bookmark.objects.count(), 1)
|
||||||
|
self.assertEqual(Bookmark.objects.all()[0].description, 'Example.com')
|
||||||
|
|
||||||
def test_replace_whitespace_in_tag_names(self):
|
def test_replace_whitespace_in_tag_names(self):
|
||||||
test_html = self.create_import_html(f'''
|
test_html = self.render_html(tags_html=f'''
|
||||||
<DT><A HREF="https://example.com" ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag 1, tag 2, tag 3">Example.com</A>
|
<DT><A HREF="https://example.com" TAGS="tag 1, tag 2, tag 3">Example.com</A>
|
||||||
<DD>Example.com
|
<DD>Example.com
|
||||||
''')
|
''')
|
||||||
import_netscape_html(test_html, self.get_or_create_test_user())
|
import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
@@ -35,22 +210,22 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin):
|
|||||||
|
|
||||||
@disable_logging
|
@disable_logging
|
||||||
def test_validate_empty_or_missing_bookmark_url(self):
|
def test_validate_empty_or_missing_bookmark_url(self):
|
||||||
test_html = self.create_import_html(f'''
|
test_html = self.render_html(tags_html=f'''
|
||||||
<!-- Empty URL -->
|
<DT><A HREF="">Empty URL</A>
|
||||||
<DT><A HREF="" ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag3">Empty URL</A>
|
|
||||||
<DD>Empty URL
|
<DD>Empty URL
|
||||||
<!-- Missing URL -->
|
<DT><A>Missing URL</A>
|
||||||
<DT><A ADD_DATE="1616337559" PRIVATE="0" TOREAD="0" TAGS="tag3">Missing URL</A>
|
|
||||||
<DD>Missing URL
|
<DD>Missing URL
|
||||||
''')
|
''')
|
||||||
|
|
||||||
import_result = import_netscape_html(test_html, self.get_or_create_test_user())
|
import_result = import_netscape_html(test_html, self.get_or_create_test_user())
|
||||||
|
|
||||||
|
self.assertEqual(Bookmark.objects.count(), 0)
|
||||||
self.assertEqual(import_result.success, 0)
|
self.assertEqual(import_result.success, 0)
|
||||||
|
self.assertEqual(import_result.failed, 2)
|
||||||
|
|
||||||
def test_schedule_snapshot_creation(self):
|
def test_schedule_snapshot_creation(self):
|
||||||
user = self.get_or_create_test_user()
|
user = self.get_or_create_test_user()
|
||||||
test_html = self.create_import_html('')
|
test_html = self.render_html(tags_html='')
|
||||||
|
|
||||||
with patch.object(tasks, 'schedule_bookmarks_without_snapshots') as mock_schedule_bookmarks_without_snapshots:
|
with patch.object(tasks, 'schedule_bookmarks_without_snapshots') as mock_schedule_bookmarks_without_snapshots:
|
||||||
import_netscape_html(test_html, user)
|
import_netscape_html(test_html, user)
|
||||||
|
122
bookmarks/tests/test_parser.py
Normal file
122
bookmarks/tests/test_parser.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from django.test import TestCase
|
||||||
|
|
||||||
|
from bookmarks.services.parser import NetscapeBookmark
|
||||||
|
from bookmarks.services.parser import parse
|
||||||
|
from bookmarks.tests.helpers import ImportTestMixin, BookmarkHtmlTag
|
||||||
|
|
||||||
|
|
||||||
|
class ParserTestCase(TestCase, ImportTestMixin):
    """Tests that ``parse`` turns Netscape bookmark HTML into the expected bookmarks.

    Each test builds HTML through ``self.render_html`` (either from a list of
    ``BookmarkHtmlTag`` objects or from a raw ``tags_html`` snippet), runs it
    through ``parse`` and compares the result field by field with the
    expected ``BookmarkHtmlTag`` values.
    """

    def assertTagsEqual(self, bookmarks: List[NetscapeBookmark], html_tags: List[BookmarkHtmlTag]):
        """Assert that every parsed bookmark matches the HTML tag at the same position.

        :param bookmarks: bookmarks returned by ``parse``.
        :param html_tags: expected values, in the same order as ``bookmarks``.
        """
        self.assertEqual(len(bookmarks), len(html_tags))
        # Pair by position instead of looking the index up with
        # ``bookmarks.index(bookmark)``: ``list.index`` returns the first element
        # that compares equal, so two equal bookmarks would both be checked
        # against the same tag, and the lookup is a linear scan per item.
        for bookmark, html_tag in zip(bookmarks, html_tags):
            self.assertEqual(bookmark.href, html_tag.href)
            self.assertEqual(bookmark.title, html_tag.title)
            self.assertEqual(bookmark.date_added, html_tag.add_date)
            self.assertEqual(bookmark.description, html_tag.description)
            self.assertEqual(bookmark.tag_string, html_tag.tags)

    def test_parse_bookmarks(self):
        """Several bookmarks with and without description/tags are parsed in order."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
            BookmarkHtmlTag(href='https://bar.com', title='Bar title', description='Bar description',
                            add_date='3', tags='bar-tag, other-tag'),
        ]
        html = self.render_html(html_tags)
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_no_bookmarks(self):
        """HTML rendered without any tags parses to an empty list."""
        html = self.render_html()
        bookmarks = parse(html)

        self.assertEqual(bookmarks, [])

    def test_reset_properties_after_adding_bookmark(self):
        """A bookmark with empty values following a populated one parses with empty values."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='', title='', description='',
                            add_date='', tags='')
        ]
        html = self.render_html(html_tags)
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_empty_title(self):
        """An anchor without text content yields a bookmark with an empty title."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='', description='Example description',
                            add_date='1', tags='example-tag'),
        ]
        html = self.render_html(tags_html='''
        <DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag"></A>
        <DD>Example description
        ''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_with_closing_description_tag(self):
        """Descriptions wrapped in an explicit ``</DD>`` closing tag are parsed."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
        <DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
        <DD>Example description</DD>
        <DT><A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
        <DD></DD>
        ''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_description_tag_before_anchor_tag(self):
        """A ``<DD>`` placed before its ``<A>`` is still assigned to that bookmark."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
        <DT><DD>Example description</DD>
        <A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
        <DT><DD></DD>
        <A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
        ''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)

    def test_with_folders(self):
        """Bookmarks nested inside ``<H3>`` folder lists are parsed as a flat list."""
        html_tags = [
            BookmarkHtmlTag(href='https://example.com', title='Example title', description='Example description',
                            add_date='1', tags='example-tag'),
            BookmarkHtmlTag(href='https://foo.com', title='Foo title', description='',
                            add_date='2', tags=''),
        ]
        html = self.render_html(tags_html='''
        <DL><p>
        <DT><H3>Folder 1</H3>
        <DL><p>
        <DT><A HREF="https://example.com" ADD_DATE="1" TAGS="example-tag">Example title</A>
        <DD>Example description
        </DL><p>
        <DT><H3>Folder 2</H3>
        <DL><p>
        <DT><A HREF="https://foo.com" ADD_DATE="2">Foo title</A>
        </DL><p>
        </DL><p>
        ''')
        bookmarks = parse(html)

        self.assertTagsEqual(bookmarks, html_tags)
|
@@ -13,7 +13,6 @@ django-sass-processor==1.0.1
|
|||||||
django-widget-tweaks==1.4.8
|
django-widget-tweaks==1.4.8
|
||||||
djangorestframework==3.12.4
|
djangorestframework==3.12.4
|
||||||
idna==2.8
|
idna==2.8
|
||||||
pyparsing==2.4.7
|
|
||||||
python-dateutil==2.8.1
|
python-dateutil==2.8.1
|
||||||
pytz==2021.1
|
pytz==2021.1
|
||||||
requests==2.26.0
|
requests==2.26.0
|
||||||
|
@@ -18,7 +18,6 @@ django-widget-tweaks==1.4.8
|
|||||||
djangorestframework==3.12.4
|
djangorestframework==3.12.4
|
||||||
idna==2.8
|
idna==2.8
|
||||||
libsass==0.21.0
|
libsass==0.21.0
|
||||||
pyparsing==2.4.7
|
|
||||||
python-dateutil==2.8.1
|
python-dateutil==2.8.1
|
||||||
pytz==2021.1
|
pytz==2021.1
|
||||||
rcssmin==1.0.6
|
rcssmin==1.0.6
|
||||||
|
@@ -48,6 +48,11 @@ LOGGING = {
|
|||||||
'level': 'DEBUG',
|
'level': 'DEBUG',
|
||||||
'handlers': ['console'],
|
'handlers': ['console'],
|
||||||
'propagate': False,
|
'propagate': False,
|
||||||
|
},
|
||||||
|
'bookmarks.services.importer': { # Log importer debug output
|
||||||
|
'level': 'DEBUG',
|
||||||
|
'handlers': ['console'],
|
||||||
|
'propagate': False,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user