mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-08-09 19:57:49 +02:00
Improve import performance (#261)
* Run import in batches, cache tags
* Use bulk operations for bookmarks and assigning tags
* Improve naming
* Restore bookmark validation
* Add logging
* Bulk create tags
* Use HTMLParser for parsing bookmarks
* Add parser tests
* Add more importer tests
* Add more importer tests
* Remove pyparsing dependency

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
@@ -1,13 +1,13 @@
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.utils import timezone
|
||||
|
||||
from bookmarks.models import Bookmark, parse_tag_string
|
||||
from bookmarks.models import Bookmark, Tag, parse_tag_string
|
||||
from bookmarks.services import tasks
|
||||
from bookmarks.services.parser import parse, NetscapeBookmark
|
||||
from bookmarks.services.tags import get_or_create_tags
|
||||
from bookmarks.utils import parse_timestamp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -20,8 +20,39 @@ class ImportResult:
|
||||
failed: int = 0
|
||||
|
||||
|
||||
class TagCache:
    """Case-insensitive, in-memory lookup of a user's tags by name.

    The cache is filled from the database once, when it is constructed;
    ``get`` and ``get_all`` only read the in-memory dictionary afterwards.
    """

    def __init__(self, user: User):
        self.user = user
        # Maps lower-cased tag name -> Tag
        self.cache = dict()
        # Init cache with all existing tags for that user
        for tag in Tag.objects.filter(owner=user):
            self.put(tag)

    def get(self, tag_name: str):
        """Return the cached tag named ``tag_name`` (ignoring case), or ``None``."""
        return self.cache.get(tag_name.lower())

    def get_all(self, tag_names: List[str]):
        """Return the cached tags for ``tag_names`` in order of first appearance.

        Names that differ only by case resolve to the same cache entry and are
        returned once. Names without a cached tag are skipped, so the result
        never contains ``None``.
        """
        result = []
        seen_keys = set()
        for tag_name in tag_names:
            key = tag_name.lower()
            # Prevent returning duplicates
            if key in seen_keys:
                continue
            seen_keys.add(key)
            tag = self.cache.get(key)
            # Unknown names are dropped rather than returned as None, which
            # callers would otherwise have to filter out themselves
            if tag is not None:
                result.append(tag)

        return result

    def put(self, tag: Tag):
        """Store ``tag`` under its lower-cased name, replacing any previous entry."""
        self.cache[tag.name.lower()] = tag
|
||||
|
||||
|
||||
def import_netscape_html(html: str, user: User):
|
||||
result = ImportResult()
|
||||
import_start = timezone.now()
|
||||
|
||||
try:
|
||||
netscape_bookmarks = parse(html)
|
||||
@@ -29,26 +60,130 @@ def import_netscape_html(html: str, user: User):
|
||||
logging.exception('Could not read bookmarks file.')
|
||||
raise
|
||||
|
||||
parse_end = timezone.now()
|
||||
logger.debug(f'Parse duration: {parse_end - import_start}')
|
||||
|
||||
# Create and cache all tags beforehand
|
||||
_create_missing_tags(netscape_bookmarks, user)
|
||||
tag_cache = TagCache(user)
|
||||
|
||||
# Split bookmarks to import into batches, to keep memory usage for bulk operations manageable
|
||||
batches = _get_batches(netscape_bookmarks, 200)
|
||||
for batch in batches:
|
||||
_import_batch(batch, user, tag_cache, result)
|
||||
|
||||
# Create snapshots for newly imported bookmarks
|
||||
tasks.schedule_bookmarks_without_snapshots(user)
|
||||
|
||||
end = timezone.now()
|
||||
logger.debug(f'Import duration: {end - import_start}')
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _create_missing_tags(netscape_bookmarks: List[NetscapeBookmark], user: User):
    """Bulk-create the tags used by ``netscape_bookmarks`` that ``user`` lacks.

    Every tag name parsed from the bookmarks' tag strings is looked up
    case-insensitively among the user's existing tags; names without a match
    are collected and inserted with a single ``bulk_create`` call.
    """
    tag_cache = TagCache(user)
    tags_to_create = []

    for netscape_bookmark in netscape_bookmarks:
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        for tag_name in tag_names:
            if tag_cache.get(tag_name):
                continue
            tag = Tag(name=tag_name, owner=user)
            tag.date_added = timezone.now()
            tags_to_create.append(tag)
            # Remember the pending tag: without this, a name shared by several
            # bookmarks (or repeated in different casing) would be queued
            # again and inserted more than once
            tag_cache.put(tag)

    Tag.objects.bulk_create(tags_to_create)
|
||||
|
||||
|
||||
def _get_batches(items: List, batch_size: int):
|
||||
batches = []
|
||||
offset = 0
|
||||
num_items = len(items)
|
||||
|
||||
while offset < num_items:
|
||||
batch = items[offset:min(offset + batch_size, num_items)]
|
||||
if len(batch) > 0:
|
||||
batches.append(batch)
|
||||
offset = offset + batch_size
|
||||
|
||||
return batches
|
||||
|
||||
|
||||
def _bookmarks_by_url(bookmarks):
    """Index ``bookmarks`` by URL, keeping the first bookmark seen for each URL."""
    by_url = {}
    for bookmark in bookmarks:
        by_url.setdefault(bookmark.url, bookmark)
    return by_url


def _import_batch(netscape_bookmarks: List[NetscapeBookmark], user: User, tag_cache: TagCache, result: ImportResult):
    """Import one batch of parsed bookmarks using bulk database operations.

    For every parsed bookmark the matching bookmark of ``user`` (looked up by
    URL) is updated, or a new one is prepared if none exists. Bookmarks that
    raise while being copied or validated are logged and counted in
    ``result.failed``; the others are counted in ``result.success`` and
    written with one bulk update and one bulk insert. Afterwards the
    bookmark -> tag associations for the batch are inserted in bulk, with
    tags resolved through ``tag_cache``.

    ``result`` is updated in place; nothing is returned.
    """
    # Query existing bookmarks
    batch_urls = [bookmark.href for bookmark in netscape_bookmarks]
    existing_by_url = _bookmarks_by_url(Bookmark.objects.filter(owner=user, url__in=batch_urls))

    # Create or update bookmarks from parsed Netscape bookmarks
    bookmarks_to_create = []
    bookmarks_to_update = []

    for netscape_bookmark in netscape_bookmarks:
        result.total = result.total + 1
        try:
            # Lookup existing bookmark by URL, or create new bookmark if there is no bookmark for that URL yet
            bookmark = existing_by_url.get(netscape_bookmark.href)
            is_update = bool(bookmark)
            if not is_update:
                bookmark = Bookmark(owner=user)
            # Copy data from parsed bookmark
            _copy_bookmark_data(netscape_bookmark, bookmark)
            # Validate bookmark fields, exclude owner to prevent n+1 database query,
            # also there is no specific validation on owner
            bookmark.clean_fields(exclude=['owner'])
            # Schedule for update or insert
            if is_update:
                bookmarks_to_update.append(bookmark)
            else:
                bookmarks_to_create.append(bookmark)

            result.success = result.success + 1
        except Exception:
            # A single bad bookmark must not abort the batch; interpreter-level
            # exits (KeyboardInterrupt, SystemExit) are deliberately not caught
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
            result.failed = result.failed + 1

    # Bulk update bookmarks in DB
    Bookmark.objects.bulk_update(bookmarks_to_update,
                                 ['url', 'date_added', 'date_modified', 'unread', 'title', 'description', 'owner'])
    # Bulk insert new bookmarks into DB
    Bookmark.objects.bulk_create(bookmarks_to_create)

    # Bulk assign tags
    # In Django 3, bulk_create does not return the auto-generated IDs when bulk inserting,
    # so we have to reload the inserted bookmarks, and match them to the parsed bookmarks by URL
    reloaded_by_url = _bookmarks_by_url(Bookmark.objects.filter(owner=user, url__in=batch_urls))

    BookmarkToTagRelationShip = Bookmark.tags.through
    relationships = []

    for netscape_bookmark in netscape_bookmarks:
        # Lookup bookmark by URL again
        bookmark = reloaded_by_url.get(netscape_bookmark.href)

        if not bookmark:
            # Something is wrong, we should have just created this bookmark
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.warning(
                f'Failed to assign tags to the bookmark: {shortened_bookmark_tag_str}. Could not find bookmark by URL.')
            # Without a bookmark there is nothing to attach tags to
            continue

        # Get tag models by string, schedule inserts for bookmark -> tag associations
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        tags = tag_cache.get_all(tag_names)
        for tag in tags:
            # Skip names the cache could not resolve to a tag
            if tag is None:
                continue
            relationships.append(BookmarkToTagRelationShip(bookmark=bookmark, tag=tag))

    # Insert all bookmark -> tag associations at once, should ignore errors if association already exists
    BookmarkToTagRelationShip.objects.bulk_create(relationships, ignore_conflicts=True)
|
||||
|
||||
|
||||
def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
||||
# Either modify existing bookmark for the URL or create new one
|
||||
bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)
|
||||
|
||||
def _copy_bookmark_data(netscape_bookmark: NetscapeBookmark, bookmark: Bookmark):
|
||||
bookmark.url = netscape_bookmark.href
|
||||
if netscape_bookmark.date_added:
|
||||
bookmark.date_added = parse_timestamp(netscape_bookmark.date_added)
|
||||
@@ -56,24 +191,7 @@ def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
||||
bookmark.date_added = timezone.now()
|
||||
bookmark.date_modified = bookmark.date_added
|
||||
bookmark.unread = False
|
||||
bookmark.title = netscape_bookmark.title
|
||||
if netscape_bookmark.title:
|
||||
bookmark.title = netscape_bookmark.title
|
||||
if netscape_bookmark.description:
|
||||
bookmark.description = netscape_bookmark.description
|
||||
bookmark.owner = user
|
||||
|
||||
bookmark.full_clean()
|
||||
bookmark.save()
|
||||
|
||||
# Set tags
|
||||
tag_names = parse_tag_string(netscape_bookmark.tag_string)
|
||||
tags = get_or_create_tags(tag_names, user)
|
||||
|
||||
bookmark.tags.set(tags)
|
||||
bookmark.save()
|
||||
|
||||
|
||||
def _get_or_create_bookmark(url: str, user: User):
|
||||
try:
|
||||
return Bookmark.objects.get(url=url, owner=user)
|
||||
except Bookmark.DoesNotExist:
|
||||
return Bookmark()
|
||||
|
Reference in New Issue
Block a user