Improve import performance (#261)

* Run import in batches, cache tags

* Use bulk operations for bookmarks and assigning tags

* Improve naming

* Restore bookmark validation

* Add logging

* Bulk create tags

* Use HTMLParser for parsing bookmarks

* add parser tests

* Add more importer tests

* Add more importer tests

* Remove pyparsing dependency

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
Sascha Ißbrücker
2022-05-21 09:27:30 +02:00
committed by GitHub
parent 117160ea87
commit f4e3d724f0
8 changed files with 571 additions and 103 deletions

View File

@@ -1,6 +1,6 @@
from dataclasses import dataclass
import pyparsing as pp
from html.parser import HTMLParser
from typing import Dict, List
@dataclass
@@ -12,60 +12,72 @@ class NetscapeBookmark:
tag_string: str
def extract_bookmark_link(tag):
href = tag[0].href
title = tag[0].text
tag_string = tag[0].tags
date_added = tag[0].add_date
class BookmarkParser(HTMLParser):
def __init__(self):
super().__init__()
self.bookmarks = []
return {
'href': href,
'title': title,
'tag_string': tag_string,
'date_added': date_added
}
self.current_tag = None
self.bookmark = None
self.href = ''
self.add_date = ''
self.tags = ''
self.title = ''
self.description = ''
def handle_starttag(self, tag: str, attrs: list):
name = 'handle_start_' + tag.lower()
if name in dir(self):
getattr(self, name)({k.lower(): v for k, v in attrs})
self.current_tag = tag
def extract_bookmark(tag):
link = tag[0].link
description = tag[0].description
description = description[0] if description else ''
def handle_endtag(self, tag: str):
name = 'handle_end_' + tag.lower()
if name in dir(self):
getattr(self, name)()
self.current_tag = None
return {
'link': link,
'description': description,
}
def handle_data(self, data):
name = f'handle_{self.current_tag}_data'
if name in dir(self):
getattr(self, name)(data)
def handle_end_dl(self):
self.add_bookmark()
def extract_description(tag):
return tag[0].strip()
def handle_start_dt(self, attrs: Dict[str, str]):
self.add_bookmark()
# define grammar
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")
bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)
def parse(html: str) -> [NetscapeBookmark]:
matches = bookmark_tag.searchString(html)
bookmarks = []
for match in matches:
bookmark_match = match[0]
bookmark = NetscapeBookmark(
href=bookmark_match['link']['href'],
title=bookmark_match['link']['title'],
description=bookmark_match['description'],
tag_string=bookmark_match['link']['tag_string'],
date_added=bookmark_match['link']['date_added'],
def handle_start_a(self, attrs: Dict[str, str]):
vars(self).update(attrs)
self.bookmark = NetscapeBookmark(
href=self.href,
title='',
description='',
date_added=self.add_date,
tag_string=self.tags,
)
bookmarks.append(bookmark)
return bookmarks
def handle_a_data(self, data):
self.title = data.strip()
def handle_dd_data(self, data):
self.description = data.strip()
def add_bookmark(self):
if self.bookmark:
self.bookmark.title = self.title
self.bookmark.description = self.description
self.bookmarks.append(self.bookmark)
self.bookmark = None
self.href = ''
self.add_date = ''
self.tags = ''
self.title = ''
self.description = ''
def parse(html: str) -> List[NetscapeBookmark]:
parser = BookmarkParser()
parser.feed(html)
return parser.bookmarks