mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-09-21 16:39:34 +02:00
Improve import performance (#261)
* Run import in batches, cache tags
* Use bulk operations for bookmarks and assigning tags
* Improve naming
* Restore bookmark validation
* Add logging
* Bulk create tags
* Use HTMLParser for parsing bookmarks
* add parser tests
* Add more importer tests
* Add more importer tests
* Remove pyparsing dependency

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pyparsing as pp
|
||||
from html.parser import HTMLParser
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -12,60 +12,72 @@ class NetscapeBookmark:
|
||||
tag_string: str
|
||||
|
||||
|
||||
def extract_bookmark_link(tag):
|
||||
href = tag[0].href
|
||||
title = tag[0].text
|
||||
tag_string = tag[0].tags
|
||||
date_added = tag[0].add_date
|
||||
class BookmarkParser(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.bookmarks = []
|
||||
|
||||
return {
|
||||
'href': href,
|
||||
'title': title,
|
||||
'tag_string': tag_string,
|
||||
'date_added': date_added
|
||||
}
|
||||
self.current_tag = None
|
||||
self.bookmark = None
|
||||
self.href = ''
|
||||
self.add_date = ''
|
||||
self.tags = ''
|
||||
self.title = ''
|
||||
self.description = ''
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list):
|
||||
name = 'handle_start_' + tag.lower()
|
||||
if name in dir(self):
|
||||
getattr(self, name)({k.lower(): v for k, v in attrs})
|
||||
self.current_tag = tag
|
||||
|
||||
def extract_bookmark(tag):
|
||||
link = tag[0].link
|
||||
description = tag[0].description
|
||||
description = description[0] if description else ''
|
||||
def handle_endtag(self, tag: str):
|
||||
name = 'handle_end_' + tag.lower()
|
||||
if name in dir(self):
|
||||
getattr(self, name)()
|
||||
self.current_tag = None
|
||||
|
||||
return {
|
||||
'link': link,
|
||||
'description': description,
|
||||
}
|
||||
def handle_data(self, data):
|
||||
name = f'handle_{self.current_tag}_data'
|
||||
if name in dir(self):
|
||||
getattr(self, name)(data)
|
||||
|
||||
def handle_end_dl(self):
|
||||
self.add_bookmark()
|
||||
|
||||
def extract_description(tag):
|
||||
return tag[0].strip()
|
||||
def handle_start_dt(self, attrs: Dict[str, str]):
|
||||
self.add_bookmark()
|
||||
|
||||
|
||||
# define grammar
|
||||
dt_start, _ = pp.makeHTMLTags("DT")
|
||||
dd_start, _ = pp.makeHTMLTags("DD")
|
||||
a_start, a_end = pp.makeHTMLTags("A")
|
||||
bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
|
||||
bookmark_link_tag.addParseAction(extract_bookmark_link)
|
||||
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
|
||||
bookmark_description_tag.addParseAction(extract_description)
|
||||
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
|
||||
bookmark_tag.addParseAction(extract_bookmark)
|
||||
|
||||
|
||||
def parse(html: str) -> [NetscapeBookmark]:
|
||||
matches = bookmark_tag.searchString(html)
|
||||
bookmarks = []
|
||||
|
||||
for match in matches:
|
||||
bookmark_match = match[0]
|
||||
bookmark = NetscapeBookmark(
|
||||
href=bookmark_match['link']['href'],
|
||||
title=bookmark_match['link']['title'],
|
||||
description=bookmark_match['description'],
|
||||
tag_string=bookmark_match['link']['tag_string'],
|
||||
date_added=bookmark_match['link']['date_added'],
|
||||
def handle_start_a(self, attrs: Dict[str, str]):
|
||||
vars(self).update(attrs)
|
||||
self.bookmark = NetscapeBookmark(
|
||||
href=self.href,
|
||||
title='',
|
||||
description='',
|
||||
date_added=self.add_date,
|
||||
tag_string=self.tags,
|
||||
)
|
||||
bookmarks.append(bookmark)
|
||||
|
||||
return bookmarks
|
||||
def handle_a_data(self, data):
|
||||
self.title = data.strip()
|
||||
|
||||
def handle_dd_data(self, data):
|
||||
self.description = data.strip()
|
||||
|
||||
def add_bookmark(self):
|
||||
if self.bookmark:
|
||||
self.bookmark.title = self.title
|
||||
self.bookmark.description = self.description
|
||||
self.bookmarks.append(self.bookmark)
|
||||
self.bookmark = None
|
||||
self.href = ''
|
||||
self.add_date = ''
|
||||
self.tags = ''
|
||||
self.title = ''
|
||||
self.description = ''
|
||||
|
||||
|
||||
def parse(html: str) -> List[NetscapeBookmark]:
|
||||
parser = BookmarkParser()
|
||||
parser.feed(html)
|
||||
return parser.bookmarks
|
||||
|
Reference in New Issue
Block a user