Implement custom Netscape file parser (#51)

* Implement custom Netscape file parser (#50)
* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
2025-08-13 13:39:27 +02:00 · 2020-12-31 07:02:28 +01:00
parent 50a825b3ca
commit 779de41b65
7 changed files with 111 additions and 51 deletions
--- a/bookmarks/services/importer.py
+++ b/bookmarks/services/importer.py
@@ -2,11 +2,10 @@ import logging
 from dataclasses import dataclass
 from datetime import datetime

-import bs4
-from bs4 import BeautifulSoup
 from django.contrib.auth.models import User

 from bookmarks.models import Bookmark, parse_tag_string
+from bookmarks.services.parser import parse, NetscapeBookmark
 from bookmarks.services.tags import get_or_create_tags

 logger = logging.getLogger(__name__)
@@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
    result = ImportResult()

    try:
-        soup = BeautifulSoup(html, 'html.parser')
+        netscape_bookmarks = parse(html)
    except:
        logging.exception('Could not read bookmarks file.')
        raise

-    bookmark_tags = soup.find_all('dt')
-
-    for bookmark_tag in bookmark_tags:
+    for netscape_bookmark in netscape_bookmarks:
        result.total = result.total + 1
        try:
-            _import_bookmark_tag(bookmark_tag, user)
+            _import_bookmark_tag(netscape_bookmark, user)
            result.success = result.success + 1
        except:
-            shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
+            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
            result.failed = result.failed + 1

    return result


-def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
-    link_tag = bookmark_tag.a
-
-    if link_tag is None:
-        return
-
+def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
    # Either modify existing bookmark for the URL or create new one
-    url = link_tag['href']
-    description = _extract_description(bookmark_tag)
-    bookmark = _get_or_create_bookmark(url, user)
+    bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)

-    bookmark.url = url
-    add_date = link_tag.get('add_date', datetime.now().timestamp())
-    bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()
+    bookmark.url = netscape_bookmark.href
+    bookmark.date_added = datetime.utcfromtimestamp(int(netscape_bookmark.date_added)).astimezone()
    bookmark.date_modified = bookmark.date_added
-    bookmark.unread = link_tag.get('toread', '0') == '1'
-    bookmark.title = link_tag.string
-    if description:
-        bookmark.description = description
+    bookmark.unread = False
+    bookmark.title = netscape_bookmark.title
+    if netscape_bookmark.description:
+        bookmark.description = netscape_bookmark.description
    bookmark.owner = user

    bookmark.save()

    # Set tags
-    tag_string = link_tag.get('tags', '')
-    tag_names = parse_tag_string(tag_string)
+    tag_names = parse_tag_string(netscape_bookmark.tag_string)
    tags = get_or_create_tags(tag_names, user)

    bookmark.tags.set(tags)
@@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
        return Bookmark.objects.get(url=url, owner=user)
    except Bookmark.DoesNotExist:
        return Bookmark()
-
-
-def _extract_description(bookmark_tag: bs4.Tag):
-    """
-    Since the Netscape HTML format has no closing tags, all following bookmark tags are part of the description tag
-    so to extract the description text we have to get creative. For now we combine the text of all text nodes until we
-    detect a <dt> tag which indicates a new bookmark
-    :param bookmark_tag:
-    :return:
-    """
-    description_tag = bookmark_tag.find('dd', recursive=False)
-
-    if description_tag is None:
-        return None
-
-    description = ''
-
-    for content in description_tag.contents:
-        if type(content) is bs4.element.Tag and content.name == 'dt':
-            break
-        if type(content) is bs4.element.NavigableString:
-            description += content
-
-    return description.strip()
--- a/bookmarks/services/parser.py
+++ b/bookmarks/services/parser.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass
+from datetime import datetime
+
+import pyparsing as pp
+
+
+@dataclass
+class NetscapeBookmark:
+    """A single bookmark entry read from a Netscape bookmark HTML file."""
+    # Value of the link's href attribute
+    href: str
+    # Text content of the link tag
+    title: str
+    # Stripped text following the <DD> tag, or an empty string if there is none
+    description: str
+    # Integer timestamp taken from the link's add_date attribute; the parser
+    # falls back to the current time when the attribute is empty
+    date_added: int
+    # Raw value of the link's tags attribute, not yet split into tag names
+    tag_string: str
+
+
+def extract_bookmark_link(tag):
+    """
+    Parse action that turns a matched link group into a plain dictionary.
+    :param tag: parse results whose first item exposes ``href``, ``text``, ``tags`` and ``add_date``
+    :return: dictionary with the keys ``href``, ``title``, ``tag_string`` and ``date_added``
+    """
+    anchor = tag[0]
+    link = dict(href=anchor.href, title=anchor.text, tag_string=anchor.tags)
+    # Use the current time when the link carries no (or an empty) add_date
+    link['date_added'] = int(anchor.add_date or datetime.now().timestamp())
+
+    return link
+
+
+def extract_bookmark(tag):
+    """
+    Parse action that combines a matched link with its optional description.
+    :param tag: parse results whose first item exposes ``link`` and ``description``
+    :return: dictionary with the keys ``link`` and ``description``
+    """
+    group = tag[0]
+    bookmark = {'link': group.link}
+    descriptions = group.description
+
+    # Only the first description is used; without one fall back to an empty string
+    if descriptions:
+        bookmark['description'] = descriptions[0]
+    else:
+        bookmark['description'] = ''
+
+    return bookmark
+
+
+def extract_description(tag):
+    """
+    Parse action that removes surrounding whitespace from a matched description.
+    :param tag: parse results whose first item is the raw description text
+    :return: the description text without leading and trailing whitespace
+    """
+    raw_description = tag[0]
+    return raw_description.strip()
+
+
+# define grammar
+# Only the opening <DT> and <DD> tags are used; the closing tag expressions
+# returned by makeHTMLTags are discarded
+dt_start, _ = pp.makeHTMLTags("DT")
+dd_start, _ = pp.makeHTMLTags("DD")
+a_start, a_end = pp.makeHTMLTags("A")
+# A link is an opening <A> tag, its body (named "text") and the closing tag,
+# which is dropped from the results
+bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
+bookmark_link_tag.addParseAction(extract_bookmark_link)
+# A description is everything after <DD> up to the next opening or closing tag
+bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
+bookmark_description_tag.addParseAction(extract_description)
+# A bookmark is a <DT> tag followed by a link and zero or more descriptions
+bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
+bookmark_tag.addParseAction(extract_bookmark)
+
+
+def parse(html: str) -> [NetscapeBookmark]:
+    """
+    Search the given HTML for bookmark entries and convert each match into a NetscapeBookmark.
+    :param html: content of a Netscape bookmark HTML file
+    :return: list of bookmarks in the order in which they were matched
+    """
+    # Each search result wraps the dictionary produced by extract_bookmark as its first item
+    found = (match[0] for match in bookmark_tag.searchString(html))
+
+    return [
+        NetscapeBookmark(
+            href=entry['link']['href'],
+            title=entry['link']['title'],
+            description=entry['description'],
+            tag_string=entry['link']['tag_string'],
+            date_added=entry['link']['date_added'],
+        )
+        for entry in found
+    ]
--- a/bookmarks/views/settings.py
+++ b/bookmarks/views/settings.py
@@ -35,7 +35,7 @@ def bookmark_import(request):
        return HttpResponseRedirect(reverse('bookmarks:settings.index'))

    try:
-        content = import_file.read()
+        content = import_file.read().decode()
        result = import_netscape_html(content, request.user)
        success_msg = str(result.success) + ' bookmarks were successfully imported.'
        messages.success(request, success_msg, 'bookmark_import_success')