Implement custom Netscape file parser (#51)

* Implement custom Netscape file parser (#50)
* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
2025-09-24 09:59:33 +02:00 · 2020-12-31 07:02:28 +01:00
parent 50a825b3ca
commit 779de41b65
7 changed files with 111 additions and 51 deletions
--- a/bookmarks/services/importer.py
+++ b/bookmarks/services/importer.py
@@ -2,11 +2,10 @@ import logging
 from dataclasses import dataclass
 from datetime import datetime

-import bs4
-from bs4 import BeautifulSoup
 from django.contrib.auth.models import User

 from bookmarks.models import Bookmark, parse_tag_string
+from bookmarks.services.parser import parse, NetscapeBookmark
 from bookmarks.services.tags import get_or_create_tags

 logger = logging.getLogger(__name__)
@@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
    result = ImportResult()

    try:
-        soup = BeautifulSoup(html, 'html.parser')
+        netscape_bookmarks = parse(html)
    except:
        logging.exception('Could not read bookmarks file.')
        raise

-    bookmark_tags = soup.find_all('dt')
-
-    for bookmark_tag in bookmark_tags:
+    for netscape_bookmark in netscape_bookmarks:
        result.total = result.total + 1
        try:
-            _import_bookmark_tag(bookmark_tag, user)
+            _import_bookmark_tag(netscape_bookmark, user)
            result.success = result.success + 1
        except:
-            shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
+            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
            result.failed = result.failed + 1

    return result


-def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
-    link_tag = bookmark_tag.a
-
-    if link_tag is None:
-        return
-
+def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
    # Either modify existing bookmark for the URL or create new one
-    url = link_tag['href']
-    description = _extract_description(bookmark_tag)
-    bookmark = _get_or_create_bookmark(url, user)
+    bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)

-    bookmark.url = url
-    add_date = link_tag.get('add_date', datetime.now().timestamp())
-    bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()
+    bookmark.url = netscape_bookmark.href
+    bookmark.date_added = datetime.utcfromtimestamp(int(netscape_bookmark.date_added)).astimezone()
    bookmark.date_modified = bookmark.date_added
-    bookmark.unread = link_tag.get('toread', '0') == '1'
-    bookmark.title = link_tag.string
-    if description:
-        bookmark.description = description
+    bookmark.unread = False
+    bookmark.title = netscape_bookmark.title
+    if netscape_bookmark.description:
+        bookmark.description = netscape_bookmark.description
    bookmark.owner = user

    bookmark.save()

    # Set tags
-    tag_string = link_tag.get('tags', '')
-    tag_names = parse_tag_string(tag_string)
+    tag_names = parse_tag_string(netscape_bookmark.tag_string)
    tags = get_or_create_tags(tag_names, user)

    bookmark.tags.set(tags)
@@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
        return Bookmark.objects.get(url=url, owner=user)
    except Bookmark.DoesNotExist:
        return Bookmark()
-
-
-def _extract_description(bookmark_tag: bs4.Tag):
-    """
-    Since the Netscape HTML format has no closing tags, all following bookmark tags are part of the description tag
-    so to extract the description text we have to get creative. For now we combine the text of all text nodes until we
-    detect a <dt> tag which indicates a new bookmark
-    :param bookmark_tag:
-    :return:
-    """
-    description_tag = bookmark_tag.find('dd', recursive=False)
-
-    if description_tag is None:
-        return None
-
-    description = ''
-
-    for content in description_tag.contents:
-        if type(content) is bs4.element.Tag and content.name == 'dt':
-            break
-        if type(content) is bs4.element.NavigableString:
-            description += content
-
-    return description.strip()