mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-08-09 19:57:49 +02:00
Implement custom netscape file parser (#51)
* Implement custom Netscape file parser (#50)
* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
This commit is contained in:
@@ -2,11 +2,10 @@ import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
from bookmarks.models import Bookmark, parse_tag_string
|
||||
from bookmarks.services.parser import parse, NetscapeBookmark
|
||||
from bookmarks.services.tags import get_or_create_tags
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
|
||||
result = ImportResult()
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
netscape_bookmarks = parse(html)
|
||||
except:
|
||||
logging.exception('Could not read bookmarks file.')
|
||||
raise
|
||||
|
||||
bookmark_tags = soup.find_all('dt')
|
||||
|
||||
for bookmark_tag in bookmark_tags:
|
||||
for netscape_bookmark in netscape_bookmarks:
|
||||
result.total = result.total + 1
|
||||
try:
|
||||
_import_bookmark_tag(bookmark_tag, user)
|
||||
_import_bookmark_tag(netscape_bookmark, user)
|
||||
result.success = result.success + 1
|
||||
except:
|
||||
shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
|
||||
shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
|
||||
logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
|
||||
result.failed = result.failed + 1
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
|
||||
link_tag = bookmark_tag.a
|
||||
|
||||
if link_tag is None:
|
||||
return
|
||||
|
||||
def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
||||
# Either modify existing bookmark for the URL or create new one
|
||||
url = link_tag['href']
|
||||
description = _extract_description(bookmark_tag)
|
||||
bookmark = _get_or_create_bookmark(url, user)
|
||||
bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)
|
||||
|
||||
bookmark.url = url
|
||||
add_date = link_tag.get('add_date', datetime.now().timestamp())
|
||||
bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()
|
||||
bookmark.url = netscape_bookmark.href
|
||||
bookmark.date_added = datetime.utcfromtimestamp(int(netscape_bookmark.date_added)).astimezone()
|
||||
bookmark.date_modified = bookmark.date_added
|
||||
bookmark.unread = link_tag.get('toread', '0') == '1'
|
||||
bookmark.title = link_tag.string
|
||||
if description:
|
||||
bookmark.description = description
|
||||
bookmark.unread = False
|
||||
bookmark.title = netscape_bookmark.title
|
||||
if netscape_bookmark.description:
|
||||
bookmark.description = netscape_bookmark.description
|
||||
bookmark.owner = user
|
||||
|
||||
bookmark.save()
|
||||
|
||||
# Set tags
|
||||
tag_string = link_tag.get('tags', '')
|
||||
tag_names = parse_tag_string(tag_string)
|
||||
tag_names = parse_tag_string(netscape_bookmark.tag_string)
|
||||
tags = get_or_create_tags(tag_names, user)
|
||||
|
||||
bookmark.tags.set(tags)
|
||||
@@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
|
||||
return Bookmark.objects.get(url=url, owner=user)
|
||||
except Bookmark.DoesNotExist:
|
||||
return Bookmark()
|
||||
|
||||
|
||||
def _extract_description(bookmark_tag: bs4.Tag):
    """
    Return the description text attached to a single bookmark tag.

    The Netscape HTML format has no closing tags, so every bookmark that
    follows ends up nested inside the description (<dd>) tag. To recover only
    this bookmark's description, the direct text nodes of the <dd> tag are
    concatenated until a <dt> tag is reached, which marks the next bookmark.

    :param bookmark_tag: tag that is searched for a direct <dd> child
    :return: the stripped description text, or None if there is no direct
        <dd> child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        # Exact type comparisons on purpose: subclasses of Tag or
        # NavigableString are neither treated as a boundary nor collected.
        node_type = type(node)
        if node_type is bs4.element.Tag:
            if node.name == 'dt':
                # A <dt> starts the next bookmark, the description ends here
                break
            continue
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()
|
||||
|
Reference in New Issue
Block a user