mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-08-13 13:39:27 +02:00
Implement custom netscape file parser (#51)
* Implement custom Netscape file parser (#50)
* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
This commit is contained in:
@@ -2,11 +2,10 @@ import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
from bookmarks.models import Bookmark, parse_tag_string
|
||||
from bookmarks.services.parser import parse, NetscapeBookmark
|
||||
from bookmarks.services.tags import get_or_create_tags
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
|
||||
result = ImportResult()
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
netscape_bookmarks = parse(html)
|
||||
except:
|
||||
logging.exception('Could not read bookmarks file.')
|
||||
raise
|
||||
|
||||
bookmark_tags = soup.find_all('dt')
|
||||
|
||||
for bookmark_tag in bookmark_tags:
|
||||
for netscape_bookmark in netscape_bookmarks:
|
||||
result.total = result.total + 1
|
||||
try:
|
||||
_import_bookmark_tag(bookmark_tag, user)
|
||||
_import_bookmark_tag(netscape_bookmark, user)
|
||||
result.success = result.success + 1
|
||||
except:
|
||||
shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
|
||||
shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
|
||||
logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
|
||||
result.failed = result.failed + 1
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
|
||||
link_tag = bookmark_tag.a
|
||||
|
||||
if link_tag is None:
|
||||
return
|
||||
|
||||
def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
|
||||
# Either modify existing bookmark for the URL or create new one
|
||||
url = link_tag['href']
|
||||
description = _extract_description(bookmark_tag)
|
||||
bookmark = _get_or_create_bookmark(url, user)
|
||||
bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)
|
||||
|
||||
bookmark.url = url
|
||||
add_date = link_tag.get('add_date', datetime.now().timestamp())
|
||||
bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()
|
||||
bookmark.url = netscape_bookmark.href
|
||||
bookmark.date_added = datetime.utcfromtimestamp(int(netscape_bookmark.date_added)).astimezone()
|
||||
bookmark.date_modified = bookmark.date_added
|
||||
bookmark.unread = link_tag.get('toread', '0') == '1'
|
||||
bookmark.title = link_tag.string
|
||||
if description:
|
||||
bookmark.description = description
|
||||
bookmark.unread = False
|
||||
bookmark.title = netscape_bookmark.title
|
||||
if netscape_bookmark.description:
|
||||
bookmark.description = netscape_bookmark.description
|
||||
bookmark.owner = user
|
||||
|
||||
bookmark.save()
|
||||
|
||||
# Set tags
|
||||
tag_string = link_tag.get('tags', '')
|
||||
tag_names = parse_tag_string(tag_string)
|
||||
tag_names = parse_tag_string(netscape_bookmark.tag_string)
|
||||
tags = get_or_create_tags(tag_names, user)
|
||||
|
||||
bookmark.tags.set(tags)
|
||||
@@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
|
||||
return Bookmark.objects.get(url=url, owner=user)
|
||||
except Bookmark.DoesNotExist:
|
||||
return Bookmark()
|
||||
|
||||
|
||||
def _extract_description(bookmark_tag: bs4.Tag):
    """
    Collect the description text that belongs to a bookmark tag.

    The Netscape HTML format has no closing tags, so the parsed <dd> tag also contains every bookmark that follows
    it. Only the plain text nodes that appear before the first nested <dt> tag (the start of the next bookmark) are
    treated as the description.

    :param bookmark_tag: the <dt> tag of the bookmark
    :return: the stripped description text, or None if the bookmark tag has no direct <dd> child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        node_type = type(node)
        # A nested <dt> marks the next bookmark, everything after it is not part of this description
        if node_type is bs4.element.Tag and node.name == 'dt':
            break
        # Exact type match on purpose: subclasses of NavigableString are not collected
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()
|
||||
|
73
bookmarks/services/parser.py
Normal file
73
bookmarks/services/parser.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from dataclasses import dataclass
from datetime import datetime
from typing import List

import pyparsing as pp
|
||||
|
||||
|
||||
@dataclass
class NetscapeBookmark:
    """A single bookmark entry as produced by parse() from a Netscape bookmark HTML file."""
    # Value of the link's href attribute
    href: str
    # Text content of the link tag
    title: str
    # First description found after the link, empty string if there is none
    description: str
    # Integer timestamp taken from the link's add_date attribute, or the time of parsing if that attribute is
    # missing (presumably seconds since the epoch - confirm against the importer)
    date_added: int
    # Raw value of the link's tags attribute
    tag_string: str
|
||||
|
||||
|
||||
def extract_bookmark_link(tag):
    """
    Parse action that converts a matched bookmark link into a plain dict.

    :param tag: parse results whose first item exposes the link's ``href``, ``text``, ``tags`` and ``add_date``
    :return: dict with the keys ``href``, ``title``, ``tag_string`` and ``date_added`` (integer timestamp)
    """
    link = tag[0]
    raw_add_date = link.add_date

    # A missing or malformed add_date falls back to the current time. Letting int() raise here would propagate
    # out of the parser and abort the whole file instead of affecting only this bookmark.
    try:
        date_added = int(raw_add_date) if raw_add_date else None
    except (TypeError, ValueError):
        date_added = None
    if date_added is None:
        date_added = int(datetime.now().timestamp())

    return {
        'href': link.href,
        'title': link.text,
        'tag_string': link.tags,
        'date_added': date_added
    }
|
||||
|
||||
|
||||
def extract_bookmark(tag):
    """
    Parse action that reduces a matched bookmark to its link and a single description.

    :param tag: parse results whose first item exposes ``link`` and ``description``
    :return: dict with the keys ``link`` and ``description``; the description is the first matched one, or an empty
        string if none was matched
    """
    bookmark = tag[0]
    link = bookmark.link
    descriptions = bookmark.description

    if descriptions:
        first_description = descriptions[0]
    else:
        first_description = ''

    return dict(link=link, description=first_description)
|
||||
|
||||
|
||||
def extract_description(tag):
    """
    Parse action that trims surrounding whitespace from a matched description.

    :param tag: parse results whose first item is the raw description text
    :return: the description text without leading and trailing whitespace
    """
    raw_text = tag[0]
    return raw_text.strip()
|
||||
|
||||
|
||||
# define grammar
# Only the opening tags of <DT> and <DD> are used, the closing tag expressions are discarded
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")
# Link: opening <A> tag, its body stored under the name "text", closing </A> suppressed from the results.
# extract_bookmark_link turns the match into a dict.
bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)
# Description: everything after an opening <DD> up to the next opening or closing tag of any kind.
# extract_description strips the surrounding whitespace.
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)
# Bookmark: opening <DT>, the link, then zero or more descriptions.
# extract_bookmark reduces the match to a dict with the link and a single description.
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)
|
||||
|
||||
|
||||
def parse(html: str) -> List[NetscapeBookmark]:
    """
    Extract all bookmarks from the content of a Netscape bookmark HTML file.

    :param html: content of the bookmarks file
    :return: one NetscapeBookmark per match of the bookmark grammar, in the order the matches were found
    """
    bookmarks = []

    for match in bookmark_tag.searchString(html):
        # The first item of each match is the dict built by the extract_bookmark parse action
        bookmark_match = match[0]
        link = bookmark_match['link']
        bookmarks.append(NetscapeBookmark(
            href=link['href'],
            title=link['title'],
            description=bookmark_match['description'],
            tag_string=link['tag_string'],
            date_added=link['date_added'],
        ))

    return bookmarks
|
@@ -35,7 +35,7 @@ def bookmark_import(request):
|
||||
return HttpResponseRedirect(reverse('bookmarks:settings.index'))
|
||||
|
||||
try:
|
||||
content = import_file.read()
|
||||
content = import_file.read().decode()
|
||||
result = import_netscape_html(content, request.user)
|
||||
success_msg = str(result.success) + ' bookmarks were successfully imported.'
|
||||
messages.success(request, success_msg, 'bookmark_import_success')
|
||||
|
Reference in New Issue
Block a user