diff --git a/bookmarks/services/website_loader.py b/bookmarks/services/website_loader.py
index 5d00f6d..fef0d5b 100644
--- a/bookmarks/services/website_loader.py
+++ b/bookmarks/services/website_loader.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_bytes
 
 
 @dataclass
@@ -33,5 +34,17 @@ def load_website_metadata(url: str):
 
 
 def load_page(url: str):
-    r = requests.get(url)
-    return r.text
+    """Fetch ``url`` and return the response body decoded to text."""
+    r = requests.get(url, timeout=10)
+
+    # Use charset_normalizer to determine the encoding that best matches the response content.
+    # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead.
+    # This is different from Response.text, which respects the encoding specified in the response first,
+    # before trying to determine one.
+    best_match = from_bytes(r.content).best()
+
+    # best() returns None when no plausible encoding is found (e.g. binary content);
+    # str(None) would be the literal text "None", so fall back to requests' own decoding.
+    if best_match is None:
+        return r.text
+    return str(best_match)