From 937858cf587812995cdb3ea574d8f83fd8f3b345 Mon Sep 17 00:00:00 2001
From: Taku Izumi <32434502+kgh02017@users.noreply.github.com>
Date: Wed, 25 Aug 2021 17:16:23 +0900
Subject: [PATCH] Fix website scraper decoding content incorrectly (#126)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Avoid stall on web scraping

This patch fixes a stall during web scraping. I encountered a stall
(scraping never ends) when adding a bookmark for certain sites. Passing
a timeout parameter to requests.get() avoids this case.

Signed-off-by: Taku Izumi

* Avoid character corruption when scraping some Japanese sites

This patch fixes character corruption when scraping some Japanese sites
by using r.content instead of r.text in the load_page function.

The corruption appears to be an encoding problem: r.text decodes the
response as text using the encoding requests assumes for it, so if the
site's actual charset differs, the decoded text is corrupted. r.content
returns the raw bytes, which sidesteps the decoding problem.

Signed-off-by: Taku Izumi

* use charset_normalizer to determine response encoding

Co-authored-by: Taku Izumi
Co-authored-by: Sascha Ißbrücker
---
 bookmarks/services/website_loader.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/bookmarks/services/website_loader.py b/bookmarks/services/website_loader.py
index 5d00f6d..fef0d5b 100644
--- a/bookmarks/services/website_loader.py
+++ b/bookmarks/services/website_loader.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_bytes
 
 
 @dataclass
@@ -33,5 +34,11 @@ def load_website_metadata(url: str):
 
 
 def load_page(url: str):
-    r = requests.get(url)
-    return r.text
+    r = requests.get(url, timeout=10)
+
+    # Use charset_normalizer to determine encoding that best matches the response content
+    # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
+    # This is different from Response.text which does respect the encoding specified in the response first,
+    # before trying to determine one
+    results = from_bytes(r.content)
+    return str(results.best())
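
For reference, a minimal sketch (not part of the patch) of the behaviour
the commit message describes. The EUC-JP payload below is an illustrative
assumption, standing in for a Japanese page served without a correct
charset declaration:

    from charset_normalizer import from_bytes

    # An assumed example payload: Japanese text encoded as EUC-JP.
    payload = "日本語のページへようこそ。".encode("euc_jp")

    # What Response.text effectively did before this patch when a server
    # omits or mislabels the charset: requests falls back to ISO-8859-1
    # for text/* responses, producing mojibake.
    print(payload.decode("iso-8859-1"))

    # What the patched load_page does instead: analyse the raw bytes and
    # decode with the best-matching encoding charset_normalizer finds.
    best = from_bytes(payload).best()
    if best is not None:
        print(best.encoding)  # likely "euc_jp" (or a compatible codec)
        print(str(best))      # the correctly decoded text

Note that results.best() in the patched load_page can return None for
content charset_normalizer cannot classify, in which case str() yields
the literal string "None"; the sketch guards against that case explicitly.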