Fix website loader content encoding detection (#482)

This commit is contained in:
Sascha Ißbrücker
2023-05-30 22:04:54 +02:00
committed by GitHub
parent 5d48c64b2b
commit 4220ea0b4c
2 changed files with 16 additions and 2 deletions

View File

@@ -71,8 +71,10 @@ def load_page(url: str):
 logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
 # Stop reading if we have parsed end of head tag
-if '</head>'.encode('utf-8') in content:
+end_of_head = '</head>'.encode('utf-8')
+if end_of_head in content:
     logger.debug(f'Found closing head tag after {size} bytes')
+    content = content.split(end_of_head)[0] + end_of_head
     break
 # Stop reading if we exceed limit
 if size > MAX_CONTENT_LIMIT: