mirror of
https://github.com/sissbruecker/linkding.git
synced 2025-09-05 16:56:46 +02:00
Normalize URLs when checking for duplicates (#1169)
* Normalize URLs when checking for duplicates * Improve migration script
This commit is contained in:
@@ -27,6 +27,7 @@ from bookmarks.models import (
|
||||
BookmarkBundle,
|
||||
)
|
||||
from bookmarks.services import assets, bookmarks, bundles, auto_tagging, website_loader
|
||||
from bookmarks.utils import normalize_url
|
||||
from bookmarks.type_defs import HttpRequest
|
||||
from bookmarks.views import access
|
||||
|
||||
@@ -107,7 +108,10 @@ class BookmarkViewSet(
|
||||
def check(self, request: HttpRequest):
|
||||
url = request.GET.get("url")
|
||||
ignore_cache = request.GET.get("ignore_cache", False) in ["true"]
|
||||
bookmark = Bookmark.objects.filter(owner=request.user, url=url).first()
|
||||
normalized_url = normalize_url(url)
|
||||
bookmark = Bookmark.objects.filter(
|
||||
owner=request.user, url_normalized=normalized_url
|
||||
).first()
|
||||
existing_bookmark_data = (
|
||||
self.get_serializer(bookmark).data if bookmark else None
|
||||
)
|
||||
@@ -151,7 +155,10 @@ class BookmarkViewSet(
|
||||
status=status.HTTP_400_BAD_REQUEST,
|
||||
)
|
||||
|
||||
bookmark = Bookmark.objects.filter(owner=request.user, url=url).first()
|
||||
normalized_url = normalize_url(url)
|
||||
bookmark = Bookmark.objects.filter(
|
||||
owner=request.user, url_normalized=normalized_url
|
||||
).first()
|
||||
|
||||
if not bookmark:
|
||||
bookmark = Bookmark(url=url)
|
||||
|
@@ -5,6 +5,7 @@ from bookmarks.models import Bookmark, build_tag_string
|
||||
from bookmarks.validators import BookmarkURLValidator
|
||||
from bookmarks.type_defs import HttpRequest
|
||||
from bookmarks.services.bookmarks import create_bookmark, update_bookmark
|
||||
from bookmarks.utils import normalize_url
|
||||
|
||||
|
||||
class CustomErrorList(ErrorList):
|
||||
@@ -85,8 +86,11 @@ class BookmarkForm(forms.ModelForm):
|
||||
# raise a validation error in that case.
|
||||
url = self.cleaned_data["url"]
|
||||
if self.instance.pk:
|
||||
normalized_url = normalize_url(url)
|
||||
is_duplicate = (
|
||||
Bookmark.objects.filter(owner=self.instance.owner, url=url)
|
||||
Bookmark.objects.filter(
|
||||
owner=self.instance.owner, url_normalized=normalized_url
|
||||
)
|
||||
.exclude(pk=self.instance.pk)
|
||||
.exists()
|
||||
)
|
||||
|
18
bookmarks/migrations/0046_add_url_normalized_field.py
Normal file
18
bookmarks/migrations/0046_add_url_normalized_field.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 5.2.3 on 2025-08-22 08:26
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Schema migration: adds the indexed Bookmark.url_normalized column
    # used for duplicate detection. The column is left blank here; the
    # follow-up data migration (0047) backfills existing rows.

    dependencies = [
        ("bookmarks", "0045_userprofile_hide_bundles_bookmarkbundle"),
    ]

    operations = [
        migrations.AddField(
            model_name="bookmark",
            name="url_normalized",
            # max_length mirrors Bookmark.url; db_index enables fast
            # duplicate lookups by normalized URL.
            field=models.CharField(blank=True, db_index=True, max_length=2048),
        ),
    ]
|
38
bookmarks/migrations/0047_populate_url_normalized_field.py
Normal file
38
bookmarks/migrations/0047_populate_url_normalized_field.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# Generated by Django 5.2.3 on 2025-08-22 08:28
|
||||
|
||||
from django.db import migrations, transaction
|
||||
from bookmarks.utils import normalize_url
|
||||
|
||||
|
||||
def populate_url_normalized(apps, schema_editor):
    """Backfill Bookmark.url_normalized for all existing bookmarks.

    Processes rows in fixed-size batches to bound memory usage and writes
    each batch back with a single bulk_update query.
    """
    Bookmark = apps.get_model("bookmarks", "Bookmark")

    batch_size = 500
    with transaction.atomic():
        # Fix: offset-slicing an unordered queryset is nondeterministic in
        # SQL — without an ORDER BY the database may return rows in any
        # order, so successive batches could overlap or skip rows. An
        # explicit pk ordering makes the batching stable.
        qs = Bookmark.objects.order_by("pk")
        # Evaluate the count once instead of once per loop setup.
        total = qs.count()
        for start in range(0, total, batch_size):
            batch = list(qs[start : start + batch_size])
            for bookmark in batch:
                bookmark.url_normalized = normalize_url(bookmark.url)
            Bookmark.objects.bulk_update(
                batch, ["url_normalized"], batch_size=batch_size
            )
|
||||
|
||||
|
||||
def reverse_populate_url_normalized(apps, schema_editor):
    """Reverse step: blank out url_normalized so 0047 can be re-applied."""
    bookmark_model = apps.get_model("bookmarks", "Bookmark")
    bookmark_model.objects.update(url_normalized="")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Data migration: backfills the url_normalized column added by 0046
    # for bookmarks that existed before the field was introduced.

    dependencies = [
        ("bookmarks", "0046_add_url_normalized_field"),
    ]

    operations = [
        migrations.RunPython(
            populate_url_normalized,
            # Reverse callable allows migrating backwards cleanly.
            reverse_populate_url_normalized,
        ),
    ]
|
@@ -14,7 +14,7 @@ from django.db.models.signals import post_save, post_delete
|
||||
from django.dispatch import receiver
|
||||
from django.http import QueryDict
|
||||
|
||||
from bookmarks.utils import unique
|
||||
from bookmarks.utils import unique, normalize_url
|
||||
from bookmarks.validators import BookmarkURLValidator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -54,6 +54,7 @@ def build_tag_string(tag_names: List[str], delimiter: str = ","):
|
||||
|
||||
class Bookmark(models.Model):
|
||||
url = models.CharField(max_length=2048, validators=[BookmarkURLValidator()])
|
||||
url_normalized = models.CharField(max_length=2048, blank=True, db_index=True)
|
||||
title = models.CharField(max_length=512, blank=True)
|
||||
description = models.TextField(blank=True)
|
||||
notes = models.TextField(blank=True)
|
||||
@@ -96,6 +97,10 @@ class Bookmark(models.Model):
|
||||
names = [tag.name for tag in self.tags.all()]
|
||||
return sorted(names)
|
||||
|
||||
def save(self, *args, **kwargs):
    """Persist the bookmark, refreshing the normalized-URL lookup field."""
    # Recompute on every save so url_normalized can never drift from url.
    self.url_normalized = normalize_url(self.url)
    super().save(*args, **kwargs)
|
||||
|
||||
def __str__(self):
|
||||
return self.resolved_title + " (" + self.url[:30] + "...)"
|
||||
|
||||
|
@@ -4,6 +4,7 @@ from typing import Union
|
||||
from django.utils import timezone
|
||||
|
||||
from bookmarks.models import Bookmark, User, parse_tag_string
|
||||
from bookmarks.utils import normalize_url
|
||||
from bookmarks.services import auto_tagging
|
||||
from bookmarks.services import tasks
|
||||
from bookmarks.services import website_loader
|
||||
@@ -19,8 +20,9 @@ def create_bookmark(
|
||||
disable_html_snapshot: bool = False,
|
||||
):
|
||||
# If URL is already bookmarked, then update it
|
||||
normalized_url = normalize_url(bookmark.url)
|
||||
existing_bookmark: Bookmark = Bookmark.objects.filter(
|
||||
owner=current_user, url=bookmark.url
|
||||
owner=current_user, url_normalized=normalized_url
|
||||
).first()
|
||||
|
||||
if existing_bookmark is not None:
|
||||
|
@@ -188,6 +188,25 @@ class BookmarkEditViewTestCase(TestCase, BookmarkFactoryMixin):
|
||||
edited_bookmark.refresh_from_db()
|
||||
self.assertNotEqual(edited_bookmark.url, existing_bookmark.url)
|
||||
|
||||
def test_should_prevent_duplicate_normalized_urls(self):
    """Editing a bookmark to a URL that normalizes to another bookmark's
    URL must be rejected with a validation error (HTTP 422) and must not
    modify the edited bookmark."""
    # Existing bookmark whose URL differs only in case / trailing slash /
    # query-parameter order from the URL submitted below.
    self.setup_bookmark(url="https://EXAMPLE.COM/path/?z=1&a=2")

    edited_bookmark = self.setup_bookmark(url="http://different.com")

    form_data = self.create_form_data({"url": "https://example.com/path?a=2&z=1"})
    response = self.client.post(
        reverse("linkding:bookmarks.edit", args=[edited_bookmark.id]), form_data
    )

    self.assertEqual(response.status_code, 422)
    self.assertInHTML(
        "<li>A bookmark with this URL already exists.</li>",
        response.content.decode(),
    )

    # The edited bookmark must be unchanged after the rejected submit.
    edited_bookmark.refresh_from_db()
    self.assertEqual(edited_bookmark.url, "http://different.com")
|
||||
|
||||
def test_should_redirect_to_return_url(self):
|
||||
bookmark = self.setup_bookmark()
|
||||
form_data = self.create_form_data()
|
||||
|
@@ -1047,6 +1047,29 @@ class BookmarksApiTestCase(LinkdingApiTestCase, BookmarkFactoryMixin):
|
||||
self.assertEqual(expected_metadata.description, metadata["description"])
|
||||
self.assertEqual(expected_metadata.preview_image, metadata["preview_image"])
|
||||
|
||||
def test_check_returns_bookmark_using_normalized_url(self):
    """The check endpoint must match bookmarks by normalized URL, not by
    the exact URL string supplied by the client."""
    self.authenticate()

    # Create bookmark with one URL variant
    bookmark = self.setup_bookmark(
        url="https://EXAMPLE.COM/path/?z=1&a=2",
        title="Example title",
        description="Example description",
    )

    # Check with different URL variant that should normalize to the same URL
    url = reverse("linkding:bookmark-check")
    check_url = urllib.parse.quote_plus("https://example.com/path?a=2&z=1")
    response = self.get(
        f"{url}?url={check_url}", expected_status_code=status.HTTP_200_OK
    )
    bookmark_data = response.data["bookmark"]

    # Should find the existing bookmark despite URL differences
    self.assertIsNotNone(bookmark_data)
    self.assertEqual(bookmark.id, bookmark_data["id"])
    self.assertEqual(bookmark.title, bookmark_data["title"])
|
||||
def test_check_returns_no_auto_tags_if_none_configured(self):
|
||||
self.authenticate()
|
||||
|
||||
|
@@ -95,6 +95,41 @@ class BookmarkServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
# Saving a duplicate bookmark should not modify archive flag - right?
|
||||
self.assertFalse(updated_bookmark.is_archived)
|
||||
|
||||
def test_create_should_update_existing_bookmark_with_normalized_url(
    self,
):
    """create_bookmark must treat URLs that normalize identically as the
    same bookmark and update the existing one instead of duplicating."""
    existing = self.setup_bookmark(
        url="https://EXAMPLE.com/path/?a=1&z=2", unread=False, shared=False
    )
    new_data = Bookmark(
        url="HTTPS://example.com/path?z=2&a=1",
        title="Updated Title",
        description="Updated description",
    )
    result = create_bookmark(new_data, "", self.get_or_create_test_user())

    # No new row was created; the original bookmark got the new title.
    self.assertEqual(Bookmark.objects.count(), 1)
    self.assertEqual(result.id, existing.id)
    self.assertEqual(result.title, new_data.title)
||||
|
||||
def test_create_should_populate_url_normalized_field(self):
    """Creating a bookmark stores the raw URL untouched while filling
    url_normalized with the canonical variant."""
    data = Bookmark(
        url="https://EXAMPLE.COM/path/?z=1&a=2",
        title="Test Title",
        description="Test description",
    )
    created = create_bookmark(data, "", self.get_or_create_test_user())

    created.refresh_from_db()
    self.assertEqual(created.url, "https://EXAMPLE.COM/path/?z=1&a=2")
    self.assertEqual(created.url_normalized, "https://example.com/path?a=2&z=1")
|
||||
|
||||
def test_create_should_create_web_archive_snapshot(self):
|
||||
with patch.object(
|
||||
tasks, "create_web_archive_snapshot"
|
||||
|
@@ -8,6 +8,7 @@ from bookmarks.utils import (
|
||||
humanize_absolute_date,
|
||||
humanize_relative_date,
|
||||
parse_timestamp,
|
||||
normalize_url,
|
||||
)
|
||||
|
||||
|
||||
@@ -182,3 +183,181 @@ class UtilsTestCase(TestCase):
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
self.verify_timestamp(now, 1000000000)
|
||||
|
||||
def test_normalize_url_trailing_slash_handling(self):
    """Trailing slashes are stripped from paths during normalization."""
    cases = [
        ("https://example.com/", "https://example.com"),
        ("https://example.com/path/", "https://example.com/path"),
        ("https://example.com/path/to/page/", "https://example.com/path/to/page"),
        ("https://example.com/path", "https://example.com/path"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_query_parameters(self):
    """Query parameters are sorted alphabetically; empty queries drop."""
    cases = [
        ("https://example.com?z=1&a=2", "https://example.com?a=2&z=1"),
        ("https://example.com?c=3&b=2&a=1", "https://example.com?a=1&b=2&c=3"),
        ("https://example.com?param=value", "https://example.com?param=value"),
        ("https://example.com?", "https://example.com"),
        (
            "https://example.com?empty=&filled=value",
            "https://example.com?empty=&filled=value",
        ),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_case_sensitivity(self):
    """Scheme and host are lowercased; path case is preserved."""
    cases = [
        (
            "https://EXAMPLE.com/Path/To/Page",
            "https://example.com/Path/To/Page",
        ),
        ("https://EXAMPLE.COM/API/v1/Users", "https://example.com/API/v1/Users"),
        ("HTTPS://EXAMPLE.COM/path", "https://example.com/path"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_special_characters_and_encoding(self):
    """Percent-encoded and non-ASCII characters survive unchanged."""
    cases = [
        (
            "https://example.com/path%20with%20spaces",
            "https://example.com/path%20with%20spaces",
        ),
        ("https://example.com/caf%C3%A9", "https://example.com/caf%C3%A9"),
        (
            "https://example.com/path?q=hello%20world",
            "https://example.com/path?q=hello%20world",
        ),
        ("https://example.com/pàth", "https://example.com/pàth"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_various_protocols(self):
    """Normalization is scheme-agnostic and lowercases the scheme."""
    cases = [
        ("FTP://example.com", "ftp://example.com"),
        ("HTTP://EXAMPLE.COM", "http://example.com"),
        ("https://example.com", "https://example.com"),
        ("file:///path/to/file", "file:///path/to/file"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_port_handling(self):
    """Explicit ports are kept verbatim, even scheme defaults (80/443)."""
    cases = [
        ("https://example.com:8080", "https://example.com:8080"),
        ("https://EXAMPLE.COM:8080", "https://example.com:8080"),
        ("http://example.com:80", "http://example.com:80"),
        ("https://example.com:443", "https://example.com:443"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_authentication_handling(self):
    """Userinfo (user / user:password) is preserved; host is lowercased."""
    cases = [
        ("https://user:pass@EXAMPLE.COM", "https://user:pass@example.com"),
        ("https://user@EXAMPLE.COM", "https://user@example.com"),
        ("ftp://admin:secret@EXAMPLE.COM", "ftp://admin:secret@example.com"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_fragment_handling(self):
    """Non-empty fragments are kept as-is; an empty '#' is dropped."""
    cases = [
        ("https://example.com#", "https://example.com"),
        ("https://example.com#section", "https://example.com#section"),
        ("https://EXAMPLE.COM/path#Section", "https://example.com/path#Section"),
        ("https://EXAMPLE.COM/path/#Section", "https://example.com/path#Section"),
        ("https://example.com?a=1#fragment", "https://example.com?a=1#fragment"),
        (
            "https://example.com?z=2&a=1#fragment",
            "https://example.com?a=1&z=2#fragment",
        ),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_edge_cases(self):
    """Empty/blank input yields '', whitespace is stripped, and
    unparseable strings pass through unchanged."""
    cases = [
        ("", ""),
        (" ", ""),
        (" https://example.com ", "https://example.com"),
        ("not-a-url", "not-a-url"),
        ("://invalid", "://invalid"),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
||||
def test_normalize_url_internationalized_domain_names(self):
    """IDN hosts (punycode or raw unicode) survive normalization."""
    test_cases = [
        (
            "https://xn--fsq.xn--0zwm56d",
            "https://xn--fsq.xn--0zwm56d",
        ),
        ("https://测试.中国", "https://测试.中国"),
    ]

    for original, expected in test_cases:
        with self.subTest(url=original):
            result = normalize_url(original)
            # All expected values above are already lowercase, so compare
            # directly — the previous `expected.lower() if expected else
            # expected` wrapper was a redundant no-op that obscured the
            # assertion.
            self.assertEqual(expected, result)
|
||||
|
||||
def test_normalize_url_complex_query_parameters(self):
    """Repeated keys are sorted stably and encoded values are kept."""
    cases = [
        (
            "https://example.com?z=1&a=2&z=3&b=4",
            "https://example.com?a=2&b=4&z=1&z=3",  # Multiple values for same key
        ),
        (
            "https://example.com?param=value1&param=value2",
            "https://example.com?param=value1&param=value2",
        ),
        (
            "https://example.com?special=%21%40%23%24%25",
            "https://example.com?special=%21%40%23%24%25",
        ),
    ]

    for raw, want in cases:
        with self.subTest(url=raw):
            self.assertEqual(want, normalize_url(raw))
|
||||
|
@@ -139,3 +139,49 @@ def generate_username(email, claims):
|
||||
else:
|
||||
username = email
|
||||
return unicodedata.normalize("NFKC", username)[:150]
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for duplicate detection.

    Normalization: lowercase scheme and host (userinfo and port kept
    verbatim), strip trailing slashes from the path, sort query
    parameters alphabetically, keep the fragment as-is. Blank input
    yields ""; input that cannot be parsed is returned stripped but
    otherwise unchanged.
    """
    if not url or not isinstance(url, str):
        return ""

    stripped = url.strip()
    if not stripped:
        return ""

    try:
        parts = urllib.parse.urlparse(stripped)

        # Scheme: case-insensitive per RFC 3986, so lowercase it.
        scheme = parts.scheme.lower()

        # Host is case-insensitive; userinfo and port are not altered.
        # Note: accessing parts.port may raise ValueError for invalid
        # ports, which is handled below.
        netloc = (parts.hostname or "").lower()
        if parts.port:
            netloc = f"{netloc}:{parts.port}"
        if parts.username:
            credentials = parts.username
            if parts.password:
                credentials = f"{credentials}:{parts.password}"
            netloc = f"{credentials}@{netloc}"

        # Treat "/path/" and "/path" as equivalent.
        path = parts.path.rstrip("/")

        # Sort query pairs so parameter order never causes a mismatch.
        if parts.query:
            pairs = sorted(
                urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
            )
            query = urllib.parse.urlencode(pairs, quote_via=urllib.parse.quote)
        else:
            query = ""

        return urllib.parse.urlunparse(
            (scheme, netloc, path, parts.params, query, parts.fragment)
        )
    except (ValueError, AttributeError):
        # Unparseable input: fall back to the stripped original so the
        # caller can still store/compare something sensible.
        return stripped
|
||||
|
Reference in New Issue
Block a user