diff --git a/bookmarks/api/routes.py b/bookmarks/api/routes.py index 562d007..649d581 100644 --- a/bookmarks/api/routes.py +++ b/bookmarks/api/routes.py @@ -27,6 +27,7 @@ from bookmarks.models import ( BookmarkBundle, ) from bookmarks.services import assets, bookmarks, bundles, auto_tagging, website_loader +from bookmarks.utils import normalize_url from bookmarks.type_defs import HttpRequest from bookmarks.views import access @@ -107,7 +108,10 @@ class BookmarkViewSet( def check(self, request: HttpRequest): url = request.GET.get("url") ignore_cache = request.GET.get("ignore_cache", False) in ["true"] - bookmark = Bookmark.objects.filter(owner=request.user, url=url).first() + normalized_url = normalize_url(url) + bookmark = Bookmark.objects.filter( + owner=request.user, url_normalized=normalized_url + ).first() existing_bookmark_data = ( self.get_serializer(bookmark).data if bookmark else None ) @@ -151,7 +155,10 @@ class BookmarkViewSet( status=status.HTTP_400_BAD_REQUEST, ) - bookmark = Bookmark.objects.filter(owner=request.user, url=url).first() + normalized_url = normalize_url(url) + bookmark = Bookmark.objects.filter( + owner=request.user, url_normalized=normalized_url + ).first() if not bookmark: bookmark = Bookmark(url=url) diff --git a/bookmarks/forms.py b/bookmarks/forms.py index 5aa3efc..c571d0f 100644 --- a/bookmarks/forms.py +++ b/bookmarks/forms.py @@ -5,6 +5,7 @@ from bookmarks.models import Bookmark, build_tag_string from bookmarks.validators import BookmarkURLValidator from bookmarks.type_defs import HttpRequest from bookmarks.services.bookmarks import create_bookmark, update_bookmark +from bookmarks.utils import normalize_url class CustomErrorList(ErrorList): @@ -85,8 +86,11 @@ class BookmarkForm(forms.ModelForm): # raise a validation error in that case. url = self.cleaned_data["url"] if self.instance.pk: + normalized_url = normalize_url(url) is_duplicate = ( - Bookmark.objects.filter(owner=self.instance.owner, url=url) + Bookmark.objects.filter( + owner=self.instance.owner, url_normalized=normalized_url + ) .exclude(pk=self.instance.pk) .exists() ) diff --git a/bookmarks/migrations/0046_add_url_normalized_field.py b/bookmarks/migrations/0046_add_url_normalized_field.py new file mode 100644 index 0000000..2b689ad --- /dev/null +++ b/bookmarks/migrations/0046_add_url_normalized_field.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.3 on 2025-08-22 08:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookmarks", "0045_userprofile_hide_bundles_bookmarkbundle"), + ] + + operations = [ + migrations.AddField( + model_name="bookmark", + name="url_normalized", + field=models.CharField(blank=True, db_index=True, max_length=2048), + ), + ] diff --git a/bookmarks/migrations/0047_populate_url_normalized_field.py b/bookmarks/migrations/0047_populate_url_normalized_field.py new file mode 100644 index 0000000..bb4fae2 --- /dev/null +++ b/bookmarks/migrations/0047_populate_url_normalized_field.py @@ -0,0 +1,38 @@ +# Generated by Django 5.2.3 on 2025-08-22 08:28 + +from django.db import migrations, transaction +from bookmarks.utils import normalize_url + + +def populate_url_normalized(apps, schema_editor): + Bookmark = apps.get_model("bookmarks", "Bookmark") + + batch_size = 500 + with transaction.atomic(): + qs = Bookmark.objects.all() + for start in range(0, qs.count(), batch_size): + batch = list(qs[start : start + batch_size]) + for bookmark in batch: + bookmark.url_normalized = normalize_url(bookmark.url) + Bookmark.objects.bulk_update( + batch, ["url_normalized"], batch_size=batch_size + ) + + +def reverse_populate_url_normalized(apps, schema_editor): + Bookmark = apps.get_model("bookmarks", "Bookmark") + Bookmark.objects.all().update(url_normalized="") + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookmarks", "0046_add_url_normalized_field"), + ] + + operations = [ + migrations.RunPython( + populate_url_normalized, + reverse_populate_url_normalized, + ), + ] diff --git a/bookmarks/models.py b/bookmarks/models.py index fe7efce..2e860d7 100644 --- a/bookmarks/models.py +++ b/bookmarks/models.py @@ -14,7 +14,7 @@ from django.db.models.signals import post_save, post_delete from django.dispatch import receiver from django.http import QueryDict -from bookmarks.utils import unique +from bookmarks.utils import unique, normalize_url from bookmarks.validators import BookmarkURLValidator logger = logging.getLogger(__name__) @@ -54,6 +54,7 @@ def build_tag_string(tag_names: List[str], delimiter: str = ","): class Bookmark(models.Model): url = models.CharField(max_length=2048, validators=[BookmarkURLValidator()]) + url_normalized = models.CharField(max_length=2048, blank=True, db_index=True) title = models.CharField(max_length=512, blank=True) description = models.TextField(blank=True) notes = models.TextField(blank=True) @@ -96,6 +97,10 @@ class Bookmark(models.Model): names = [tag.name for tag in self.tags.all()] return sorted(names) + def save(self, *args, **kwargs): + self.url_normalized = normalize_url(self.url) + super().save(*args, **kwargs) + def __str__(self): return self.resolved_title + " (" + self.url[:30] + "...)" diff --git a/bookmarks/services/bookmarks.py b/bookmarks/services/bookmarks.py index 82b1fb2..923ba87 100644 --- a/bookmarks/services/bookmarks.py +++ b/bookmarks/services/bookmarks.py @@ -4,6 +4,7 @@ from typing import Union from django.utils import timezone from bookmarks.models import Bookmark, User, parse_tag_string +from bookmarks.utils import normalize_url from bookmarks.services import auto_tagging from bookmarks.services import tasks from bookmarks.services import website_loader @@ -19,8 +20,9 @@ def create_bookmark( disable_html_snapshot: bool = False, ): # If URL is already bookmarked, then update it + normalized_url = normalize_url(bookmark.url) existing_bookmark: Bookmark = Bookmark.objects.filter( - owner=current_user, url=bookmark.url + owner=current_user, url_normalized=normalized_url ).first() if existing_bookmark is not None: diff --git a/bookmarks/tests/test_bookmark_edit_view.py b/bookmarks/tests/test_bookmark_edit_view.py index bec1ee4..da7dc67 100644 --- a/bookmarks/tests/test_bookmark_edit_view.py +++ b/bookmarks/tests/test_bookmark_edit_view.py @@ -188,6 +188,25 @@ class BookmarkEditViewTestCase(TestCase, BookmarkFactoryMixin): edited_bookmark.refresh_from_db() self.assertNotEqual(edited_bookmark.url, existing_bookmark.url) + def test_should_prevent_duplicate_normalized_urls(self): + self.setup_bookmark(url="https://EXAMPLE.COM/path/?z=1&a=2") + + edited_bookmark = self.setup_bookmark(url="http://different.com") + + form_data = self.create_form_data({"url": "https://example.com/path?a=2&z=1"}) + response = self.client.post( + reverse("linkding:bookmarks.edit", args=[edited_bookmark.id]), form_data + ) + + self.assertEqual(response.status_code, 422) + self.assertInHTML( + "
  • A bookmark with this URL already exists.
  • ", + response.content.decode(), + ) + + edited_bookmark.refresh_from_db() + self.assertEqual(edited_bookmark.url, "http://different.com") + def test_should_redirect_to_return_url(self): bookmark = self.setup_bookmark() form_data = self.create_form_data() diff --git a/bookmarks/tests/test_bookmarks_api.py b/bookmarks/tests/test_bookmarks_api.py index 5f22d03..ea77a0c 100644 --- a/bookmarks/tests/test_bookmarks_api.py +++ b/bookmarks/tests/test_bookmarks_api.py @@ -1047,6 +1047,29 @@ class BookmarksApiTestCase(LinkdingApiTestCase, BookmarkFactoryMixin): self.assertEqual(expected_metadata.description, metadata["description"]) self.assertEqual(expected_metadata.preview_image, metadata["preview_image"]) + def test_check_returns_bookmark_using_normalized_url(self): + self.authenticate() + + # Create bookmark with one URL variant + bookmark = self.setup_bookmark( + url="https://EXAMPLE.COM/path/?z=1&a=2", + title="Example title", + description="Example description", + ) + + # Check with different URL variant that should normalize to the same URL + url = reverse("linkding:bookmark-check") + check_url = urllib.parse.quote_plus("https://example.com/path?a=2&z=1") + response = self.get( + f"{url}?url={check_url}", expected_status_code=status.HTTP_200_OK + ) + bookmark_data = response.data["bookmark"] + + # Should find the existing bookmark despite URL differences + self.assertIsNotNone(bookmark_data) + self.assertEqual(bookmark.id, bookmark_data["id"]) + self.assertEqual(bookmark.title, bookmark_data["title"]) + def test_check_returns_no_auto_tags_if_none_configured(self): self.authenticate() diff --git a/bookmarks/tests/test_bookmarks_service.py b/bookmarks/tests/test_bookmarks_service.py index 42cb78d..a930580 100644 --- a/bookmarks/tests/test_bookmarks_service.py +++ b/bookmarks/tests/test_bookmarks_service.py @@ -95,6 +95,41 @@ class BookmarkServiceTestCase(TestCase, BookmarkFactoryMixin): # Saving a duplicate bookmark should not modify archive flag - right? self.assertFalse(updated_bookmark.is_archived) + def test_create_should_update_existing_bookmark_with_normalized_url( + self, + ): + original_bookmark = self.setup_bookmark( + url="https://EXAMPLE.com/path/?a=1&z=2", unread=False, shared=False + ) + bookmark_data = Bookmark( + url="HTTPS://example.com/path?z=2&a=1", + title="Updated Title", + description="Updated description", + ) + updated_bookmark = create_bookmark( + bookmark_data, "", self.get_or_create_test_user() + ) + + self.assertEqual(Bookmark.objects.count(), 1) + self.assertEqual(updated_bookmark.id, original_bookmark.id) + self.assertEqual(updated_bookmark.title, bookmark_data.title) + + def test_create_should_populate_url_normalized_field(self): + bookmark_data = Bookmark( + url="https://EXAMPLE.COM/path/?z=1&a=2", + title="Test Title", + description="Test description", + ) + created_bookmark = create_bookmark( + bookmark_data, "", self.get_or_create_test_user() + ) + + created_bookmark.refresh_from_db() + self.assertEqual(created_bookmark.url, "https://EXAMPLE.COM/path/?z=1&a=2") + self.assertEqual( + created_bookmark.url_normalized, "https://example.com/path?a=2&z=1" + ) + def test_create_should_create_web_archive_snapshot(self): with patch.object( tasks, "create_web_archive_snapshot" diff --git a/bookmarks/tests/test_utils.py b/bookmarks/tests/test_utils.py index 136cf70..7bdeb38 100644 --- a/bookmarks/tests/test_utils.py +++ b/bookmarks/tests/test_utils.py @@ -8,6 +8,7 @@ from bookmarks.utils import ( humanize_absolute_date, humanize_relative_date, parse_timestamp, + normalize_url, ) @@ -182,3 +183,181 @@ class UtilsTestCase(TestCase): with self.assertRaises(ValueError): self.verify_timestamp(now, 1000000000) + + def test_normalize_url_trailing_slash_handling(self): + test_cases = [ + ("https://example.com/", "https://example.com"), + ( + "https://example.com/path/", + "https://example.com/path", + ), + ("https://example.com/path/to/page/", "https://example.com/path/to/page"), + ( + "https://example.com/path", + "https://example.com/path", + ), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_query_parameters(self): + test_cases = [ + ("https://example.com?z=1&a=2", "https://example.com?a=2&z=1"), + ("https://example.com?c=3&b=2&a=1", "https://example.com?a=1&b=2&c=3"), + ("https://example.com?param=value", "https://example.com?param=value"), + ("https://example.com?", "https://example.com"), + ( + "https://example.com?empty=&filled=value", + "https://example.com?empty=&filled=value", + ), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_case_sensitivity(self): + test_cases = [ + ( + "https://EXAMPLE.com/Path/To/Page", + "https://example.com/Path/To/Page", + ), + ("https://EXAMPLE.COM/API/v1/Users", "https://example.com/API/v1/Users"), + ( + "HTTPS://EXAMPLE.COM/path", + "https://example.com/path", + ), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_special_characters_and_encoding(self): + test_cases = [ + ( + "https://example.com/path%20with%20spaces", + "https://example.com/path%20with%20spaces", + ), + ("https://example.com/caf%C3%A9", "https://example.com/caf%C3%A9"), + ( + "https://example.com/path?q=hello%20world", + "https://example.com/path?q=hello%20world", + ), + ("https://example.com/pàth", "https://example.com/pàth"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_various_protocols(self): + test_cases = [ + ("FTP://example.com", "ftp://example.com"), + ("HTTP://EXAMPLE.COM", "http://example.com"), + ("https://example.com", "https://example.com"), + ("file:///path/to/file", "file:///path/to/file"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_port_handling(self): + test_cases = [ + ("https://example.com:8080", "https://example.com:8080"), + ("https://EXAMPLE.COM:8080", "https://example.com:8080"), + ("http://example.com:80", "http://example.com:80"), + ("https://example.com:443", "https://example.com:443"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_authentication_handling(self): + test_cases = [ + ("https://user:pass@EXAMPLE.COM", "https://user:pass@example.com"), + ("https://user@EXAMPLE.COM", "https://user@example.com"), + ("ftp://admin:secret@EXAMPLE.COM", "ftp://admin:secret@example.com"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_fragment_handling(self): + test_cases = [ + ("https://example.com#", "https://example.com"), + ("https://example.com#section", "https://example.com#section"), + ("https://EXAMPLE.COM/path#Section", "https://example.com/path#Section"), + ("https://EXAMPLE.COM/path/#Section", "https://example.com/path#Section"), + ("https://example.com?a=1#fragment", "https://example.com?a=1#fragment"), + ( + "https://example.com?z=2&a=1#fragment", + "https://example.com?a=1&z=2#fragment", + ), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_edge_cases(self): + test_cases = [ + ("", ""), + (" ", ""), + (" https://example.com ", "https://example.com"), + ("not-a-url", "not-a-url"), + ("://invalid", "://invalid"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) + + def test_normalize_url_internationalized_domain_names(self): + test_cases = [ + ( + "https://xn--fsq.xn--0zwm56d", + "https://xn--fsq.xn--0zwm56d", + ), + ("https://测试.中国", "https://测试.中国"), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected.lower() if expected else expected, result) + + def test_normalize_url_complex_query_parameters(self): + test_cases = [ + ( + "https://example.com?z=1&a=2&z=3&b=4", + "https://example.com?a=2&b=4&z=1&z=3", # Multiple values for same key + ), + ( + "https://example.com?param=value1¶m=value2", + "https://example.com?param=value1¶m=value2", + ), + ( + "https://example.com?special=%21%40%23%24%25", + "https://example.com?special=%21%40%23%24%25", + ), + ] + + for original, expected in test_cases: + with self.subTest(url=original): + result = normalize_url(original) + self.assertEqual(expected, result) diff --git a/bookmarks/utils.py b/bookmarks/utils.py index 27db7d4..da814fa 100644 --- a/bookmarks/utils.py +++ b/bookmarks/utils.py @@ -139,3 +139,49 @@ def generate_username(email, claims): else: username = email return unicodedata.normalize("NFKC", username)[:150] + + +def normalize_url(url: str) -> str: + if not url or not isinstance(url, str): + return "" + + url = url.strip() + if not url: + return "" + + try: + parsed = urllib.parse.urlparse(url) + + # Normalize the scheme to lowercase + scheme = parsed.scheme.lower() + + # Normalize the netloc (domain) to lowercase + netloc = parsed.hostname.lower() if parsed.hostname else "" + if parsed.port: + netloc += f":{parsed.port}" + if parsed.username: + auth = parsed.username + if parsed.password: + auth += f":{parsed.password}" + netloc = f"{auth}@{netloc}" + + # Remove trailing slashes from all paths + path = parsed.path.rstrip("/") if parsed.path else "" + + # Sort query parameters alphabetically + query = "" + if parsed.query: + query_params = urllib.parse.parse_qsl(parsed.query, keep_blank_values=True) + query_params.sort(key=lambda x: (x[0], x[1])) + query = urllib.parse.urlencode(query_params, quote_via=urllib.parse.quote) + + # Keep fragment as-is + fragment = parsed.fragment + + # Reconstruct the normalized URL + return urllib.parse.urlunparse( + (scheme, netloc, path, parsed.params, query, fragment) + ) + + except (ValueError, AttributeError): + return url