Mirror of https://github.com/sissbruecker/linkding.git, synced 2025-11-21 05:24:02 +01:00
Add new search engine that supports logical expressions (and, or, not) (#1198)
* parser implementation
* add support for quoted strings
* add support for tags
* ignore empty tags
* implicit and
* prepare query conversion by disabling tests
* convert query logic
* fix nested combined tag searches
* simplify query logic
* Add special keyword support to parser
* Add special keyword support to query builder
* Handle invalid queries in query builder
* Notify user about invalid queries
* Add helper to strip tags from search query
* Make tag cloud show all tags from search query
* Use new method for extracting tags
* Add query for getting tags from search query
* Get selected tags through specific context
* Properly remove selected tags from complex queries
* cleanup
* Clarify bundle search terms
* Add documentation draft
* Improve adding tags to search query
* Add option to switch back to the old search
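As a sketch of the query syntax this enables (an editor's illustration based on the feature list above, not part of the commit; `!unread` is assumed to be one of linkding's existing special keywords):

    from bookmarks.services.search_query_parser import parse_search_query, expression_to_string

    # Terms, #tags, quoted phrases, and !keywords can be combined with and/or/not and
    # parentheses; adjacent items act as an implicit "and".
    ast = parse_search_query('#python ("getting started" or tutorial) not !unread')
    print(expression_to_string(ast))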
bookmarks/services/search_query_parser.py | 575 lines added (new file)
@@ -0,0 +1,575 @@
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

from bookmarks.models import UserProfile


class TokenType(Enum):
    TERM = "TERM"
    TAG = "TAG"
    SPECIAL_KEYWORD = "SPECIAL_KEYWORD"
    AND = "AND"
    OR = "OR"
    NOT = "NOT"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    EOF = "EOF"


@dataclass
class Token:
    type: TokenType
    value: str
    position: int


class SearchQueryTokenizer:
    def __init__(self, query: str):
        self.query = query.strip()
        self.position = 0
        self.current_char = self.query[0] if self.query else None

    def advance(self):
        """Move to the next character in the query."""
        self.position += 1
        if self.position >= len(self.query):
            self.current_char = None
        else:
            self.current_char = self.query[self.position]

    def skip_whitespace(self):
        """Skip whitespace characters."""
        while self.current_char and self.current_char.isspace():
            self.advance()

    def read_term(self) -> str:
        """Read a search term (sequence of non-whitespace, non-special characters)."""
        term = ""

        while (
            self.current_char
            and not self.current_char.isspace()
            and self.current_char not in "()\"'#!"
        ):
            term += self.current_char
            self.advance()

        return term

    def read_quoted_string(self, quote_char: str) -> str:
        """Read a quoted string, handling escaped quotes."""
        content = ""
        self.advance()  # skip opening quote

        while self.current_char and self.current_char != quote_char:
            if self.current_char == "\\":
                # Handle escaped characters
                self.advance()
                if self.current_char:
                    if self.current_char == "n":
                        content += "\n"
                    elif self.current_char == "t":
                        content += "\t"
                    elif self.current_char == "r":
                        content += "\r"
                    elif self.current_char == "\\":
                        content += "\\"
                    elif self.current_char == quote_char:
                        content += quote_char
                    else:
                        # For any other escaped character, just include it as-is
                        content += self.current_char
                    self.advance()
            else:
                content += self.current_char
                self.advance()

        if self.current_char == quote_char:
            self.advance()  # skip closing quote
        else:
            # Unclosed quote - we could raise an error here, but let's be lenient
            # and treat it as if the quote was closed at the end
            pass

        return content

    def read_tag(self) -> str:
        """Read a tag (starts with # and continues until whitespace or special chars)."""
        tag = ""
        self.advance()  # skip the # character

        while (
            self.current_char
            and not self.current_char.isspace()
            and self.current_char not in "()\"'"
        ):
            tag += self.current_char
            self.advance()

        return tag

    def read_special_keyword(self) -> str:
        """Read a special keyword (starts with ! and continues until whitespace or special chars)."""
        keyword = ""
        self.advance()  # skip the ! character

        while (
            self.current_char
            and not self.current_char.isspace()
            and self.current_char not in "()\"'"
        ):
            keyword += self.current_char
            self.advance()

        return keyword

    def tokenize(self) -> List[Token]:
        """Convert the query string into a list of tokens."""
        tokens = []

        while self.current_char:
            self.skip_whitespace()

            if not self.current_char:
                break

            start_pos = self.position

            if self.current_char == "(":
                tokens.append(Token(TokenType.LPAREN, "(", start_pos))
                self.advance()
            elif self.current_char == ")":
                tokens.append(Token(TokenType.RPAREN, ")", start_pos))
                self.advance()
            elif self.current_char in "\"'":
                # Read a quoted string - always treated as a term
                quote_char = self.current_char
                term = self.read_quoted_string(quote_char)
                tokens.append(Token(TokenType.TERM, term, start_pos))
            elif self.current_char == "#":
                # Read a tag
                tag = self.read_tag()
                # Only add the tag token if it has content
                if tag:
                    tokens.append(Token(TokenType.TAG, tag, start_pos))
            elif self.current_char == "!":
                # Read a special keyword
                keyword = self.read_special_keyword()
                # Only add the keyword token if it has content
                if keyword:
                    tokens.append(Token(TokenType.SPECIAL_KEYWORD, keyword, start_pos))
            else:
                # Read a term and check if it's an operator
                term = self.read_term()
                term_lower = term.lower()

                if term_lower == "and":
                    tokens.append(Token(TokenType.AND, term, start_pos))
                elif term_lower == "or":
                    tokens.append(Token(TokenType.OR, term, start_pos))
                elif term_lower == "not":
                    tokens.append(Token(TokenType.NOT, term, start_pos))
                else:
                    tokens.append(Token(TokenType.TERM, term, start_pos))

        tokens.append(Token(TokenType.EOF, "", len(self.query)))
        return tokens


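# Editor's sketch, not part of the committed file: what the tokenizer above yields
# for a sample query. The "_example_" helper name is hypothetical.
def _example_tokenize_sketch() -> List[Token]:
    # '#python not "standard library"' tokenizes, in order, to:
    #   TAG "python", NOT "not", TERM "standard library", EOF ""
    return SearchQueryTokenizer('#python not "standard library"').tokenize()

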
class SearchExpression:
    pass


@dataclass
class TermExpression(SearchExpression):
    term: str


@dataclass
class TagExpression(SearchExpression):
    tag: str


@dataclass
class SpecialKeywordExpression(SearchExpression):
    keyword: str


@dataclass
class AndExpression(SearchExpression):
    left: SearchExpression
    right: SearchExpression


@dataclass
class OrExpression(SearchExpression):
    left: SearchExpression
    right: SearchExpression


@dataclass
class NotExpression(SearchExpression):
    operand: SearchExpression


class SearchQueryParseError(Exception):
    def __init__(self, message: str, position: int):
        self.message = message
        self.position = position
        super().__init__(f"{message} at position {position}")


class SearchQueryParser:
    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        self.position = 0
        self.current_token = tokens[0] if tokens else Token(TokenType.EOF, "", 0)

    def advance(self):
        """Move to the next token."""
        if self.position < len(self.tokens) - 1:
            self.position += 1
            self.current_token = self.tokens[self.position]

    def consume(self, expected_type: TokenType) -> Token:
        """Consume a token of the expected type or raise an error."""
        if self.current_token.type == expected_type:
            token = self.current_token
            self.advance()
            return token
        else:
            raise SearchQueryParseError(
                f"Expected {expected_type.value}, got {self.current_token.type.value}",
                self.current_token.position,
            )

    def parse(self) -> Optional[SearchExpression]:
        """Parse the tokens into an AST."""
        if not self.tokens or (
            len(self.tokens) == 1 and self.tokens[0].type == TokenType.EOF
        ):
            return None

        expr = self.parse_or_expression()

        if self.current_token.type != TokenType.EOF:
            raise SearchQueryParseError(
                f"Unexpected token {self.current_token.type.value}",
                self.current_token.position,
            )

        return expr

    def parse_or_expression(self) -> SearchExpression:
        """Parse OR expressions (lowest precedence)."""
        left = self.parse_and_expression()

        while self.current_token.type == TokenType.OR:
            self.advance()  # consume OR
            right = self.parse_and_expression()
            left = OrExpression(left, right)

        return left

    def parse_and_expression(self) -> SearchExpression:
        """Parse AND expressions (medium precedence), including implicit AND."""
        left = self.parse_not_expression()

        while self.current_token.type == TokenType.AND or self.current_token.type in [
            TokenType.TERM,
            TokenType.TAG,
            TokenType.SPECIAL_KEYWORD,
            TokenType.LPAREN,
            TokenType.NOT,
        ]:

            if self.current_token.type == TokenType.AND:
                self.advance()  # consume explicit AND
            # else: implicit AND (don't advance token)

            right = self.parse_not_expression()
            left = AndExpression(left, right)

        return left

    def parse_not_expression(self) -> SearchExpression:
        """Parse NOT expressions (high precedence)."""
        if self.current_token.type == TokenType.NOT:
            self.advance()  # consume NOT
            operand = self.parse_not_expression()  # right associative
            return NotExpression(operand)

        return self.parse_primary_expression()

    def parse_primary_expression(self) -> SearchExpression:
        """Parse primary expressions (terms, tags, special keywords, and parenthesized expressions)."""
        if self.current_token.type == TokenType.TERM:
            term = self.current_token.value
            self.advance()
            return TermExpression(term)
        elif self.current_token.type == TokenType.TAG:
            tag = self.current_token.value
            self.advance()
            return TagExpression(tag)
        elif self.current_token.type == TokenType.SPECIAL_KEYWORD:
            keyword = self.current_token.value
            self.advance()
            return SpecialKeywordExpression(keyword)
        elif self.current_token.type == TokenType.LPAREN:
            self.advance()  # consume (
            expr = self.parse_or_expression()
            self.consume(TokenType.RPAREN)  # consume )
            return expr
        else:
            raise SearchQueryParseError(
                f"Unexpected token {self.current_token.type.value}",
                self.current_token.position,
            )


def parse_search_query(query: str) -> Optional[SearchExpression]:
    if not query or not query.strip():
        return None

    tokenizer = SearchQueryTokenizer(query)
    tokens = tokenizer.tokenize()
    parser = SearchQueryParser(tokens)
    return parser.parse()


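# Editor's sketch, not part of the committed file: the AST produced by
# parse_search_query, and the precedence implied by the parser above
# (NOT binds tightest, then AND, then OR; adjacency is an implicit AND).
# The "_example_" helper name is hypothetical.
def _example_parse_sketch() -> Optional[SearchExpression]:
    # "#python not (beginner or intermediate)" parses to:
    #   AndExpression(
    #       TagExpression("python"),
    #       NotExpression(
    #           OrExpression(TermExpression("beginner"), TermExpression("intermediate"))
    #       ),
    #   )
    # while "a b or c" parses to
    #   OrExpression(AndExpression(TermExpression("a"), TermExpression("b")), TermExpression("c"))
    return parse_search_query("#python not (beginner or intermediate)")

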
def _needs_parentheses(expr: SearchExpression, parent_type: type) -> bool:
    if isinstance(expr, OrExpression) and parent_type == AndExpression:
        return True
    # AndExpression or OrExpression needs parentheses when inside NotExpression
    if isinstance(expr, (AndExpression, OrExpression)) and parent_type == NotExpression:
        return True
    return False


def _is_simple_expression(expr: SearchExpression) -> bool:
    """Check if an expression is simple (term, tag, or keyword)."""
    return isinstance(expr, (TermExpression, TagExpression, SpecialKeywordExpression))


def _expression_to_string(expr: SearchExpression, parent_type: type = None) -> str:
    if isinstance(expr, TermExpression):
        # Quote terms if they contain spaces or special characters
        if " " in expr.term or any(c in expr.term for c in ["(", ")", '"', "'"]):
            # Escape any quotes in the term
            escaped = expr.term.replace("\\", "\\\\").replace('"', '\\"')
            return f'"{escaped}"'
        return expr.term

    elif isinstance(expr, TagExpression):
        return f"#{expr.tag}"

    elif isinstance(expr, SpecialKeywordExpression):
        return f"!{expr.keyword}"

    elif isinstance(expr, NotExpression):
        # Don't pass parent type to children
        operand_str = _expression_to_string(expr.operand, None)
        # Add parentheses if the operand is a binary operation
        if isinstance(expr.operand, (AndExpression, OrExpression)):
            return f"not ({operand_str})"
        return f"not {operand_str}"

    elif isinstance(expr, AndExpression):
        # Don't pass parent type to children - they'll add their own parens only if needed
        left_str = _expression_to_string(expr.left, None)
        right_str = _expression_to_string(expr.right, None)

        # Add parentheses to children if needed for precedence
        if _needs_parentheses(expr.left, AndExpression):
            left_str = f"({left_str})"
        if _needs_parentheses(expr.right, AndExpression):
            right_str = f"({right_str})"

        result = f"{left_str} {right_str}"

        # Add outer parentheses if needed based on parent context
        if parent_type and _needs_parentheses(expr, parent_type):
            result = f"({result})"

        return result

    elif isinstance(expr, OrExpression):
        # Don't pass parent type to children
        left_str = _expression_to_string(expr.left, None)
        right_str = _expression_to_string(expr.right, None)

        # OrExpression children don't need parentheses unless they're also OR (handled by recursion)
        result = f"{left_str} or {right_str}"

        # Add outer parentheses if needed based on parent context
        if parent_type and _needs_parentheses(expr, parent_type):
            result = f"({result})"

        return result

    else:
        raise ValueError(f"Unknown expression type: {type(expr)}")


def expression_to_string(expr: Optional[SearchExpression]) -> str:
    if expr is None:
        return ""
    return _expression_to_string(expr)


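# Editor's sketch, not part of the committed file: round-tripping a query through
# parse_search_query and expression_to_string produces a normalized query string;
# terms with spaces come back quoted and parentheses are re-added only where the
# precedence rules above require them. The "_example_" helper name is hypothetical.
def _example_round_trip_sketch() -> str:
    # 'not (foo or bar) "getting started"' round-trips to the same string.
    return expression_to_string(parse_search_query('not (foo or bar) "getting started"'))

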
def _strip_tag_from_expression(
    expr: Optional[SearchExpression], tag_name: str, enable_lax_search: bool = False
) -> Optional[SearchExpression]:
    if expr is None:
        return None

    if isinstance(expr, TagExpression):
        # Remove this tag if it matches
        if expr.tag.lower() == tag_name.lower():
            return None
        return expr

    elif isinstance(expr, TermExpression):
        # In lax search mode, also remove terms that match the tag name
        if enable_lax_search and expr.term.lower() == tag_name.lower():
            return None
        return expr

    elif isinstance(expr, SpecialKeywordExpression):
        # Keep special keywords as-is
        return expr

    elif isinstance(expr, NotExpression):
        # Recursively filter the operand
        filtered_operand = _strip_tag_from_expression(
            expr.operand, tag_name, enable_lax_search
        )
        if filtered_operand is None:
            # If the operand is removed, the whole NOT expression should be removed
            return None
        return NotExpression(filtered_operand)

    elif isinstance(expr, AndExpression):
        # Recursively filter both sides
        left = _strip_tag_from_expression(expr.left, tag_name, enable_lax_search)
        right = _strip_tag_from_expression(expr.right, tag_name, enable_lax_search)

        # If both sides are removed, remove the AND expression
        if left is None and right is None:
            return None
        # If one side is removed, return the other side
        elif left is None:
            return right
        elif right is None:
            return left
        else:
            return AndExpression(left, right)

    elif isinstance(expr, OrExpression):
        # Recursively filter both sides
        left = _strip_tag_from_expression(expr.left, tag_name, enable_lax_search)
        right = _strip_tag_from_expression(expr.right, tag_name, enable_lax_search)

        # If both sides are removed, remove the OR expression
        if left is None and right is None:
            return None
        # If one side is removed, return the other side
        elif left is None:
            return right
        elif right is None:
            return left
        else:
            return OrExpression(left, right)

    else:
        # Unknown expression type, return as-is
        return expr


def strip_tag_from_query(
    query: str, tag_name: str, user_profile: UserProfile | None = None
) -> str:
    try:
        ast = parse_search_query(query)
    except SearchQueryParseError:
        return query

    if ast is None:
        return ""

    # Determine if lax search is enabled
    enable_lax_search = False
    if user_profile is not None:
        enable_lax_search = user_profile.tag_search == UserProfile.TAG_SEARCH_LAX

    # Strip the tag from the AST
    filtered_ast = _strip_tag_from_expression(ast, tag_name, enable_lax_search)

    # Convert back to a query string
    return expression_to_string(filtered_ast)


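# Editor's sketch, not part of the committed file: stripping a tag keeps the rest of
# the expression intact, which matches the commit message's "properly remove selected
# tags from complex queries". The "_example_" helper name is hypothetical.
def _example_strip_tag_sketch() -> str:
    # strip_tag_from_query("#python #django tutorial", "django") -> "#python tutorial"
    return strip_tag_from_query("#python #django tutorial", "django")

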
def _extract_tag_names_from_expression(
    expr: Optional[SearchExpression], enable_lax_search: bool = False
) -> List[str]:
    if expr is None:
        return []

    if isinstance(expr, TagExpression):
        return [expr.tag]

    elif isinstance(expr, TermExpression):
        # In lax search mode, terms are also considered tags
        if enable_lax_search:
            return [expr.term]
        return []

    elif isinstance(expr, SpecialKeywordExpression):
        # Special keywords are not tags
        return []

    elif isinstance(expr, NotExpression):
        # Recursively extract from the operand
        return _extract_tag_names_from_expression(expr.operand, enable_lax_search)

    elif isinstance(expr, (AndExpression, OrExpression)):
        # Recursively extract from both sides and combine
        left_tags = _extract_tag_names_from_expression(expr.left, enable_lax_search)
        right_tags = _extract_tag_names_from_expression(expr.right, enable_lax_search)
        return left_tags + right_tags

    else:
        # Unknown expression type
        return []


def extract_tag_names_from_query(
    query: str, user_profile: UserProfile | None = None
) -> List[str]:
    try:
        ast = parse_search_query(query)
    except SearchQueryParseError:
        return []

    if ast is None:
        return []

    # Determine if lax search is enabled
    enable_lax_search = False
    if user_profile is not None:
        enable_lax_search = user_profile.tag_search == UserProfile.TAG_SEARCH_LAX

    # Extract tag names from the AST
    tag_names = _extract_tag_names_from_expression(ast, enable_lax_search)

    # Deduplicate (case-insensitive) and sort
    seen = set()
    unique_tags = []
    for tag in tag_names:
        tag_lower = tag.lower()
        if tag_lower not in seen:
            seen.add(tag_lower)
            unique_tags.append(tag_lower)

    return sorted(unique_tags)
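For reference, a quick editor's sketch (not part of the commit) of how tag extraction behaves with strict tag search, i.e. without a lax-search profile:

    from bookmarks.services.search_query_parser import extract_tag_names_from_query

    # Tags are collected from every branch of the expression (including "not" branches),
    # deduplicated case-insensitively, lowercased, and sorted.
    tags = extract_tag_names_from_query("#Python not #django or #Django readme")
    assert tags == ["django", "python"]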