freelawproject · ERosendo · Jul 7, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 16, 2025
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
@@ -1,4 +1,6 @@
+import asyncio
 import datetime
+import json
 import logging
 import operator
 import re
@@ -33,6 +35,7 @@
 from cl.custom_filters.templatetags.text_filters import html_decode
 from cl.lib.courts import lookup_child_courts_cache
 from cl.lib.date_time import midnight_pt
+from cl.lib.microservice_utils import microservice
 from cl.lib.string_utils import trunc
 from cl.lib.types import (
     ApiPositionMapping,
@@ -82,6 +85,7 @@
     BadProximityQuery,
     DisallowedWildcardPattern,
     ElasticBadRequestError,
+    InputTooLongError,
     InvalidRelativeDateSyntax,
     QueryType,
     UnbalancedParenthesesQuery,
@@ -2679,6 +2683,74 @@ def apply_custom_score_to_main_query(
     return query
 
 
+def build_semantic_query(
+    text_query: str, fields: list[str], filters: list[QueryString | Range]
+) -> tuple[str, list[Query]]:
+    """Build the clauses of a hybrid (exact keyword + semantic KNN) query.
+
+    Double-quoted phrases in the input are AND-joined into a keyword query,
+    while the whole input, with its quotes removed, is embedded for KNN.
+
+    :param text_query: The raw user query string, which may include quoted
+        phrases for exact matching.
+    :param fields: A list of fields to target with the full-text keyword query.
+    :param filters: A list of filter clauses to apply as pre-filtering to the
+        semantic KNN search query.
+    :return: A two-tuple:
+        - keyword_query: The AND-joined quoted phrases, or "" if there are
+          none. Empty or whitespace-only phrases are ignored.
+        - semantic_query: A list of Elasticsearch query objects: the optional
+          keyword-based full-text queries followed by the nested KNN query.
+    :raises InputTooLongError: If the unquoted query string is longer than
+        settings.MAX_EMBEDDING_CHAR_LENGTH characters.
+    """
+    # Skip blank phrases such as "": they would yield an empty keyword term.
+    phrases = re.findall(r'"([^"]*)"', text_query)
+    keyword_query = " AND ".join(f'"{s}"' for s in phrases if s.strip())
+
+    # Remove quotes from the query to prepare for embedding
+    cleaned_text_query = text_query.replace('"', "")
+
+    # Enforce character limit to avoid exceeding embedding constraints
+    if len(cleaned_text_query) > settings.MAX_EMBEDDING_CHAR_LENGTH:
+        raise InputTooLongError(QueryType.QUERY_STRING)
+
+    # Generate the embedding with the external microservice. asyncio.run()
+    # starts its own event loop, so call this function from sync code only.
+    embedding_request = asyncio.run(
+        microservice(
+            service="inception-query",
+            method="POST",
+            data=json.dumps({"text": cleaned_text_query}),
+        )
+    )
+    vectors = embedding_request.json()["embedding"]
+
+    semantic_query: list[Query] = []
+    # Hybrid search: add full-text queries for the exact phrases, if any
+    if keyword_query:
+        semantic_query.extend(
+            build_fulltext_query(fields, keyword_query, only_queries=True)
+        )
+
+    # Add the semantic vector-based query using KNN with pre-filtering
+    semantic_query.append(
+        Q(
+            "nested",
+            path="embeddings",
+            query=Q(
+                "knn",
+                field="embeddings.embedding",
+                k=settings.KNN_SEARCH_K,
+                query_vector=vectors,
+                filter=filters,
+                boost=settings.KNN_SEARCH_BOOST,
+            ),
+        )
+    )
+    return keyword_query, semantic_query
+
+
 def build_full_join_es_queries(
     cd: CleanData,
     child_query_fields: dict[str, list[str]],

diff --git a/cl/lib/microservice_utils.py b/cl/lib/microservice_utils.py
@@ -1,13 +1,31 @@
 from io import BufferedReader
 
+from asgiref.sync import sync_to_async
 from django.conf import settings
 from httpx import AsyncClient, Response
 
 from cl.audio.models import Audio
-from cl.lib.search_utils import clean_up_recap_document_file
 from cl.search.models import Opinion, RECAPDocument
 
 
+async def clean_up_recap_document_file(item: RECAPDocument) -> None:
+    """Reset a RECAPDocument's file fields once its file is found missing.
+
+    Calls ``filepath_local.delete()`` through ``sync_to_async``, blanks the
+    file-derived fields and saves the item. Other item types are ignored.
+
+    :param item: The RECAPDocument to work on.
+    :return: None
+    """
+    if not isinstance(item, RECAPDocument):
+        return
+    await sync_to_async(item.filepath_local.delete)()
+    item.sha1 = ""
+    item.date_upload = item.file_size = item.page_count = None
+    item.is_available = False
+    await item.asave()
+
+
 async def microservice(
     service: str,
     method: str = "POST",

diff --git a/cl/lib/search_utils.py b/cl/lib/search_utils.py
@@ -5,7 +5,7 @@
 from typing import Any, TypedDict
 from urllib.parse import parse_qs, urlencode
 
-from asgiref.sync import async_to_sync, sync_to_async
+from asgiref.sync import async_to_sync
 from django.conf import settings
 from django.core.cache import cache
 from django.core.exceptions import PermissionDenied
@@ -59,7 +59,6 @@
     SEARCH_TYPES,
     Court,
     OpinionCluster,
-    RECAPDocument,
     SearchQuery,
 )
 
@@ -251,24 +250,6 @@ async def add_depth_counts(
         return None
 
 
-async def clean_up_recap_document_file(item: RECAPDocument) -> None:
-    """Clean up the RecapDocument file-related fields after detecting the file
-    doesn't exist in the storage.
-
-    :param item: The RECAPDocument to work on.
-    :return: None
-    """
-
-    if isinstance(item, RECAPDocument):
-        await sync_to_async(item.filepath_local.delete)()
-        item.sha1 = ""
-        item.date_upload = None
-        item.file_size = None
-        item.page_count = None
-        item.is_available = False
-        await item.asave()
-
-
 def store_search_query(request: HttpRequest, search_results: dict) -> None:
     """Saves an user's search query in a SearchQuery model
 

diff --git a/cl/search/exception.py b/cl/search/exception.py
@@ -1,6 +1,7 @@
 from enum import Enum
 from http import HTTPStatus
 
+from django.conf import settings
 from elasticsearch.exceptions import SerializationError
 from rest_framework.exceptions import APIException
 
@@ -68,3 +69,12 @@ class InvalidRelativeDateSyntax(SyntaxQueryError):
     """The date entered has an invalid format."""
 
     message = "The date entered has an invalid format."
+
+
+class InputTooLongError(SyntaxQueryError):
+    """Raised when query text exceeds settings.MAX_EMBEDDING_CHAR_LENGTH."""
+
+    message = (  # interpolated once, when this class body is executed
+        "The input is too long to process. The maximum allowed is "
+        f"{settings.MAX_EMBEDDING_CHAR_LENGTH} characters."
+    )
diff --git a/cl/search/forms.py b/cl/search/forms.py
@@ -478,6 +478,15 @@ class SearchForm(forms.Form):
         ),
     )
 
+    semantic = forms.BooleanField(
+        label="Whether to enable semantic search in the Search API.",
+        label_suffix="",
+        required=False,  # an absent/unchecked value is valid (cleans to False)
+        widget=forms.CheckboxInput(
+            attrs={"class": "external-input form-control left"}
+        ),
+    )
+
     def get_date_field_names(self):
         return {
             f_name.split("_")[0]

diff --git a/cl/settings/project/microservices.py b/cl/settings/project/microservices.py
@@ -79,4 +79,8 @@
         "url": f"{INCEPTION_HOST}/api/v1/embed/batch",
         "timeout": INCEPTION_TIMEOUT,
     },
+    "inception-query": {
+        "url": f"{INCEPTION_HOST}/api/v1/embed/query",
+        "timeout": INCEPTION_TIMEOUT,
+    },
 }
diff --git a/cl/settings/project/search.py b/cl/settings/project/search.py
@@ -56,3 +56,10 @@
 PERCOLATOR_MISSING_DOCUMENT_MAX_RETRIES = env(
     "PERCOLATOR_MISSING_DOCUMENT_MAX_RETRIES", default=4
 )
+
+#################
+# VECTOR SEARCH #
+#################
+MAX_EMBEDDING_CHAR_LENGTH = env("MAX_EMBEDDING_CHAR_LENGTH", default=1000)
+KNN_SEARCH_BOOST = env("KNN_SEARCH_BOOST", default=150)  # boost of KNN clause
+KNN_SEARCH_K = env("KNN_SEARCH_K", default=5)  # `k` passed to the KNN query