Skip to content

feat(search): add semantic search toggle to SearchForm #5899

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import asyncio
import datetime
import json
import logging
import operator
import re
Expand Down Expand Up @@ -33,6 +35,7 @@
from cl.custom_filters.templatetags.text_filters import html_decode
from cl.lib.courts import lookup_child_courts_cache
from cl.lib.date_time import midnight_pt
from cl.lib.microservice_utils import microservice
from cl.lib.string_utils import trunc
from cl.lib.types import (
ApiPositionMapping,
Expand Down Expand Up @@ -82,6 +85,7 @@
BadProximityQuery,
DisallowedWildcardPattern,
ElasticBadRequestError,
InputTooLongError,
InvalidRelativeDateSyntax,
QueryType,
UnbalancedParenthesesQuery,
Expand Down Expand Up @@ -2679,6 +2683,74 @@ def apply_custom_score_to_main_query(
return query


def build_semantic_query(
    text_query: str, fields: list[str], filters: list[QueryString | Range]
) -> tuple[str, list[Query]]:
    """
    Build a hybrid Elasticsearch query using both exact keyword matching and
    semantic vector search.

    :param text_query: The raw user query string, which may include quoted
        phrases for exact matching.
    :param fields: A list of fields to target with the full-text keyword query.
    :param filters: A list of filter clauses to apply as pre-filtering to the
        semantic KNN search query.
    :return: A two-tuple:
        - keyword_query: A string with the AND-joined quoted phrases, or an
          empty string when the input has no non-blank quoted phrase.
        - semantic_query: A list of Elasticsearch Q objects, including the KNN
          vector search and optionally a keyword-based full-text query.
    :raises InputTooLongError: If the query string, once its double quotes are
        removed, exceeds settings.MAX_EMBEDDING_CHAR_LENGTH.
    """
    semantic_query: list[Query] = []

    # Extract quoted phrases from the input string (for exact keyword
    # matching). Blank phrases such as "" or "   " are dropped: they carry no
    # term to match, yet would otherwise yield a non-empty keyword query and
    # add an empty phrase clause to the full-text query.
    exact_keywords = [
        phrase
        for phrase in re.findall(r'"([^"]*)"', text_query)
        if phrase.strip()
    ]

    # Join extracted phrases with AND to form a keyword query string
    keyword_query = " AND ".join(f'"{phrase}"' for phrase in exact_keywords)

    # Remove quotes from the query to prepare for embedding
    cleaned_text_query = text_query.replace('"', "")

    # Enforce character limit before calling the embedding microservice
    if len(cleaned_text_query) > settings.MAX_EMBEDDING_CHAR_LENGTH:
        raise InputTooLongError(QueryType.QUERY_STRING)

    # Generate embedding vector using external microservice
    embedding_request = asyncio.run(
        microservice(
            service="inception-query",
            method="POST",
            data=json.dumps({"text": cleaned_text_query}),
        )
    )
    vectors = embedding_request.json()["embedding"]

    # If an exact keyword query exists, build and add the full-text query.
    # This enables hybrid search by combining keyword and semantic results.
    if keyword_query:
        semantic_query.extend(
            build_fulltext_query(fields, keyword_query, only_queries=True)
        )

    # Add the semantic vector-based query using KNN with pre-filtering
    semantic_query.append(
        Q(
            "nested",
            path="embeddings",
            query=Q(
                "knn",
                field="embeddings.embedding",
                k=settings.KNN_SEARCH_K,
                query_vector=vectors,
                filter=filters,
                boost=settings.KNN_SEARCH_BOOST,
            ),
        )
    )
    return keyword_query, semantic_query


def build_full_join_es_queries(
cd: CleanData,
child_query_fields: dict[str, list[str]],
Expand Down
20 changes: 19 additions & 1 deletion cl/lib/microservice_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
from io import BufferedReader

from asgiref.sync import sync_to_async
from django.conf import settings
from httpx import AsyncClient, Response

from cl.audio.models import Audio
from cl.lib.search_utils import clean_up_recap_document_file
from cl.search.models import Opinion, RECAPDocument


async def clean_up_recap_document_file(item: RECAPDocument) -> None:
    """Reset the file-related fields of a RECAPDocument once its file has
    been found to be missing from storage.

    Objects that are not RECAPDocument instances are left untouched.

    :param item: The RECAPDocument to work on.
    :return: None
    """
    if not isinstance(item, RECAPDocument):
        return

    # The file field's delete() is synchronous, so it is wrapped before being
    # awaited from this coroutine.
    delete_local_file = sync_to_async(item.filepath_local.delete)
    await delete_local_file()

    # Values that mark the document as having no file available.
    cleared_values = {
        "sha1": "",
        "date_upload": None,
        "file_size": None,
        "page_count": None,
        "is_available": False,
    }
    for field_name, empty_value in cleared_values.items():
        setattr(item, field_name, empty_value)

    await item.asave()


async def microservice(
service: str,
method: str = "POST",
Expand Down
21 changes: 1 addition & 20 deletions cl/lib/search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Any, TypedDict
from urllib.parse import parse_qs, urlencode

from asgiref.sync import async_to_sync, sync_to_async
from asgiref.sync import async_to_sync
from django.conf import settings
from django.core.cache import cache
from django.core.exceptions import PermissionDenied
Expand Down Expand Up @@ -59,7 +59,6 @@
SEARCH_TYPES,
Court,
OpinionCluster,
RECAPDocument,
SearchQuery,
)

Expand Down Expand Up @@ -251,24 +250,6 @@ async def add_depth_counts(
return None


async def clean_up_recap_document_file(item: RECAPDocument) -> None:
    """Reset the file-related fields of a RECAPDocument once its file has
    been found to be missing from storage.

    Objects that are not RECAPDocument instances are left untouched.

    :param item: The RECAPDocument to work on.
    :return: None
    """
    if not isinstance(item, RECAPDocument):
        return

    # The file field's delete() is synchronous, so it is wrapped before being
    # awaited from this coroutine.
    delete_local_file = sync_to_async(item.filepath_local.delete)
    await delete_local_file()

    # Values that mark the document as having no file available.
    cleared_values = {
        "sha1": "",
        "date_upload": None,
        "file_size": None,
        "page_count": None,
        "is_available": False,
    }
    for field_name, empty_value in cleared_values.items():
        setattr(item, field_name, empty_value)

    await item.asave()


def store_search_query(request: HttpRequest, search_results: dict) -> None:
"""Saves an user's search query in a SearchQuery model

Expand Down
10 changes: 10 additions & 0 deletions cl/search/exception.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from enum import Enum
from http import HTTPStatus

from django.conf import settings
from elasticsearch.exceptions import SerializationError
from rest_framework.exceptions import APIException

Expand Down Expand Up @@ -68,3 +69,12 @@ class InvalidRelativeDateSyntax(SyntaxQueryError):
"""The date entered has an invalid format."""

message = "The date entered has an invalid format."


class InputTooLongError(SyntaxQueryError):
    """The input text is too long to be processed for embedding."""

    # The f-string is evaluated once, when the class body executes, so the
    # message reflects settings.MAX_EMBEDDING_CHAR_LENGTH as it was at that
    # moment.
    message = (
        "The input is too long to process. The maximum allowed is "
        f"{settings.MAX_EMBEDDING_CHAR_LENGTH} characters."
    )
9 changes: 9 additions & 0 deletions cl/search/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,15 @@ class SearchForm(forms.Form):
),
)

# Optional checkbox flag; required=False lets the form validate when the
# parameter is absent or unchecked.
semantic = forms.BooleanField(
    label="Whether to enable semantic search in the Search API.",
    label_suffix="",
    required=False,
    widget=forms.CheckboxInput(
        attrs={"class": "external-input form-control left"}
    ),
)

def get_date_field_names(self):
return {
f_name.split("_")[0]
Expand Down
4 changes: 4 additions & 0 deletions cl/settings/project/microservices.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,8 @@
"url": f"{INCEPTION_HOST}/api/v1/embed/batch",
"timeout": INCEPTION_TIMEOUT,
},
"inception-query": {
"url": f"{INCEPTION_HOST}/api/v1/embed/query",
"timeout": INCEPTION_TIMEOUT,
},
}
7 changes: 7 additions & 0 deletions cl/settings/project/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,10 @@
PERCOLATOR_MISSING_DOCUMENT_MAX_RETRIES = env(
"PERCOLATOR_MISSING_DOCUMENT_MAX_RETRIES", default=4
)

#################
# VECTOR SEARCH #
#################
# Maximum number of characters a query may contain (after its double quotes
# are removed) before it is sent to be embedded.
MAX_EMBEDDING_CHAR_LENGTH = env("MAX_EMBEDDING_CHAR_LENGTH", default=1000)
# Boost passed to the KNN clause of the semantic query.
KNN_SEARCH_BOOST = env("KNN_SEARCH_BOOST", default=150)
# `k` passed to the KNN clause. Named KNN_SEARCH_K because that is the
# attribute build_semantic_query reads from settings.
KNN_SEARCH_K = env("KNN_SEARCH_K", default=5)
Loading