Skip to content

Commit 217efdb

Browse files
committed
feat(search): Adds helper to build semantic search query
Introduces `build_semantic_query`, a helper that combines vector-based KNN search with optional exact keyword matching for flexible case law retrieval.
1 parent b27b7fe commit 217efdb

File tree

1 file changed

+72
-0
lines changed

1 file changed

+72
-0
lines changed

cl/lib/elasticsearch_utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import asyncio
12
import datetime
3+
import json
24
import logging
35
import operator
46
import re
@@ -33,6 +35,7 @@
3335
from cl.custom_filters.templatetags.text_filters import html_decode
3436
from cl.lib.courts import lookup_child_courts_cache
3537
from cl.lib.date_time import midnight_pt
38+
from cl.lib.microservice_utils import microservice
3639
from cl.lib.string_utils import trunc
3740
from cl.lib.types import (
3841
ApiPositionMapping,
@@ -82,6 +85,7 @@
8285
BadProximityQuery,
8386
DisallowedWildcardPattern,
8487
ElasticBadRequestError,
88+
InputTooLongError,
8589
InvalidRelativeDateSyntax,
8690
QueryType,
8791
UnbalancedParenthesesQuery,
@@ -2679,6 +2683,74 @@ def apply_custom_score_to_main_query(
26792683
return query
26802684

26812685

2686+
def build_semantic_query(
    text_query: str, fields: list[str], filters: list[QueryString | Range]
) -> tuple[str, list[Query]]:
    """
    Build a hybrid Elasticsearch query using both exact keyword matching and
    semantic vector search.

    NOTE: the embedding is fetched with `asyncio.run`, which cannot be called
    while another event loop is running in the same thread, so this helper
    must be invoked from synchronous code.

    :param text_query: The raw user query string, which may include quoted
    phrases for exact matching.
    :param fields: A list of fields to target with the full-text keyword query.
    :param filters: A list of filter clauses to apply as pre-filtering to the
    semantic KNN search query.
    :return: A two-tuple:
        - keyword_query: A string representing the AND-joined quoted phrases,
        or an empty string if the input has no non-blank quoted phrase.
        - semantic_query: A list of Elasticsearch Q objects, including the KNN
        vector search and optionally a keyword-based full-text query.
    :raises InputTooLongError: If the cleaned query string exceeds the maximum
    allowed length for generating embeddings.
    """
    semantic_query: list[Query] = []
    # Extract quoted phrases from the input string (for exact keyword
    # matching). Blank phrases such as `""` or `"  "` are dropped: they carry
    # no search term and would otherwise make keyword_query truthy and inject
    # an empty phrase clause into the full-text query.
    exact_keywords = [
        phrase
        for phrase in re.findall(r'"([^"]*)"', text_query)
        if phrase.strip()
    ]

    # Join extracted phrases with AND to form a keyword query string
    keyword_query = " AND ".join([f'"{s}"' for s in exact_keywords])

    # Remove quotes from the query to prepare for embedding
    cleaned_text_query = text_query.replace('"', "")

    # Enforce character limit to avoid exceeding embedding constraints.
    # This runs before the microservice call so oversized input never
    # triggers a request.
    if len(cleaned_text_query) > settings.MAX_EMBEDDING_CHAR_LENGTH:
        raise InputTooLongError(QueryType.QUERY_STRING)

    # Generate embedding vector using external microservice
    embedding_request = asyncio.run(
        microservice(
            service="inception-query",
            method="POST",
            data=json.dumps({"text": cleaned_text_query}),
        )
    )
    vectors = embedding_request.json()["embedding"]

    # If exact keyword query exists, build and add full-text query to results
    # This enables hybrid search by combining keyword and semantic results
    if keyword_query:
        semantic_query.extend(
            build_fulltext_query(fields, keyword_query, only_queries=True)
        )

    # Add the semantic vector-based query using KNN with pre-filtering
    semantic_query.append(
        Q(
            "nested",
            path="embeddings",
            query=Q(
                "knn",
                field="embeddings.embedding",
                k=5,
                query_vector=vectors,
                filter=filters,
                boost=settings.KNN_SEARCH_BOOST,
            ),
        )
    )
    return keyword_query, semantic_query
2752+
2753+
26822754
def build_full_join_es_queries(
26832755
cd: CleanData,
26842756
child_query_fields: dict[str, list[str]],

0 commit comments

Comments
 (0)