|
| 1 | +import asyncio |
1 | 2 | import datetime
|
| 3 | +import json |
2 | 4 | import logging
|
3 | 5 | import operator
|
4 | 6 | import re
|
|
33 | 35 | from cl.custom_filters.templatetags.text_filters import html_decode
|
34 | 36 | from cl.lib.courts import lookup_child_courts_cache
|
35 | 37 | from cl.lib.date_time import midnight_pt
|
| 38 | +from cl.lib.microservice_utils import microservice |
36 | 39 | from cl.lib.string_utils import trunc
|
37 | 40 | from cl.lib.types import (
|
38 | 41 | ApiPositionMapping,
|
|
82 | 85 | BadProximityQuery,
|
83 | 86 | DisallowedWildcardPattern,
|
84 | 87 | ElasticBadRequestError,
|
| 88 | + InputTooLongError, |
85 | 89 | InvalidRelativeDateSyntax,
|
86 | 90 | QueryType,
|
87 | 91 | UnbalancedParenthesesQuery,
|
@@ -2679,6 +2683,74 @@ def apply_custom_score_to_main_query(
|
2679 | 2683 | return query
|
2680 | 2684 |
|
2681 | 2685 |
|
def build_semantic_query(
    text_query: str, fields: list[str], filters: list[QueryString | Range]
) -> tuple[str, list[Query]]:
    """
    Build a hybrid Elasticsearch query using both exact keyword matching and
    semantic vector search.

    Double-quoted phrases found in ``text_query`` are AND-joined into a
    keyword query string. The full text, with every double-quote character
    removed, is sent to the ``inception-query`` microservice and the
    ``"embedding"`` value of its JSON response is used as the KNN query
    vector.

    :param text_query: The raw user query string, which may include quoted
    phrases for exact matching.
    :param fields: A list of fields to target with the full-text keyword query.
    :param filters: A list of filter clauses to apply as pre-filtering to the
    semantic KNN search query.
    :return: A two-tuple:
        - keyword_query: A string with the AND-joined quoted phrases, or an
        empty string when the input has no non-blank quoted phrase.
        - semantic_query: A list of Elasticsearch Q objects: the full-text
        queries for the quoted phrases (only when keyword_query is not
        empty) followed by the nested KNN vector query.
    :raises InputTooLongError: If the query string, once quotes are removed,
    is longer than settings.MAX_EMBEDDING_CHAR_LENGTH.
    """
    # Extract quoted phrases from the input string (for exact keyword
    # matching). Blank phrases such as `""` or `" "` are skipped: they carry
    # no constraint and would otherwise add an empty `""` term to the
    # AND-joined keyword query.
    exact_keywords = [
        phrase
        for phrase in re.findall(r'"([^"]*)"', text_query)
        if phrase.strip()
    ]

    # Join extracted phrases with AND to form a keyword query string
    keyword_query = " AND ".join(f'"{phrase}"' for phrase in exact_keywords)

    # Remove quotes from the query to prepare for embedding
    cleaned_text_query = text_query.replace('"', "")

    # Enforce the character limit before calling the embedding service so
    # oversized input never triggers a request.
    if len(cleaned_text_query) > settings.MAX_EMBEDDING_CHAR_LENGTH:
        raise InputTooLongError(QueryType.QUERY_STRING)

    # Generate embedding vector using external microservice.
    # NOTE(review): asyncio.run() creates its own event loop and raises
    # RuntimeError when another loop is already running in the same thread;
    # confirm this function is only reached from synchronous code paths.
    embedding_request = asyncio.run(
        microservice(
            service="inception-query",
            method="POST",
            data=json.dumps({"text": cleaned_text_query}),
        )
    )
    vectors = embedding_request.json()["embedding"]

    semantic_query: list[Query] = []

    # If exact keyword query exists, build and add full-text query to results
    # This enables hybrid search by combining keyword and semantic results
    if keyword_query:
        semantic_query.extend(
            build_fulltext_query(fields, keyword_query, only_queries=True)
        )

    # Add the semantic vector-based query using KNN with pre-filtering
    semantic_query.append(
        Q(
            "nested",
            path="embeddings",
            query=Q(
                "knn",
                field="embeddings.embedding",
                k=settings.KNN_SEARCH_K,
                query_vector=vectors,
                filter=filters,
                boost=settings.KNN_SEARCH_BOOST,
            ),
        )
    )
    return keyword_query, semantic_query
| 2752 | + |
| 2753 | + |
2682 | 2754 | def build_full_join_es_queries(
|
2683 | 2755 | cd: CleanData,
|
2684 | 2756 | child_query_fields: dict[str, list[str]],
|
|
0 commit comments