Skip to content

Commit e13ddd8

Browse files
committed
OpenConceptLab/ocl_issues#2163 | Optional use of reranker for authorised users
1 parent 4fc3e00 commit e13ddd8

File tree

10 files changed

+159
-47
lines changed

10 files changed

+159
-47
lines changed

core/common/search.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pydash import compact, get
77

88
from core.common.constants import ES_REQUEST_TIMEOUT
9-
from core.common.utils import is_url_encoded_string
9+
from core.common.utils import is_url_encoded_string, get_cross_encoder
1010

1111

1212
class CustomESFacetedSearch(FacetedSearch):
@@ -203,28 +203,28 @@ def apply_aggregation_score_histogram(self):
203203
def apply_aggregation_score_stats(self):
204204
self._dsl_search.aggs.bucket("score", "stats", script="_score")
205205

206-
def to_queryset(self, keep_order=True, normalized_score=False, exact_count=True): # pylint:disable=too-many-locals
206+
def to_queryset(self, keep_order=True, normalized_score=False, exact_count=True, txt=None, encoder_model=None): # pylint:disable=too-many-locals,too-many-arguments
207207
"""
208208
This method returns a Django queryset from an Elasticsearch result.
209209
It costs one query to the SQL database.
210210
"""
211-
import time
212-
start_time = time.time()
213-
s, hits, total = self.__get_response(exact_count)
214-
print("ES query execute", time.time() - start_time)
211+
encoder = bool(txt)
212+
s, hits, total = self.__get_response(exact_count, encoder)
215213
max_score = hits.max_score or 1
216214

217-
start_time = time.time()
218-
for result in hits.hits:
215+
hits = get_cross_encoder(txt, hits.hits, encoder_model) if encoder else hits.hits
216+
for result in hits:
219217
_id = get(result, '_id')
218+
rerank_score = get(result, '_rerank_score')
219+
raw_score = get(result, '_score') or 0
220220
self.scores[int(_id)] = {
221-
'raw': get(result, '_score'),
222-
'normalized': (get(result, '_score') or 0) / max_score
223-
} if normalized_score else get(result, '_score')
221+
'raw': raw_score,
222+
'rerank': rerank_score,
223+
'normalized': rerank_score if encoder else (raw_score / max_score)
224+
} if normalized_score else raw_score
224225
highlight = get(result, 'highlight')
225226
if highlight:
226227
self.highlights[int(_id)] = highlight.to_dict()
227-
print("Highlights/Score", time.time() - start_time)
228228
if self.document and self.document.__name__ == 'RepoDocument':
229229
from core.sources.models import Source
230230
from core.collections.models import Collection
@@ -308,12 +308,14 @@ def append_to_bucket(_bucket, _score, count):
308308

309309
return [build_confidence(high), build_confidence(medium), build_confidence(low)]
310310

311-
def __get_response(self, exact_count=True):
311+
def __get_response(self, exact_count=True, load_fields=False):
312312
# Do not query again if the es result is already cached
313313
total = None
314314
if not hasattr(self._dsl_search, '_response'):
315315
# We only need the meta fields with the models ids
316-
s = self._dsl_search.source(False)
316+
s = self._dsl_search.source(
317+
excludes=['_embeddings', '_synonyms_embeddings']
318+
) if load_fields else self._dsl_search.source(False)
317319
s = s.params(request_timeout=ES_REQUEST_TIMEOUT)
318320
if exact_count:
319321
total = s.count()

core/common/serializers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ def validate_identifier(value):
173173
class SearchResultSerializer(Serializer): # pylint: disable=abstract-method
174174
match_type = CharField(source='_match_type', allow_null=True, allow_blank=True)
175175
search_score = FloatField(source='_score', allow_null=True)
176+
search_rerank_score = FloatField(source='_rerank_score', allow_null=True)
176177
search_confidence = CharField(source='_confidence', allow_null=True, allow_blank=True)
177178
search_highlight = SerializerMethodField()
178179

core/common/utils.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from requests import ConnectTimeout
2828
from requests.auth import HTTPBasicAuth
2929
from rest_framework.utils import encoders
30+
from sentence_transformers import CrossEncoder
3031

3132
from core.common.constants import UPDATED_SINCE_PARAM, BULK_IMPORT_QUEUES_COUNT, CURRENT_USER, REQUEST_URL, \
3233
TEMP_PREFIX
@@ -927,3 +928,51 @@ def get_embeddings(txt):
927928
from sentence_transformers import SentenceTransformer
928929
model = SentenceTransformer(settings.LM_MODEL_NAME)
929930
return model.encode(str(txt))
931+
932+
933+
ENCODERS = [
    # Default reranker: lightweight multilingual cross-encoder.
    # Training includes clinical, medical and question-answering datasets;
    # outputs positive similarity scores (not raw logits).
    # https://huggingface.co/BAAI/bge-reranker-v2-m3
    "BAAI/bge-reranker-v2-m3",

    # Evaluated alternatives, kept for reference:
    # - jinhybr/OA-MedBERT-cross-encoder (~110M, PubMed abstracts / biomedical
    #   QA, binary classifier logits) -- not available as a Hugging Face model.
    # - microsoft/BioLinkBERT-base (~120M, UMLS/PubMed/MeSH/SNOMED -- domain
    #   closest to OCL) -- does not work with sentence_transformers CrossEncoder.

    # General-purpose fallback, 22.7M params.
    # https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2
    # Outputs raw logits, so scores are not bounded to 0-1.
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
]

# Default encoder, loaded eagerly at import time (CPU only).
# NOTE(review): importing this module loads the model from disk/network --
# consider lazy initialization if import cost becomes a problem.
ENCODER = CrossEncoder(ENCODERS[0], device="cpu")

# Per-process cache so a non-default whitelisted model is only loaded once.
_ENCODER_CACHE = {}


def get_encoder(model):
    """Return the CrossEncoder for `model` if whitelisted, else the default.

    Loading a CrossEncoder is expensive, so non-default models are cached
    per process instead of being re-instantiated on every call.
    """
    if model in ENCODERS:
        if model not in _ENCODER_CACHE:
            _ENCODER_CACHE[model] = CrossEncoder(model, device="cpu")
        return _ENCODER_CACHE[model]
    return ENCODER


def get_cross_encoder(txt, hits, model=None):
    """Score each hit's `name` against `txt` and attach `_rerank_score`.

    Args:
        txt: the query text to rerank against.
        hits: iterable of ES hit dicts, each carrying a `_source` mapping.
        model: optional whitelisted model name; falls back to the default.

    Returns the same `hits`, mutated in place with a float `_rerank_score`.
    """
    # Guard against hits whose source has no `name` -- the original passed
    # None into encoder.predict(), which fails inside tokenization.
    docs = [get(dict(hit["_source"]), 'name') or '' for hit in hits]
    encoder = get_encoder(model) if model else ENCODER
    scores = encoder.predict([(txt, doc) for doc in docs])

    for hit, score in zip(hits, scores):
        hit["_rerank_score"] = float(score)
    return hits

core/concepts/views.py

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -797,7 +797,7 @@ def get_serializer_class(self):
797797

798798
return ConceptListSerializer
799799

800-
def filter_queryset(self, _=None): # pylint:disable=too-many-locals,too-many-statements
800+
def filter_queryset(self, _=None): # pylint: disable=too-many-locals
801801
rows = self.request.data.get('rows')
802802
target_repo_url = self.request.data.get('target_repo_url')
803803
target_repo_params = self.request.data.get('target_repo')
@@ -808,8 +808,8 @@ def filter_queryset(self, _=None): # pylint:disable=too-many-locals,too-many-st
808808
map_config = self.request.data.get('map_config', [])
809809
filters = self.request.data.get('filter', {})
810810
include_retired = self.request.query_params.get(INCLUDE_RETIRED_PARAM) in get_truthy_values()
811-
num_candidates = min(to_int(self.request.query_params.get('numCandidates', 0), 2000), 2000)
812-
k_nearest = min(to_int(self.request.query_params.get('kNearest', 0), 50), 50)
811+
num_candidates = min(to_int(self.request.query_params.get('numCandidates', 0), 3000), 3000)
812+
k_nearest = min(to_int(self.request.query_params.get('kNearest', 0), 100), 100)
813813
offset = max(to_int(self.request.GET.get('offset'), 0), 0)
814814
limit = max(to_int(self.request.GET.get('limit'), 0), 0) or self.default_limit
815815
page = max(to_int(self.request.GET.get('page'), 1), 1)
@@ -823,57 +823,82 @@ def filter_queryset(self, _=None): # pylint:disable=too-many-locals,too-many-st
823823
locale_filter = filters.pop('locale', None) if is_semantic else get(filters, 'locale', None)
824824
faceted_criterion = self.get_faceted_criterion(False, filters, minimum_should_match=1) if filters else None
825825
apply_for_name_locale = locale_filter and isinstance(locale_filter, str) and len(locale_filter.split(',')) == 1
826+
encoder_model = self.request.GET.get('encoder_model', None)
827+
reranker = self.request.GET.get('reranker', None) in get_truthy_values() # enables reranker
828+
reranker = reranker and self.request.user.is_mapper_cross_encoder_group
829+
score_to_sort = 'search_rerank_score' if reranker else 'search_normalized_score'
826830
results = []
827-
import time
828831
for row in rows:
829-
start_time = time.time()
830832
search = ConceptFuzzySearch.search(
831833
row, target_repo_url, repo_params, include_retired,
832834
is_semantic, num_candidates, k_nearest, map_config, faceted_criterion, locale_filter
833835
)
834-
print("Search Query", time.time() - start_time)
835-
start_time = time.time()
836836
search = search.params(track_total_hits=False, request_cache=True)
837837
es_search = CustomESSearch(search[start:end], ConceptDocument)
838-
es_search.to_queryset(False, True, False)
839-
print("Search to Queryset", time.time() - start_time)
838+
name = row.get('name') or row.get('Name') if reranker else None
839+
es_search.to_queryset(False, True, False, name, encoder_model)
840840
result = {'row': row, 'results': [], 'map_config': map_config, 'filter': filters}
841-
start_time = time.time()
842841
for concept in es_search.queryset:
843842
concept._highlight = es_search.highlights.get(concept.id, {}) # pylint:disable=protected-access
844843
score_info = es_search.scores.get(concept.id, {})
845-
score = get(score_info, 'raw') or None
846-
normalized_score = get(score_info, 'normalized') or None
847-
concept._score = score # pylint:disable=protected-access
848-
concept._normalized_score = normalized_score # pylint:disable=protected-access
849-
if limit > 1:
850-
concept._match_type = 'low' # pylint:disable=protected-access
851-
score_to_check = normalized_score if normalized_score is not None else score
852-
if concept._highlight.get('name', None) or (is_semantic and score_to_check >= score_threshold): # pylint:disable=protected-access
853-
concept._match_type = 'very_high' # pylint:disable=protected-access
854-
elif concept._highlight.get('synonyms', None): # pylint:disable=protected-access
855-
concept._match_type = 'high' # pylint:disable=protected-access
856-
elif concept._highlight: # pylint:disable=protected-access
857-
concept._match_type = 'medium' # pylint:disable=protected-access
858-
else:
859-
concept._match_type = 'very_high' # pylint:disable=protected-access
844+
normalized_score = get(score_info, 'normalized') or 0
845+
self.apply_score(concept, is_semantic, score_info, score_threshold, reranker, limit)
860846
if not best_match or concept._match_type in ['medium', 'high', 'very_high']: # pylint:disable=protected-access
861847
if apply_for_name_locale:
862848
concept._requested_locale = locale_filter # pylint:disable=protected-access
863849
serializer = ConceptDetailSerializer if self.is_verbose() else ConceptMinimalSerializer
864850
data = serializer(concept, context={'request': self.request}).data
865851
data['search_meta']['search_normalized_score'] = normalized_score * 100
866852
result['results'].append(data)
867-
print("Queryset to Serializer", time.time() - start_time)
868-
start_time = time.time()
869853
if 'results' in result:
870854
result['results'] = sorted(
871-
result['results'], key=lambda res: get(res, 'search_meta.search_normalized_score'), reverse=True)
855+
result['results'], key=lambda res: get(res, f'search_meta.{score_to_sort}'), reverse=True)
872856
results.append(result)
873-
print("Sorting", time.time() - start_time)
874857

875858
return results
876859

860+
@staticmethod
def apply_score(concept, is_semantic, scores, score_threshold, reranker, limit):  # pylint: disable=too-many-arguments,too-many-branches
    """Annotate `concept` with its search scores and a match-type bucket.

    Args:
        concept: the Concept instance being annotated (private `_score`,
            `_normalized_score`, `_rerank_score`, `_match_type` attributes set).
        is_semantic: whether the search ran in semantic (kNN) mode.
        scores: dict with 'raw'/'normalized'/'rerank' keys (or a bare raw
            score -- pydash `get` returns None for non-dicts).
        score_threshold: minimum normalized score for a 'very_high' match.
        reranker: whether cross-encoder rerank scores are in play.
        limit: requested result count; a single-result request (`limit <= 1`)
            is always bucketed 'very_high'.
    """
    score = get(scores, 'raw') or 0
    normalized_score = get(scores, 'normalized') or 0
    rerank_score = get(scores, 'rerank') or 0

    concept._score = score  # pylint:disable=protected-access
    concept._normalized_score = normalized_score  # pylint:disable=protected-access
    if reranker:
        concept._rerank_score = rerank_score  # pylint:disable=protected-access
    highlight = concept._highlight  # pylint:disable=protected-access

    match_type = 'low'
    if limit > 1:
        if is_semantic and reranker:
            # Cross-encoder buckets: `normalized` carries the rerank score
            # here, which the default encoder emits in [0, 1].
            if normalized_score >= 0.9:
                match_type = 'very_high'
            elif normalized_score >= 0.65:
                match_type = 'high'
            elif normalized_score >= 0.5:
                match_type = 'medium'
        elif is_semantic:
            # `normalized_score` is coerced to 0 above, never None, so the
            # original `if normalized_score is not None else score` fallback
            # was dead code -- the normalized score is always the one checked.
            if highlight.get('name', None) or normalized_score >= score_threshold:
                match_type = 'very_high'
            elif highlight.get('synonyms', None):
                match_type = 'high'
            elif highlight:
                match_type = 'medium'
        else:
            if highlight.get('name', None):
                match_type = 'very_high'
            elif highlight.get('synonyms', None):
                match_type = 'high'
            elif highlight:
                match_type = 'medium'
    else:
        # Single-result requests are treated as confident matches.
        match_type = 'very_high'

    concept._match_type = match_type  # pylint:disable=protected-access
901+
877902
@staticmethod
878903
def get_repo_params(is_semantic, target_repo_params, target_repo_url):
879904
repo = ConceptFuzzySearch.get_target_repo(target_repo_url)

core/fixtures/auth_groups.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,7 @@
4646
pk: 12
4747
fields:
4848
name: superadmin_user
49+
- model: "auth.group"
50+
pk: 13
51+
fields:
52+
name: mapper_cross_encoder
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.16 on 2025-12-05 03:45

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('map_projects', '0018_mapproject_candidates'),
    ]

    operations = [
        # Opt-in flag: whether this map project uses the cross-encoder
        # reranker when scoring match candidates (off by default).
        migrations.AddField(
            model_name='mapproject',
            name='reranker',
            field=models.BooleanField(default=False),
        ),
    ]

core/map_projects/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class MapProject(BaseModel):
3636
score_configuration = models.JSONField(default=default_score_configuration, null=True, blank=True)
3737
filters = models.JSONField(default=dict, null=True, blank=True)
3838
candidates = models.JSONField(default=dict, null=True, blank=True)
39+
reranker = models.BooleanField(default=False)
3940

4041
# Custom API
4142
match_api_url = models.TextField(null=True, blank=True)

core/map_projects/serializers.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class Meta:
2222
'created_by', 'updated_by', 'created_at', 'updated_at', 'url', 'is_active',
2323
'public_access', 'file', 'user_id', 'organization_id', 'description',
2424
'target_repo_url', 'matching_algorithm', 'include_retired', 'score_configuration',
25-
'match_api_url', 'match_api_token', 'batch_size', 'filters', 'candidates'
25+
'match_api_url', 'match_api_token', 'batch_size', 'filters', 'candidates', 'reranker'
2626
]
2727

2828
def prepare_object(self, validated_data, instance=None, file=None):
@@ -35,7 +35,7 @@ def prepare_object(self, validated_data, instance=None, file=None):
3535
if columns is not False:
3636
instance.columns = columns
3737
for attr in [
38-
'name', 'description', 'extras', 'target_repo_url', 'matching_algorithm', 'include_retired',
38+
'name', 'description', 'extras', 'target_repo_url', 'matching_algorithm', 'include_retired', 'reranker',
3939
'score_configuration', 'match_api_url', 'match_api_token', 'batch_size', 'filters', 'candidates'
4040
]:
4141
setattr(instance, attr, validated_data.get(attr, get(instance, attr)))
@@ -90,7 +90,8 @@ class Meta:
9090
'created_by', 'updated_by', 'created_at', 'updated_at', 'url', 'is_active',
9191
'owner', 'owner_type', 'owner_url', 'public_access',
9292
'target_repo_url', 'matching_algorithm', 'summary', 'logs', 'include_retired',
93-
'score_configuration', 'match_api_url', 'match_api_token', 'batch_size', 'filters', 'candidates'
93+
'score_configuration', 'match_api_url', 'match_api_token', 'batch_size', 'filters', 'candidates',
94+
'reranker'
9495
]
9596

9697
def __init__(self, *args, **kwargs):

core/users/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
MAPPER_AI_ASSISTANT_GROUP = 'mapper_ai_assistant'
1313
MAPPER_WAITLIST_GROUP = 'mapper-waitlist'
1414
MAPPER_APPROVED_GROUP = 'mapper-approved'
15+
MAPPER_CROSS_ENCODER_GROUP = 'mapper_cross_encoder'
1516
EARLY_ACCESS_NGO_GROUP = 'early_access_ngo'
1617
GUEST_GROUP = 'guest_user'
1718
STANDARD_GROUP = 'standard_user'
@@ -33,5 +34,6 @@
3334
PREMIUM_GROUP,
3435
STAFF_GROUP,
3536
SUPERADMIN_GROUP,
37+
MAPPER_CROSS_ENCODER_GROUP
3638
]
3739
INVALID_AUTH_GROUP_NAME = 'Invalid auth group.'

core/users/models.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from core.common.models import BaseModel, CommonLogoModel
1414
from core.common.tasks import send_user_verification_email, send_user_reset_password_email
1515
from core.common.utils import web_url
16-
from core.users.constants import AUTH_GROUPS, MAPPER_WAITLIST_GROUP, STAFF_GROUP, SUPERADMIN_GROUP, GUEST_GROUP
16+
from core.users.constants import AUTH_GROUPS, MAPPER_WAITLIST_GROUP, STAFF_GROUP, SUPERADMIN_GROUP, GUEST_GROUP, \
17+
MAPPER_APPROVED_GROUP, MAPPER_CROSS_ENCODER_GROUP
1718
from .constants import USER_OBJECT_TYPE
1819
from ..common.checksums import ChecksumModel
1920

@@ -226,6 +227,14 @@ def has_auth_group(self, group_name):
226227
def is_mapper_waitlisted(self):
227228
return self.has_auth_group(MAPPER_WAITLIST_GROUP)
228229

230+
@property
def is_mapper_approved(self):
    """True if the user belongs to the `mapper-approved` auth group."""
    return self.has_auth_group(MAPPER_APPROVED_GROUP)

@property
def is_mapper_cross_encoder_group(self):
    """True if the user may use the optional cross-encoder reranker,
    i.e. is a member of the `mapper_cross_encoder` auth group."""
    return self.has_auth_group(MAPPER_CROSS_ENCODER_GROUP)
237+
229238
@property
230239
def is_guest_group(self):
231240
return self.has_auth_group(GUEST_GROUP)

0 commit comments

Comments
 (0)