Skip to content

Commit fa599e1

Browse files
authored
Merge pull request #30 from man-group/search-speedup
Project results to remove heavyweight components
2 parents 67dabaf + 5e12c7c commit fa599e1

File tree

2 files changed

+25 −8 lines changed (25 additions, 8 deletions)

2 files changed

+25 −8 lines changed (25 additions, 8 deletions)

notebooker/serialization/mongo.py

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,9 @@
1010
from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError, NotebookResultPending
1111

1212
logger = getLogger(__name__)
13+
REMOVE_ID_PROJECTION = {"_id": 0}
14+
REMOVE_PAYLOAD_FIELDS_PROJECTION = {"raw_html_resources": 0, "raw_html": 0, "raw_ipynb_json": 0}
15+
REMOVE_PAYLOAD_FIELDS_AND_ID_PROJECTION = dict(REMOVE_PAYLOAD_FIELDS_PROJECTION, **REMOVE_ID_PROJECTION)
1316

1417

1518
class MongoResultSerializer:
@@ -238,9 +241,7 @@ def get_all_results(
238241
base_filter.update(mongo_filter)
239242
if since:
240243
base_filter.update({"update_time": {"$gt": since}})
241-
projection = (
242-
{"_id": 0} if load_payload else {"raw_html_resources": 0, "raw_html": 0, "raw_ipynb_json": 0, "_id": 0}
243-
)
244+
projection = REMOVE_ID_PROJECTION if load_payload else REMOVE_PAYLOAD_FIELDS_AND_ID_PROJECTION
244245
results = self.library.find(base_filter, projection).sort("update_time", -1).limit(limit)
245246
for res in results:
246247
if res:
@@ -253,8 +254,19 @@ def get_all_result_keys(self, limit: int = 0, mongo_filter: Optional[Dict] = Non
253254
base_filter = {"status": {"$ne": JobStatus.DELETED.value}}
254255
if mongo_filter:
255256
base_filter.update(mongo_filter)
256-
projection = {"report_name": 1, "job_id": 1, "_id": 0}
257-
for result in self.library.find(base_filter, projection).sort("update_time", -1).limit(limit):
257+
results = self.library.aggregate(
258+
[
259+
stage
260+
for stage in (
261+
{"$match": base_filter},
262+
{"$sort": {"update_time": -1}},
263+
{"$limit": limit} if limit else {},
264+
{"$project": {"report_name": 1, "job_id": 1}},
265+
)
266+
if stage
267+
]
268+
)
269+
for result in results:
258270
keys.append((result["report_name"], result["job_id"]))
259271
return keys
260272

@@ -313,6 +325,7 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
313325
results = self.library.aggregate(
314326
[
315327
{"$match": mongo_filter},
328+
{"$project": REMOVE_PAYLOAD_FIELDS_PROJECTION},
316329
{"$sort": {"update_time": -1}},
317330
{"$group": {"_id": "$overrides", "job_id": {"$first": "$job_id"}}},
318331
]

tests/unit/serialization/test_mongoose.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ def test_get_latest_job_id_for_name_and_params(_get_all_job_ids, conn, gridfs):
3232
def test__get_all_job_ids(conn, gridfs):
3333
serializer = MongoResultSerializer()
3434
serializer._get_all_job_ids("report_name", None, limit=1)
35-
serializer.library.find.assert_called_once_with(
36-
{"status": {"$ne": JobStatus.DELETED.value}, "report_name": "report_name"},
37-
{"_id": 0, "job_id": 1, "report_name": 1},
35+
serializer.library.aggregate.assert_called_once_with(
36+
[
37+
{"$match": {"status": {"$ne": JobStatus.DELETED.value}, "report_name": "report_name"}},
38+
{"$sort": {"update_time": -1}},
39+
{"$limit": 1},
40+
{"$project": {"report_name": 1, "job_id": 1}},
41+
]
3842
)

0 commit comments

Comments
 (0)