Skip to content

Commit 3ea735d

Browse files
Add script for restoring the global summary index (#281)
* Add script for restoring the global summary index
* Clarify run directions
* Remove irrelevant change
* Remove irrelevant change
* Satisfy linters
1 parent 3be2e68 commit 3ea735d

File tree

1 file changed

+182
-0
lines changed

1 file changed

+182
-0
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#
2+
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
3+
# (C) Cloudera, Inc. 2024
4+
# All rights reserved.
5+
#
6+
# Applicable Open Source License: Apache 2.0
7+
#
8+
#
9+
# This code is provided to you pursuant a written agreement with
10+
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
11+
# this code. If you do not have a written agreement with Cloudera nor
12+
# with an authorized and properly licensed third party, you do not
13+
# have any rights to access nor to use this code.
14+
#
15+
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
16+
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
17+
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
18+
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
19+
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
20+
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
21+
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
22+
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
23+
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
24+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
25+
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
26+
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
27+
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
28+
# DATA.
29+
#
30+
31+
"""This script reconstructs RAG Studio's databases/doc_summary_index_global/index_store.json if somehow it (and only it) is corrupted.
32+
33+
NOTE:
34+
35+
* Make sure to back up the global directory!
36+
37+
Requirements:
38+
39+
* databases/doc_summary_index_global/docstore.json must exist.
40+
* Run this script from the llm-service/ directory:
41+
```python
42+
uv run python scripts/restore_global_index.py
43+
```
44+
45+
"""
46+
import json
47+
import os
48+
import sys
49+
import uuid
50+
from collections import defaultdict
51+
from time import sleep
52+
from typing import Any, cast
53+
54+
from llama_index.core.schema import (
55+
NodeRelationship,
56+
ObjectType,
57+
)
58+
from llama_index.core.storage.docstore.types import (
59+
DEFAULT_PERSIST_FNAME as DEFAULT_DOC_STORE_FILENAME,
60+
)
61+
from llama_index.core.storage.index_store.types import (
62+
DEFAULT_PERSIST_FNAME as DEFAULT_INDEX_STORE_FILENAME,
63+
)
64+
from pydantic import BaseModel
65+
66+
sys.path.append(".")
67+
from app.ai.indexing.summary_indexer import SummaryIndexer
68+
69+
GLOBAL_PERSIST_DIR = SummaryIndexer._SummaryIndexer__persist_root_dir() # type: ignore
70+
GLOBAL_INDEX_STORE_FILEPATH = os.path.join(
71+
GLOBAL_PERSIST_DIR,
72+
DEFAULT_INDEX_STORE_FILENAME,
73+
)
74+
GLOBAL_DOC_STORE_FILEPATH = os.path.join(
75+
GLOBAL_PERSIST_DIR,
76+
DEFAULT_DOC_STORE_FILENAME,
77+
)
78+
79+
80+
def load_doc_store() -> dict[str, Any]:
81+
with open(GLOBAL_DOC_STORE_FILEPATH, "r") as f:
82+
doc_store = json.load(f)
83+
return cast(dict[str, Any], doc_store)
84+
85+
86+
def write_index_store(index_store: dict[str, Any]) -> None:
87+
with open(GLOBAL_INDEX_STORE_FILEPATH, "w") as f:
88+
json.dump(index_store, f)
89+
90+
91+
class DataSource(BaseModel):
92+
id: int
93+
summary_id: uuid.UUID
94+
doc_summary_ids: list[uuid.UUID]
95+
96+
97+
def build_index_store(data_sources: list[DataSource]) -> dict[str, Any]:
98+
id_ = str(uuid.uuid4())
99+
100+
data = {
101+
"index_id": id_,
102+
"summary": None,
103+
"summary_id_to_node_ids": {
104+
str(data_source.summary_id): list(map(str, data_source.doc_summary_ids))
105+
for data_source in data_sources
106+
},
107+
"node_id_to_summary_id": {
108+
str(doc_summary_id): str(data_source.summary_id)
109+
for data_source in data_sources
110+
for doc_summary_id in data_source.doc_summary_ids
111+
},
112+
"doc_id_to_summary_id": {
113+
str(data_source.id): str(data_source.summary_id)
114+
for data_source in data_sources
115+
},
116+
}
117+
118+
return {
119+
"index_store/data": {
120+
id_: {
121+
"__type__": "document_summary",
122+
"__data__": json.dumps(data),
123+
}
124+
}
125+
}
126+
127+
128+
def read_doc_store(doc_store: dict[str, Any]) -> list[DataSource]:
129+
data_sources: dict[str, dict[str, Any]] = {}
130+
documents: dict[str, dict[str, Any]] = {}
131+
for summary_id, summary in doc_store["docstore/data"].items():
132+
match summary_type := summary["__type__"]:
133+
case ObjectType.TEXT: # data source
134+
data_sources[summary_id] = summary
135+
case ObjectType.DOCUMENT: # document
136+
documents[summary_id] = summary
137+
case _:
138+
raise ValueError(
139+
f"Unrecognized type for {summary_type} summary {summary_id}"
140+
)
141+
142+
data_source_documents: dict[str, list[str]] = defaultdict(list)
143+
for summary in documents.values():
144+
summary = summary["__data__"]
145+
source = summary["relationships"][NodeRelationship.SOURCE]
146+
147+
data_source_documents[source["node_id"]].append(summary["id_"])
148+
149+
ret: list[DataSource] = []
150+
for summary in data_sources.values():
151+
summary = summary["__data__"]
152+
source = summary["relationships"][NodeRelationship.SOURCE]
153+
154+
data_source = DataSource(
155+
id=source["node_id"],
156+
summary_id=summary["id_"],
157+
doc_summary_ids=data_source_documents[source["node_id"]], # type: ignore
158+
)
159+
print(
160+
f"Collected data source {data_source.id}",
161+
f"with {len(data_source.doc_summary_ids)} documents.",
162+
)
163+
ret.append(data_source)
164+
return ret
165+
166+
167+
def main() -> None:
168+
doc_store = load_doc_store()
169+
data_sources = read_doc_store(doc_store)
170+
index_store = build_index_store(data_sources)
171+
172+
print(
173+
"Waiting 5 seconds before writing index",
174+
"in case we want to cancel or something.",
175+
)
176+
sleep(5)
177+
write_index_store(index_store)
178+
print("It is written.")
179+
180+
181+
if __name__ == "__main__":
182+
main()

0 commit comments

Comments
 (0)