Skip to content

Commit 3ba1f08

Browse files
Merge pull request #81 from cloudera/mob/main
Mob/main
2 parents d32bcba + d56ca40 commit 3ba1f08

File tree

8 files changed

+235
-49
lines changed

8 files changed

+235
-49
lines changed

.project-metadata.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ environment_variables:
1616
required: false
1717
S3_RAG_BUCKET_PREFIX:
1818
default: "rag-studio"
19-
description: "A prefix added to all S3 paths used by Rag Studio. Only needed if S3 is used for file storage."
19+
description: "A prefix added to all S3 paths used by Rag Studio. Only recommended if S3 is used for file storage."
2020
required: false
2121
AWS_ACCESS_KEY_ID:
2222
default: ""

llm-service/app/ai/indexing/readers/pdf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,14 +123,15 @@ def process_with_docling(self, file_path: Path) -> list[TextNode] | None:
123123
[
124124
"docling",
125125
"-v",
126+
"--image-export-mode=placeholder",
126127
"--abort-on-error",
127128
f"--output={directory}",
128129
str(file_path),
129130
],
130131
stdout=f,
131132
stderr=f,
132133
)
133-
logger.debug(f"docling return code = {process.returncode}")
134+
logger.info(f"docling return code = {process.returncode}")
134135
# todo: figure out page numbers & look into the docling llama-index integration
135136
markdown_file_path = file_path.with_suffix(".md")
136137
if process.returncode == 0 and markdown_file_path.exists():

llm-service/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies = [
2626
"torch>=2.5.1",
2727
"pillow>=10.4.0",
2828
"transformers>=4.46.3",
29-
"docling>=2.7.0",
29+
"docling>=2.11.0",
3030
"llvmlite==0.43.0",
3131
"llama-index-llms-bedrock-converse>=0.2.0",
3232
]

llm-service/uv.lock

Lines changed: 113 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

local-dev.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
set -exo pipefail
4040
set -a && source .env && set +a
4141

42+
python3.10 scripts/validator/validate_env.py
43+
4244
export RAG_DATABASES_DIR=$(pwd)/databases
4345

4446
cleanup() {

scripts/01_install_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838

3939
import subprocess
4040

41+
print(subprocess.run(["python /home/cdsw/scripts/validator/validate_env.py"], shell=True, check=True))
42+
4143
print(
4244
subprocess.run(["bash /home/cdsw/scripts/install_java.sh"], shell=True, check=True)
4345
)

scripts/startup_app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,7 @@
3939
import subprocess
4040

4141
while True:
42+
print(subprocess.run(["python /home/cdsw/scripts/validator/validate_env.py"], shell=True, check=True))
43+
4244
print(subprocess.run(["bash /home/cdsw/scripts/startup_app.sh"], shell=True))
4345
print("Application Restarting")

scripts/validator/validate_env.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#
2+
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
3+
# (C) Cloudera, Inc. 2024
4+
# All rights reserved.
5+
#
6+
# Applicable Open Source License: Apache 2.0
7+
#
8+
# NOTE: Cloudera open source products are modular software products
9+
# made up of hundreds of individual components, each of which was
10+
# individually copyrighted. Each Cloudera open source product is a
11+
# collective work under U.S. Copyright Law. Your license to use the
12+
# collective work is as provided in your written agreement with
13+
# Cloudera. Used apart from the collective work, this file is
14+
# licensed for your use pursuant to the open source license
15+
# identified above.
16+
#
17+
# This code is provided to you pursuant a written agreement with
18+
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
19+
# this code. If you do not have a written agreement with Cloudera nor
20+
# with an authorized and properly licensed third party, you do not
21+
# have any rights to access nor to use this code.
22+
#
23+
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
24+
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
25+
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
26+
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
27+
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
28+
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
29+
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
30+
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
31+
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
32+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
33+
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
34+
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
35+
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
36+
# DATA.
37+
#
38+
39+
#
40+
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
41+
# (C) Cloudera, Inc. 2024
42+
# All rights reserved.
43+
#
44+
# Applicable Open Source License: Apache 2.0
45+
#
46+
#
47+
# This code is provided to you pursuant a written agreement with
48+
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
49+
# this code. If you do not have a written agreement with Cloudera nor
50+
# with an authorized and properly licensed third party, you do not
51+
# have any rights to access nor to use this code.
52+
#
53+
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
54+
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
55+
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
56+
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
57+
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
58+
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
59+
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
60+
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
61+
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
62+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
63+
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
64+
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
65+
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
66+
# DATA.
67+
#
68+
69+
import os
70+
import socket
71+
72+
def validate():
    """Check that the environment variables for the selected backends are set.

    Two independent choices are validated:

    * LLMs/embeddings: when ``CAII_DOMAIN`` is set, the domain must resolve
      via DNS; otherwise Bedrock is assumed and ``AWS_ACCESS_KEY_ID``,
      ``AWS_SECRET_ACCESS_KEY`` and ``AWS_DEFAULT_REGION`` are all required.
    * Document storage: when ``S3_RAG_DOCUMENT_BUCKET`` is set, the same three
      AWS variables are required; otherwise the project filesystem is used.

    A variable that is present but empty is treated as unset, because the
    project metadata declares these variables with an empty-string default.

    Prints a summary of the detected configuration on success.

    Raises:
        SystemExit: with exit status 1 when a required variable is missing or
            the CAII domain cannot be resolved.
    """

    def _env(name):
        # Normalise "not set" and "set to the empty string" to None so that a
        # blank default does not count as a configured value.
        return os.environ.get(name) or None

    print("Validating environment variables...")
    access_key_id = _env("AWS_ACCESS_KEY_ID")
    secret_key_id = _env("AWS_SECRET_ACCESS_KEY")
    default_region = _env("AWS_DEFAULT_REGION")
    document_bucket = _env("S3_RAG_DOCUMENT_BUCKET")

    caii_domain = _env("CAII_DOMAIN")

    # Both the Bedrock check and the S3 check need the same three values.
    has_aws_settings = all(
        value is not None
        for value in (access_key_id, secret_key_id, default_region)
    )

    # 1. if you don't have a caii_domain, you _must_ have an access key, secret key, and default region
    if caii_domain is None:
        if not has_aws_settings:
            print("ERROR: Using Bedrock for LLMs/embeddings; missing required environment variables: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION")
            raise SystemExit(1)
    else:
        try:
            socket.gethostbyname(caii_domain)
        except socket.error:
            print(f"ERROR: CAII domain {caii_domain} can not be resolved")
            raise SystemExit(1)
        else:
            print(f"CAII domain {caii_domain} can be resolved")

    # 2. if you have a document_bucket, you _must_ have an access key, secret key, and default region
    if document_bucket is not None and not has_aws_settings:
        print("ERROR: Using S3 for document storage; missing required environment variables: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION")
        raise SystemExit(1)

    if caii_domain is not None:
        print("Using CAII for LLMs/embeddings; CAII_DOMAIN is set")
    else:
        print("Using Bedrock for LLMs/embeddings; AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_DEFAULT_REGION are set")

    if document_bucket is not None:
        print("Using S3 for document storage (S3_RAG_DOCUMENT_BUCKET is set)")
    else:
        print("Using the project filesystem for document storage (S3_RAG_DOCUMENT_BUCKET is not set)")
    # TODO: verify that the bucket prefix is always optional
111+
112+
# Run the checks as soon as this file is executed; validate() exits with status 1 on failure.
validate()

0 commit comments

Comments
 (0)