Skip to content

Commit d4dffa1

Browse files
dlqqq and srdas authored
Backport PR #733: Handle single files, pdfs, errors from missing loader dependencies in /learn (#744)
Co-authored-by: Sanjiv Das <[email protected]>
1 parent 4d77b00 commit d4dffa1

File tree

3 files changed: 13 additions and 13 deletions

3 files changed: 13 additions and 13 deletions

packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -125,14 +125,16 @@ async def process_message(self, message: HumanChatMessage):
125125
if args.verbose:
126126
self.reply(f"Loading and splitting files for {load_path}", message)
127127

128-
await self.learn_dir(
129-
load_path, args.chunk_size, args.chunk_overlap, args.all_files
130-
)
131-
self.save()
132-
133-
response = f"""🎉 I have learned from documents at **{load_path}** and
134-
I am ready to answer questions about them. You can ask questions about these
135-
documents by starting your message with **/ask**."""
128+
try:
129+
await self.learn_dir(
130+
load_path, args.chunk_size, args.chunk_overlap, args.all_files
131+
)
132+
except Exception as e:
133+
response = f"""Learn documents in **{load_path}** failed. {str(e)}."""
134+
else:
135+
self.save()
136+
response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
137+
You can ask questions about these docs by prefixing your message with **/ask**."""
136138
self.reply(response, message)
137139

138140
def _build_list_response(self):
@@ -163,7 +165,6 @@ async def learn_dir(
163165

164166
delayed = split(path, all_files, splitter=splitter)
165167
doc_chunks = await dask_client.compute(delayed)
166-
167168
em_provider_cls, em_provider_args = self.get_embedding_provider()
168169
delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args)
169170
embedding_records = await dask_client.compute(delayed)

packages/jupyter-ai/jupyter_ai/document_loaders/directory.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,12 @@
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
 from langchain.text_splitter import TextSplitter
-from pypdf import PdfReader


 # Uses pypdf which is used by PyPDFLoader from langchain
 def pdf_to_text(path):
-    reader = PdfReader(path)
-    text = "\n \n".join([page.extract_text() for page in reader.pages])
+    pages = PyPDFLoader(path)
+    text = "\n \n".join([page.page_content for page in pages.load_and_split()])
     return text


packages/jupyter-ai/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ test = [

 dev = ["jupyter_ai_magics[dev]"]

-all = ["jupyter_ai_magics[all]"]
+all = ["jupyter_ai_magics[all]", "pypdf"]

 [tool.hatch.version]
 source = "nodejs"

0 commit comments

Comments
 (0)