From 5dad71474a5757bcc3769379e9de7ca35803033d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 14:14:19 +0530 Subject: [PATCH 01/96] feat: add kubernetes app role selection Signed-off-by: Anupam Kumar --- appinfo/info.xml | 14 ++++++++++++++ context_chat_backend/controller.py | 15 ++++++++------- context_chat_backend/task_fetcher.py | 4 ++++ context_chat_backend/types.py | 8 ++++++++ context_chat_backend/utils.py | 13 ++++++++++++- 5 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 context_chat_backend/task_fetcher.py diff --git a/appinfo/info.xml b/appinfo/info.xml index 9760cd2..30194ba 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -82,5 +82,19 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve Password to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL. + + + rp + Request Processing Mode + APP_ROLE=rp + true + + + indexing + Indexing Mode + APP_ROLE=indexing + false + + diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index c26b930..0b6b53d 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -75,6 +75,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: if enabled: app_enabled.set() + # todo: start bg threads to fetch docs, updates and requests to process else: app_enabled.clear() @@ -213,6 +214,13 @@ def _(): return JSONResponse(content={'enabled': app_enabled.is_set()}, status_code=200) +@app.post('/countIndexedDocuments') +@enabled_guard(app) +def _(): + counts = exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) + return JSONResponse(counts) + + @app.post('/updateAccessDeclarative') @enabled_guard(app) def _( @@ -328,13 +336,6 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.post('/countIndexedDocuments') -@enabled_guard(app) -def _(): - counts = 
exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) - return JSONResponse(counts) - - @app.put('/loadSources') @enabled_guard(app) def _(sources: list[UploadFile]): diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py new file mode 100644 index 0000000..5e2f317 --- /dev/null +++ b/context_chat_backend/task_fetcher.py @@ -0,0 +1,4 @@ +# +# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later +# diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 500a97d..7868086 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from enum import Enum + from pydantic import BaseModel __all__ = [ @@ -71,3 +73,9 @@ class FatalEmbeddingException(EmbeddingException): Either malformed request, authentication error, or other non-retryable error. 
""" + + +class AppRole(str, Enum): + NORMAL = 'normal' + INDEXING = 'indexing' + RP = 'rp' diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index f6d6e67..224f466 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -4,6 +4,7 @@ # import logging import multiprocessing as mp +import os import re import traceback from collections.abc import Callable @@ -14,7 +15,7 @@ from fastapi.responses import JSONResponse as FastAPIJSONResponse -from .types import TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig +from .types import AppRole, TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig T = TypeVar('T') _logger = logging.getLogger('ccb.utils') @@ -144,3 +145,13 @@ def redact_config(config: TConfig | TEmbeddingConfig) -> TConfig | TEmbeddingCon em_conf.auth.password = '***REDACTED***' # noqa: S105 return config_copy + + +def get_app_role() -> AppRole: + role = os.getenv('APP_ROLE', '').lower() + if role == '': + return AppRole.NORMAL + if role not in ['indexing', 'rp']: + _logger.warning(f'Invalid app role: {role}, defaulting to all roles') + return AppRole.NORMAL + return AppRole(role) From 089d27a41643c165d0474258c840ba6e048279a9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 16:42:41 +0530 Subject: [PATCH 02/96] feat: add thread start and stop logic Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 17 ++++-- context_chat_backend/task_fetcher.py | 82 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 0b6b53d..fadc5f8 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -42,6 +42,7 @@ from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .task_fetcher 
import start_bg_threads, stop_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -73,11 +74,16 @@ app_enabled = Event() def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: - if enabled: - app_enabled.set() - # todo: start bg threads to fetch docs, updates and requests to process - else: - app_enabled.clear() + try: + if enabled: + app_enabled.set() + start_bg_threads() + else: + app_enabled.clear() + stop_bg_threads() + except Exception as e: + logger.exception('Error in enabled handler:', exc_info=e) + return f'Error in enabled handler: {e}' logger.info(f'App {("disabled", "enabled")[enabled]}') return '' @@ -95,6 +101,7 @@ async def lifespan(app: FastAPI): yield vectordb_loader.offload() llm_loader.offload() + stop_bg_threads() app_config = get_config(os.environ['CC_CONFIG_PATH']) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5e2f317..9660b44 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,3 +2,85 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + +from enum import Enum +from threading import Thread + +from .types import AppRole +from .utils import get_app_role + +APP_ROLE = get_app_role() +THREADS = {} +THREADS_STOP_EVENTS = {} + + +class ThreadType(Enum): + FILES_INDEXING = 'files_indexing' + UPDATES_PROCESSING = 'updates_processing' + REQUEST_PROCESSING = 'request_processing' + + +def files_indexing_thread(): + ... + + +def updates_processing_thread(): + ... + + +def request_processing_thread(): + ... 
+ + +def start_bg_threads(): + match APP_ROLE: + case AppRole.INDEXING | AppRole.NORMAL: + THREADS[ThreadType.FILES_INDEXING] = Thread( + target=files_indexing_thread, + name='FilesIndexingThread', + daemon=True, + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + name='UpdatesProcessingThread', + daemon=True, + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + case AppRole.RP | AppRole.NORMAL: + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + name='RequestProcessingThread', + daemon=True, + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() + + +def stop_bg_threads(): + match APP_ROLE: + case AppRole.INDEXING | AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING not in THREADS + or ThreadType.UPDATES_PROCESSING not in THREADS + or ThreadType.FILES_INDEXING not in THREADS_STOP_EVENTS + or ThreadType.UPDATES_PROCESSING not in THREADS_STOP_EVENTS + ): + return + THREADS_STOP_EVENTS[ThreadType.FILES_INDEXING].set() + THREADS_STOP_EVENTS[ThreadType.UPDATES_PROCESSING].set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) + THREADS_STOP_EVENTS.pop(ThreadType.FILES_INDEXING) + THREADS_STOP_EVENTS.pop(ThreadType.UPDATES_PROCESSING) + case AppRole.RP | AppRole.NORMAL: + if ( + ThreadType.REQUEST_PROCESSING not in THREADS + or ThreadType.REQUEST_PROCESSING not in THREADS_STOP_EVENTS + ): + return + THREADS_STOP_EVENTS[ThreadType.REQUEST_PROCESSING].set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + THREADS.pop(ThreadType.REQUEST_PROCESSING) + THREADS_STOP_EVENTS.pop(ThreadType.REQUEST_PROCESSING) From 64ffdaf2b83dae9f450a86024cad9f3a41849c30 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 9 Mar 2026 19:22:45 +0530 Subject: [PATCH 03/96] wip: migrate the indexing process Signed-off-by: Anupam Kumar --- 
.../chain/ingest/doc_loader.py | 53 +-- context_chat_backend/chain/ingest/injest.py | 201 ++++++----- context_chat_backend/controller.py | 165 +++++----- .../{chain/ingest => }/mimetype_list.py | 0 context_chat_backend/task_fetcher.py | 311 ++++++++++++++++-- context_chat_backend/types.py | 121 ++++++- context_chat_backend/vectordb/base.py | 9 +- context_chat_backend/vectordb/pgvector.py | 61 ++-- 8 files changed, 659 insertions(+), 262 deletions(-) rename context_chat_backend/{chain/ingest => }/mimetype_list.py (100%) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index efb81b6..d26f74b 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,11 +7,10 @@ import re import tempfile from collections.abc import Callable -from typing import BinaryIO +from io import BytesIO import docx2txt from epub2txt import epub2txt -from fastapi import UploadFile from langchain_unstructured import UnstructuredLoader from odfdo import Document from pandas import read_csv, read_excel @@ -19,9 +18,11 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf +from ...types import SourceItem + logger = logging.getLogger('ccb.doc_loader') -def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str: +def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() with tempfile.NamedTemporaryFile(mode='wb') as tmp: tmp.write(raw_bytes) @@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str # -- LOADERS -- # -def _load_pdf(file: BinaryIO) -> str: +def _load_pdf(file: BytesIO) -> str: pdf_reader = PdfReader(file) return '\n\n'.join([page.extract_text().strip() for page in pdf_reader.pages]) -def _load_csv(file: BinaryIO) -> str: +def _load_csv(file: BytesIO) -> str: return read_csv(file).to_string(header=False, 
na_rep='') -def _load_epub(file: BinaryIO) -> str: +def _load_epub(file: BytesIO) -> str: return _temp_file_wrapper(file, epub2txt).strip() -def _load_docx(file: BinaryIO) -> str: +def _load_docx(file: BytesIO) -> str: return docx2txt.process(file).strip() -def _load_odt(file: BinaryIO) -> str: +def _load_odt(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: Document(fp).get_formatted_text()).strip() -def _load_ppt_x(file: BinaryIO) -> str: +def _load_ppt_x(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: UnstructuredLoader(fp).load()).strip() -def _load_rtf(file: BinaryIO) -> str: +def _load_rtf(file: BytesIO) -> str: return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip() -def _load_xml(file: BinaryIO) -> str: +def _load_xml(file: BytesIO) -> str: data = file.read().decode('utf-8', 'ignore') data = re.sub(r'', '', data) return data.strip() -def _load_xlsx(file: BinaryIO) -> str: +def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BinaryIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: return None @@ -115,30 +116,34 @@ def attachment_partitioner( } -def decode_source(source: UploadFile) -> str | None: +def decode_source(source: SourceItem) -> str | None: + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors - if source.headers['title'].endswith('.pot'): + if source.title.endswith('.pot'): return None - mimetype = source.headers['type'] + mimetype = source.type if mimetype is None: return None + if isinstance(source.content, str): + io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) + else: + io_obj = source.content + if _loader_map.get(mimetype): - result = _loader_map[mimetype](source.file) - source.file.close() + 
result = _loader_map[mimetype](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - result = source.file.read().decode('utf-8', 'ignore') - source.file.close() - return result + return io_obj.read().decode('utf-8', 'ignore') except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.filename}) is encrypted and cannot be read') + logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') return None except Exception: - logger.exception(f'Error decoding source file ({source.filename})', stack_info=True) + logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) return None finally: - source.file.close() # Ensure file is closed after processing + if io_obj is not None: + io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 5871ebb..0eb70e0 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -5,29 +5,23 @@ import logging import re -from fastapi.datastructures import UploadFile from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import TConfig -from ...utils import is_valid_source_id, to_int +from ...types import IndexingError, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument from .doc_loader import decode_source from .doc_splitter import get_splitter_for -from .mimetype_list import SUPPORTED_MIMETYPES logger = logging.getLogger('ccb.injest') -def _allowed_file(file: UploadFile) -> bool: - return file.headers['type'] in SUPPORTED_MIMETYPES - def _filter_sources( vectordb: BaseVectorDB, - sources: list[UploadFile] -) -> tuple[list[UploadFile], list[UploadFile]]: + sources: dict[int, SourceItem] +) -> tuple[dict[int, SourceItem], dict[int, SourceItem]]: ''' Returns ------- @@ -37,30 +31,42 @@ def 
_filter_sources( ''' try: - existing_sources, new_sources = vectordb.check_sources(sources) + existing_source_ids, to_embed_source_ids = vectordb.check_sources(sources) except Exception as e: - raise DbException('Error: Vectordb sources_to_embed error') from e + raise DbException('Error: Vectordb error while checking existing sources in indexing') from e + + existing_sources = {} + to_embed_sources = {} - return ([ - source for source in sources - if source.filename in existing_sources - ], [ - source for source in sources - if source.filename in new_sources - ]) + for db_id, source in sources.items(): + if source.reference in existing_source_ids: + existing_sources[db_id] = source + elif source.reference in to_embed_source_ids: + to_embed_sources[db_id] = source + return existing_sources, to_embed_sources -def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[InDocument]: - indocuments = [] - for source in sources: - logger.debug('processing source', extra={ 'source_id': source.filename }) +def _sources_to_indocuments( + config: TConfig, + sources: dict[int, SourceItem] +) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + indocuments = {} + errored_docs = {} + for db_id, source in sources.items(): + logger.debug('processing source', extra={ 'source_id': source.reference }) + + # todo: maybe fetch the content of the files here # transform the source to have text data content = decode_source(source) if content is None or (content := content.strip()) == '': - logger.debug('decoded empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue # replace more than two newlines with two newlines (also blank spaces, more than 4) @@ -71,94 +77,123 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[ content = 
content.replace('\0', '') if content is None or content == '': - logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue - logger.debug('decoded non empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded non empty source', extra={ 'source_id': source.reference }) metadata = { - 'source': source.filename, - 'title': _decode_latin_1(source.headers['title']), - 'type': source.headers['type'], + 'source': source.reference, + 'title': _decode_latin_1(source.title), + 'type': source.type, } doc = Document(page_content=content, metadata=metadata) - splitter = get_splitter_for(config.embedding_chunk_size, source.headers['type']) + splitter = get_splitter_for(config.embedding_chunk_size, source.type) split_docs = splitter.split_documents([doc]) logger.debug('split document into chunks', extra={ - 'source_id': source.filename, + 'source_id': source.reference, 'len(split_docs)': len(split_docs), }) - indocuments.append(InDocument( + indocuments[db_id] = InDocument( documents=split_docs, - userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source_id=source.filename, # pyright: ignore[reportArgumentType] - provider=source.headers['provider'], - modified=to_int(source.headers['modified']), - )) + userIds=list(map(_decode_latin_1, source.userIds)), + source_id=source.reference, + provider=source.provider, + modified=source.modified, # pyright: ignore[reportArgumentType] + ) + + return indocuments, errored_docs + + +def _increase_access_for_existing_sources( + vectordb: BaseVectorDB, + existing_sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: + ''' + update userIds for existing sources + allow the userIds as additional users, not as the only users + ''' + if len(existing_sources) 
== 0: + return {} - return indocuments + results = {} + logger.debug('Increasing access for existing sources', extra={ + 'source_ids': [source.reference for source in existing_sources.values()] + }) + for db_id, source in existing_sources.items(): + try: + vectordb.update_access( + UpdateAccessOp.allow, + list(map(_decode_latin_1, source.userIds)), + source.reference, + ) + results[db_id] = None + except SafeDbException as e: + logger.error(f'Failed to update access for source ({source.reference}): {e.args[0]}') + results[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue + except Exception as e: + logger.error(f'Unexpected error while updating access for source ({source.reference}): {e}') + results[db_id] = IndexingError( + error='Unexpected error while updating access', + retryable=True, + ) + continue + return results def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
''' - existing_sources, filtered_sources = _filter_sources(vectordb, sources) + existing_sources, to_embed_sources = _filter_sources(vectordb, sources) logger.debug('db filter source results', extra={ 'len(existing_sources)': len(existing_sources), 'existing_sources': existing_sources, - 'len(filtered_sources)': len(filtered_sources), - 'filtered_sources': filtered_sources, + 'len(to_embed_sources)': len(to_embed_sources), + 'to_embed_sources': to_embed_sources, }) - loaded_source_ids = [source.filename for source in existing_sources] - # update userIds for existing sources - # allow the userIds as additional users, not as the only users - if len(existing_sources) > 0: - logger.debug('Increasing access for existing sources', extra={ - 'source_ids': [source.filename for source in existing_sources] - }) - for source in existing_sources: - try: - vectordb.update_access( - UpdateAccessOp.allow, - list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source.filename, # pyright: ignore[reportArgumentType] - ) - except SafeDbException as e: - logger.error(f'Failed to update access for source ({source.filename}): {e.args[0]}') - continue - - if len(filtered_sources) == 0: + source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + + if len(to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.filename for source in filtered_sources] + 'source_ids': [source.reference for source in to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments = _sources_to_indocuments(config, filtered_sources) + indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) - logger.debug('Converted all sources to documents') + 
source_proc_results.update(errored_docs) + logger.debug('Converted sources to documents') if len(indocuments) == 0: # filtered document(s) were invalid/empty, not an error logger.debug('All documents were found empty after being processed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results - added_source_ids, retry_source_ids = vectordb.add_indocuments(indocuments) - loaded_source_ids.extend(added_source_ids) + doc_add_results = vectordb.add_indocuments(indocuments) + source_proc_results.update(doc_add_results) logger.debug('Added documents to vectordb') - return loaded_source_ids, retry_source_ids # pyright: ignore[reportReturnType] + return source_proc_results def _decode_latin_1(s: str) -> str: @@ -172,31 +207,15 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: - # either not a file or a file that is allowed - sources_filtered = [ - source for source in sources - if is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - or _allowed_file(source) - ] - + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ - f'{source.filename} ({_decode_latin_1(source.headers["title"])})' - for source in sources_filtered - ], - 'invalid_source_ids': [ - source.filename for source in sources - if not is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - ], - 'not_allowed_file_ids': [ - source.filename for source in sources - if not _allowed_file(source) + f'{source.reference} ({_decode_latin_1(source.title)})' + for source in sources.values() ], - 'len(source_ids)': len(sources_filtered), - 'len(total_source_ids)': len(sources), + 'len(source_ids)': len(sources), }) vectordb = vectordb_loader.load() - return _process_sources(vectordb, config, sources_filtered) + return _process_sources(vectordb, 
config, sources) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index fadc5f8..3e70ee1 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -27,7 +27,7 @@ from time import sleep from typing import Annotated, Any -from fastapi import Body, FastAPI, Request, UploadFile +from fastapi import Body, FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -35,14 +35,13 @@ from starlette.responses import FileResponse from .chain.context import do_doc_search -from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query, process_query from .config_parser import get_config from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of -from .task_fetcher import start_bg_threads, stop_bg_threads +from .task_fetcher import start_bg_threads, wait_for_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -57,6 +56,7 @@ repair_run() ensure_config_file() logger = logging.getLogger('ccb.controller') +app_config = get_config(os.environ['CC_CONFIG_PATH']) __download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') models_to_fetch = { @@ -77,10 +77,10 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: app_enabled.set() - start_bg_threads() + start_bg_threads(app_config, app_enabled) else: app_enabled.clear() - stop_bg_threads() + wait_for_bg_threads() except Exception as e: logger.exception('Error in enabled handler:', exc_info=e) return f'Error in enabled handler: {e}' @@ -101,10 +101,9 @@ async def lifespan(app: FastAPI): yield 
vectordb_loader.offload() llm_loader.offload() - stop_bg_threads() + wait_for_bg_threads() -app_config = get_config(os.environ['CC_CONFIG_PATH']) app = FastAPI(debug=app_config.debug, lifespan=lifespan) # pyright: ignore[reportArgumentType] app.extra['CONFIG'] = app_config @@ -343,86 +342,78 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.put('/loadSources') -@enabled_guard(app) -def _(sources: list[UploadFile]): - global _indexing - - if len(sources) == 0: - return JSONResponse('No sources provided', 400) - - filtered_sources = [] - - for source in sources: - if not value_of(source.filename): - logger.warning('Skipping source with invalid source_id', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - }) - continue - - with index_lock: - if source.filename in _indexing: - # this request will be retried by the client - return JSONResponse( - f'This source ({source.filename}) is already being processed in another request, try again later', - 503, - headers={'cc-retry': 'true'}, - ) - - if not ( - value_of(source.headers.get('userIds')) - and source.headers.get('title', None) is not None - and value_of(source.headers.get('type')) - and value_of(source.headers.get('modified')) - and source.headers['modified'].isdigit() - and value_of(source.headers.get('provider')) - ): - logger.warning('Skipping source with invalid/missing headers', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - 'headers': source.headers, - }) - continue - - filtered_sources.append(source) - - # wait for 10 minutes before failing the request - semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) - if not semres: - return JSONResponse( - 'Document parser worker limit reached, try again in some time or consider increasing the limit', - 503, - headers={'cc-retry': 'true'} - ) - - with index_lock: - for source in filtered_sources: - _indexing[source.filename] = source.size - - try: - 
loaded_sources, not_added_sources = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app.extra['CONFIG'], filtered_sources) - ) - except (DbException, EmbeddingException): - raise - except Exception as e: - raise DbException('Error: failed to load sources') from e - finally: - with index_lock: - for source in filtered_sources: - _indexing.pop(source.filename, None) - doc_parse_semaphore.release() - - if len(loaded_sources) != len(filtered_sources): - logger.debug('Some sources were not loaded', extra={ - 'Count of loaded sources': f'{len(loaded_sources)}/{len(filtered_sources)}', - 'source_ids': loaded_sources, - }) - - # loaded sources include the existing sources that may only have their access updated - return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) +# @app.put('/loadSources') +# @enabled_guard(app) +# def _(sources: list[UploadFile]): +# global _indexing + +# if len(sources) == 0: +# return JSONResponse('No sources provided', 400) + +# for source in sources: +# if not value_of(source.filename): +# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) + +# with index_lock: +# if source.filename in _indexing: +# # this request will be retried by the client +# return JSONResponse( +# f'This source ({source.filename}) is already being processed in another request, try again later', +# 503, +# headers={'cc-retry': 'true'}, +# ) + +# if not ( +# value_of(source.headers.get('userIds')) +# and source.headers.get('title', None) is not None +# and value_of(source.headers.get('type')) +# and value_of(source.headers.get('modified')) +# and source.headers['modified'].isdigit() +# and value_of(source.headers.get('provider')) +# ): +# logger.error('Invalid/missing headers received', extra={ +# 'source_id': source.filename, +# 'title': source.headers.get('title'), +# 'headers': source.headers, +# }) +# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) + +# 
# wait for 10 minutes before failing the request +# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) +# if not semres: +# return JSONResponse( +# 'Document parser worker limit reached, try again in some time or consider increasing the limit', +# 503, +# headers={'cc-retry': 'true'} +# ) + +# with index_lock: +# for source in sources: +# _indexing[source.filename] = source.size + +# try: +# loaded_sources, not_added_sources = exec_in_proc( +# target=embed_sources, +# args=(vectordb_loader, app.extra['CONFIG'], sources) +# ) +# except (DbException, EmbeddingException): +# raise +# except Exception as e: +# raise DbException('Error: failed to load sources') from e +# finally: +# with index_lock: +# for source in sources: +# _indexing.pop(source.filename, None) +# doc_parse_semaphore.release() + +# if len(loaded_sources) != len(sources): +# logger.debug('Some sources were not loaded', extra={ +# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', +# 'source_ids': loaded_sources, +# }) + +# # loaded sources include the existing sources that may only have their access updated +# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) class Query(BaseModel): diff --git a/context_chat_backend/chain/ingest/mimetype_list.py b/context_chat_backend/mimetype_list.py similarity index 100% rename from context_chat_backend/chain/ingest/mimetype_list.py rename to context_chat_backend/mimetype_list.py diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 9660b44..a548bcf 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,15 +3,41 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio +import logging +from contextlib import suppress from enum import Enum -from threading import Thread +from io import BytesIO +from threading import Event, Thread +from time import sleep -from .types import AppRole -from .utils import 
get_app_role +import niquests +from nc_py_api import AsyncNextcloudApp, NextcloudApp +from pydantic import ValidationError + +from .chain.ingest.injest import embed_sources +from .dyn_loader import VectorDBLoader +from .types import ( + AppRole, + EmbeddingException, + FilesQueueItem, + IndexingError, + IndexingException, + LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) +from .utils import exec_in_proc, get_app_role +from .vectordb.types import DbException APP_ROLE = get_app_role() THREADS = {} -THREADS_STOP_EVENTS = {} +LOGGER = logging.getLogger('ccb.task_fetcher') +FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? class ThreadType(Enum): @@ -20,67 +46,294 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -def files_indexing_thread(): - ... +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + # todo: raise IndexingException with retryable=True for rate limit errors, + # todo: and handle it in the caller to not delete the source from the queue and retry later through + # todo: the normal lock expiry mechanism + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + LOGGER.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + files: dict[int, ReceivedFileItem] +) -> dict[int, SourceItem | IndexingError]: + source_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for file_id, file_item in files.items(): + if file_item.size > MAX_FILE_SIZE: + LOGGER.info( + f'Skipping file id {file_id}, source id {file_item.reference} due to size' + f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + source_items[file_id] = IndexingError( + error=( + f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + continue + # todo: perform the existing file check before fetching the content to avoid unnecessary fetches + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + results = await asyncio.gather(*tasks, return_exceptions=True) + for (file_id, file_item), result in zip(files.items(), results, strict=True): + if isinstance(result, IndexingException): + LOGGER.error( + f'Error fetching content for file 
id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[file_id] = SourceItem( + **file_item.model_dump(), + content=result, + ) + elif isinstance(result, BaseException): + LOGGER.error( + f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + LOGGER.error( + f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=True, + ) + source_items[file_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + return source_items + + +def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + try: + return exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, source_items), + ) + except (DbException, EmbeddingException): + raise + except Exception as e: + raise DbException('Error: failed to load sources') from e -def updates_processing_thread(): + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + # todo: add the 'size' param to the return of this call. 
+ q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/documents', + params={ 'n': FILES_INDEXING_BATCH_SIZE } + ) + + try: + q_items = FilesQueueItem.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + + # populate files content and convert to source items + fetched_files = {} + source_files = {} + # unified error structure for files and content providers + source_errors = {} + + if q_items.files: + fetched_files = asyncio.run(__fetch_files_content(q_items.files)) + + for file_id, result in fetched_files.items(): + if isinstance(result, SourceItem): + source_files[file_id] = result + else: + source_errors[file_id] = result + + files_result = _load_sources(source_files) + providers_result = _load_sources(q_items.content_providers) + + if ( + any(isinstance(res, IndexingError) for res in files_result.values()) + or any(isinstance(res, IndexingError) for res in providers_result.values()) + ): + LOGGER.error('Some sources failed to index', extra={ + 'file_errors': { + file_id: error + for file_id, error in files_result.items() + if isinstance(error, IndexingError) + }, + 'provider_errors': { + provider_id: error + for provider_id, error in providers_result.items() + if isinstance(error, IndexingError) + }, + }) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching documents to index, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching documents to index:', exc_info=e) + sleep(5) + continue + + # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable + to_delete_file_ids = [ + file_id for file_id, result in files_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + to_delete_provider_ids = [ + provider_id for provider_id, 
result in providers_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting indexing results, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting indexing results:', exc_info=e) + sleep(5) + continue + + + +def updates_processing_thread(app_config: TConfig): ... -def request_processing_thread(): +def request_processing_thread(app_config: TConfig): ... -def start_bg_threads(): +def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, + args=(app_config, Event), name='FilesIndexingThread', - daemon=True, ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, + args=(app_config, Event), name='UpdatesProcessingThread', - daemon=True, ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, + args=(app_config, Event), name='RequestProcessingThread', - daemon=True, ) THREADS[ThreadType.REQUEST_PROCESSING].start() -def stop_bg_threads(): +def wait_for_bg_threads(): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING not in THREADS - or ThreadType.UPDATES_PROCESSING not in THREADS - or ThreadType.FILES_INDEXING not in 
THREADS_STOP_EVENTS - or ThreadType.UPDATES_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.FILES_INDEXING].set() - THREADS_STOP_EVENTS[ThreadType.UPDATES_PROCESSING].set() THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.FILES_INDEXING) - THREADS_STOP_EVENTS.pop(ThreadType.UPDATES_PROCESSING) case AppRole.RP | AppRole.NORMAL: - if ( - ThreadType.REQUEST_PROCESSING not in THREADS - or ThreadType.REQUEST_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.REQUEST_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.REQUEST_PROCESSING].set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.REQUEST_PROCESSING) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 7868086..97d48ce 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,8 +3,13 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from enum import Enum +from io import BytesIO +from typing import Self -from pydantic import BaseModel +from pydantic import BaseModel, field_validator + +from .mimetype_list import SUPPORTED_MIMETYPES +from .utils import is_valid_provider_id, is_valid_source_id __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -17,6 +22,7 @@ ] DEFAULT_EM_MODEL_ALIAS = 'em_model' +FILES_PROVIDER_ID = 'files__default' class TEmbeddingAuthApiKey(BaseModel): @@ -79,3 +85,116 @@ class AppRole(str, Enum): NORMAL = 'normal' INDEXING = 'indexing' RP = 'rp' + + +class CommonSourceItem(BaseModel): + userIds: list[str] + reference: str # source_id of the form "appId__providerId: itemId" + title: str + modified: int | str # todo: int/string? 
+ type: str + provider: str + size: int + + @field_validator('modified', mode='before') + @classmethod + def validate_modified(cls, v): + if isinstance(v, int): + return v + if isinstance(v, str): + try: + return int(v) + except ValueError as e: + raise ValueError(f'Invalid modified value: {v}') from e + raise ValueError(f'Invalid modified type: {type(v)}') + + @field_validator('reference', 'title', 'type', 'provider') + @classmethod + def validate_strings_non_empty(cls, v): + if not isinstance(v, str) or v.strip() == '': + raise ValueError('Must be a non-empty string') + return v.strip() + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + if ( + not isinstance(self.userIds, list) + or not all( + isinstance(uid, str) + and uid.strip() != '' + for uid in self.userIds + ) + or len(self.userIds) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + self.userIds = [uid.strip() for uid in self.userIds] + return self + + @field_validator('reference', mode='after') + def validate_reference_format(self) -> Self: + # validate reference format: "appId__providerId: itemId" + if not is_valid_source_id(self.reference): + raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') + return self + + @field_validator('provider', mode='after') + def validate_provider_format(self) -> Self: + # validate provider format: "appId__providerId" + if not is_valid_provider_id(self.provider): + raise ValueError('Invalid provider format, must be "appId__providerId"') + return self + + @field_validator('type', mode='after') + def validate_type(self) -> Self: + if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: + raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') + return self + + @field_validator('size', mode='after') + def validate_size(self) -> Self: + if not isinstance(self.size, int) or self.size < 0: + raise 
ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') + return self + + +class ReceivedFileItem(CommonSourceItem): + content: None + + +class SourceItem(CommonSourceItem): + ''' + Used for the unified queue of items to process, after fetching the content for files + and for directly fetched content providers. + ''' + content: str | BytesIO + + @field_validator('content') + @classmethod + def validate_content(cls, v): + if isinstance(v, str): + if v.strip() == '': + raise ValueError('Content must be a non-empty string') + return v.strip() + if isinstance(v, BytesIO): + if v.getbuffer().nbytes == 0: + raise ValueError('Content must be a non-empty BytesIO') + return v + raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + + +class FilesQueueItem(BaseModel): + files: dict[int, ReceivedFileItem] # [db id]: FileItem + content_providers: dict[int, SourceItem] # [db id]: SourceItem + + +class IndexingException(Exception): + retryable: bool = False + + def __init__(self, message: str, retryable: bool = False): + super().__init__(message) + self.retryable = retryable + + +class IndexingError(BaseModel): + error: str + retryable: bool = False diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index 0bf1020..ebd5407 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -5,12 +5,12 @@ from abc import ABC, abstractmethod from typing import Any -from fastapi import UploadFile from langchain.schema import Document from langchain.schema.embeddings import Embeddings from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType +from ..types import IndexingError, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +62,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list[str]]: + def 
add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,10 +79,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list @timed @abstractmethod - def check_sources( - self, - sources: list[UploadFile], - ) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 2b7fc06..f5879fe 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -11,14 +11,13 @@ import sqlalchemy.dialects.postgresql as postgresql_dialects import sqlalchemy.orm as orm from dotenv import load_dotenv -from fastapi import UploadFile from langchain.schema import Document from langchain.vectorstores import VectorStore from langchain_core.embeddings import Embeddings from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, RetryableEmbeddingException +from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -130,17 +129,16 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], list[str]]: + def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails """ - added_sources = [] - retry_sources = [] + results = {} batch_size = 
PG_BATCH_SIZE // 5 with self.session_maker() as session: - for indoc in indocuments: + for php_db_id, indoc in indocuments.items(): try: # query paramerters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html) # so we chunk the documents into (5 values * 10k) chunks @@ -170,7 +168,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis ) self.decl_update_access(indoc.userIds, indoc.source_id, session) - added_sources.append(indoc.source_id) + results[php_db_id] = None session.commit() except SafeDbException as e: # for when the source_id is not found. This here can be an error in the DB @@ -178,51 +176,67 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except RetryableEmbeddingException as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except EmbeddingException as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - raise + results[php_db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error='An unexpected error occurred while adding documents to the database.', + retryable=True, + ) continue - return added_sources, retry_sources + return results @timed - def check_sources(self, sources: 
list[UploadFile]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + ''' + returns a tuple of (existing_source_ids, to_embed_source_ids) + ''' with self.session_maker() as session: try: stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id.in_([source.filename for source in sources])) + .filter(DocumentsStore.source_id.in_([source.reference for source in sources.values()])) .with_for_update() ) results = session.execute(stmt).fetchall() existing_sources = {r.source_id for r in results} - to_embed = [source.filename for source in sources if source.filename not in existing_sources] + to_embed = [source.reference for source in sources.values() if source.reference not in existing_sources] to_delete = [] - for source in sources: + for source in sources.values(): stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id == source.filename) + .filter(DocumentsStore.source_id == source.reference) .filter(DocumentsStore.modified < sa.cast( - datetime.fromtimestamp(int(source.headers['modified'])), + datetime.fromtimestamp(int(source.modified)), sa.DateTime, )) ) @@ -239,14 +253,13 @@ def check_sources(self, sources: list[UploadFile]) -> tuple[list[str], list[str] session.rollback() raise DbException('Error: checking sources in vectordb') from e - still_existing_sources = [ - source - for source in existing_sources - if source not in to_delete + still_existing_source_ids = [ + source_id + for source_id in existing_sources + if source_id not in to_delete ] - # the pyright issue stems from source.filename, which has already been validated - return list(still_existing_sources), to_embed # pyright: ignore[reportReturnType] + return list(still_existing_source_ids), to_embed def decl_update_access(self, user_ids: list[str], source_id: str, session_: orm.Session | None = None): session = session_ or self.session_maker() From 
03a3f433caccdf7121c3171538828c8f6fefa5af Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 9 Mar 2026 19:42:21 +0530 Subject: [PATCH 04/96] wip: parallelize file parsing and processing based on cpu count Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a548bcf..853a68c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -5,6 +5,7 @@ import asyncio import logging +import os from contextlib import suppress from enum import Enum from io import BytesIO @@ -35,6 +36,8 @@ THREADS = {} LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# divides the batch into these many chunks +PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? 
@@ -217,8 +220,18 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro else: source_errors[file_id] = result - files_result = _load_sources(source_files) - providers_result = _load_sources(q_items.content_providers) + files_result = {} + providers_result = {} + chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + + # chunk file parsing for better file operation parallelism + for i in range(0, len(source_files), chunk_size): + chunk = dict(list(source_files.items())[i:i+chunk_size]) + files_result.update(_load_sources(chunk)) + + for i in range(0, len(q_items.content_providers), chunk_size): + chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) + providers_result.update(_load_sources(chunk)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 0dc404bf48cff0e358b723bcb12775956d0c2eac Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:36:03 +0530 Subject: [PATCH 05/96] ci: use the kubernetes branch of context_chat Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 10e2d61..fb06baf 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -113,6 +113,8 @@ jobs: repository: nextcloud/context_chat path: apps/context_chat persist-credentials: false + # todo: remove later + ref: feat/reverse-content-flow - name: Checkout backend uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 From c7339828818ff49e8a2c44aa7896b4b2fdf495fb Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:43:27 +0530 Subject: [PATCH 06/96] fix typo Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py 
index 853a68c..cfa9293 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -304,11 +304,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro -def updates_processing_thread(app_config: TConfig): +def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... -def request_processing_thread(app_config: TConfig): +def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... @@ -317,12 +317,12 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='FilesIndexingThread', ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='UpdatesProcessingThread', ) THREADS[ThreadType.FILES_INDEXING].start() @@ -330,7 +330,7 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='RequestProcessingThread', ) THREADS[ThreadType.REQUEST_PROCESSING].start() From dda312f21f74955d70e6f5f74840a31b26bb3f9d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 11:58:50 +0530 Subject: [PATCH 07/96] migrate the update process to be thread based Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 2 +- context_chat_backend/controller.py | 203 ++++++++++---------- context_chat_backend/task_fetcher.py | 183 +++++++++++++++++- context_chat_backend/types.py | 183 +++++++++++++++++- context_chat_backend/vectordb/pgvector.py | 27 ++- context_chat_backend/vectordb/service.py | 54 +++++- context_chat_backend/vectordb/types.py | 4 +- 7 files changed, 531 insertions(+), 
125 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0eb70e0..7369f45 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -129,7 +129,7 @@ def _increase_access_for_existing_sources( for db_id, source in existing_sources.items(): try: vectordb.update_access( - UpdateAccessOp.allow, + UpdateAccessOp.ALLOW, list(map(_decode_latin_1, source.userIds)), source.reference, ) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3e70ee1..580416f 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -6,7 +6,7 @@ # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult from .types import LoaderException, EmbeddingException -from .vectordb.types import DbException, SafeDbException, UpdateAccessOp +from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars # setup env vars before importing other modules @@ -25,9 +25,9 @@ from functools import wraps from threading import Event, Thread from time import sleep -from typing import Annotated, Any +from typing import Any -from fastapi import Body, FastAPI, Request +from fastapi import FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -40,16 +40,9 @@ from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .utils import JSONResponse, exec_in_proc, value_of from .task_fetcher import start_bg_threads, wait_for_bg_threads -from .vectordb.service import ( - count_documents_by_provider, - decl_update_access, - 
delete_by_provider, - delete_by_source, - delete_user, - update_access, -) +from .vectordb.service import count_documents_by_provider # setup @@ -227,119 +220,131 @@ def _(): return JSONResponse(counts) -@app.post('/updateAccessDeclarative') -@enabled_guard(app) -def _( - userIds: Annotated[list[str], Body()], - sourceId: Annotated[str, Body()], -): - logger.debug('Update access declarative request:', extra={ - 'user_ids': userIds, - 'source_id': sourceId, - }) +@app.get('/downloadLogs') +def download_logs() -> FileResponse: + with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: + with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: + files = os.listdir(os.path.join(persistent_storage(), 'logs')) + for file in files: + file_path = os.path.join(persistent_storage(), 'logs', file) + if os.path.isfile(file_path): # Might be a folder (just skip it then) + zip_file.write(file_path) + return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccessDeclarative') +# @enabled_guard(app) +# def _( +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access declarative request:', extra={ +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) -@app.post('/updateAccess') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: Annotated[list[str], Body()], - sourceId: 
Annotated[str, Body()], -): - logger.debug('Update access request', extra={ - 'op': op, - 'user_ids': userIds, - 'source_id': sourceId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccess') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) -@app.post('/updateAccessProvider') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: Annotated[list[str], Body()], - providerId: Annotated[str, Body()], -): - logger.debug('Update access by provider request', extra={ - 'op': op, - 'user_ids': userIds, - 'provider_id': providerId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_provider_id(providerId): - return JSONResponse('Invalid provider id', 400) +# @app.post('/updateAccessProvider') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# providerId: Annotated[str, Body()], +# ): +# logger.debug('Update access by provider request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'provider_id': providerId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, providerId)) +# if len(userIds) 
== 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_provider_id(providerId): +# return JSONResponse('Invalid provider id', 400) +# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) -@app.post('/deleteSources') -@enabled_guard(app) -def _(sourceIds: Annotated[list[str], Body(embed=True)]): - logger.debug('Delete sources request', extra={ - 'source_ids': sourceIds, - }) +# return JSONResponse('Access updated') - sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - if len(sourceIds) == 0: - return JSONResponse('No sources provided', 400) +# @app.post('/deleteSources') +# @enabled_guard(app) +# def _(sourceIds: Annotated[list[str], Body(embed=True)]): +# logger.debug('Delete sources request', extra={ +# 'source_ids': sourceIds, +# }) - res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) - if res is False: - return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) +# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - return JSONResponse('All valid sources deleted') +# if len(sourceIds) == 0: +# return JSONResponse('No sources provided', 400) +# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) +# if res is False: +# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) -@app.post('/deleteProvider') -@enabled_guard(app) -def _(providerKey: str = Body(embed=True)): - logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) +# return JSONResponse('All valid sources deleted') - if value_of(providerKey) is None: - return JSONResponse('Invalid provider key provided', 400) - exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) +# @app.post('/deleteProvider') +# @enabled_guard(app) +# def _(providerKey: str = 
Body(embed=True)): +# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - return JSONResponse('All valid sources deleted') +# if value_of(providerKey) is None: +# return JSONResponse('Invalid provider key provided', 400) +# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) -@app.post('/deleteUser') -@enabled_guard(app) -def _(userId: str = Body(embed=True)): - logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) +# return JSONResponse('All valid sources deleted') - if value_of(userId) is None: - return JSONResponse('Invalid userId provided', 400) - exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) +# @app.post('/deleteUser') +# @enabled_guard(app) +# def _(userId: str = Body(embed=True)): +# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) + +# if value_of(userId) is None: +# return JSONResponse('Invalid userId provided', 400) - return JSONResponse('User deleted') +# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) + +# return JSONResponse('User deleted') # @app.put('/loadSources') @@ -503,15 +508,3 @@ def _(query: Query) -> list[SearchResult]: query.scopeType, query.scopeList, )) - - -@app.get('/downloadLogs') -def download_logs() -> FileResponse: - with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: - with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: - files = os.listdir(os.path.join(persistent_storage(), 'logs')) - for file in files: - file_path = os.path.join(persistent_storage(), 'logs', file) - if os.path.isfile(file_path): # Might be a folder (just skip it then) - zip_file.write(file_path) - return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index cfa9293..84b974b 100644 --- 
a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -19,9 +19,11 @@ from .chain.ingest.injest import embed_sources from .dyn_loader import VectorDBLoader from .types import ( + ActionsQueueItems, + ActionType, AppRole, EmbeddingException, - FilesQueueItem, + FilesQueueItems, IndexingError, IndexingException, LoaderException, @@ -30,7 +32,15 @@ TConfig, ) from .utils import exec_in_proc, get_app_role -from .vectordb.types import DbException +from .vectordb.service import ( + decl_update_access, + delete_by_provider, + delete_by_source, + delete_user, + update_access, + update_access_provider, +) +from .vectordb.types import DbException, SafeDbException APP_ROLE = get_app_role() THREADS = {} @@ -41,6 +51,8 @@ # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? +ACTIONS_BATCH_SIZE = 512 # todo: config? +POLLING_COOLDOWN = 30 class ThreadType(Enum): @@ -201,10 +213,15 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ) try: - q_items = FilesQueueItem.model_validate(q_items_res) + q_items: FilesQueueItems = FilesQueueItems.model_validate(q_items_res) except ValidationError as e: raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + if not q_items.files and not q_items.content_providers: + LOGGER.debug('No documents to index') + sleep(POLLING_COOLDOWN) + continue + # populate files content and convert to source items fetched_files = {} source_files = {} @@ -305,7 +322,165 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... 
+ try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/actions', + params={ 'n': ACTIONS_BATCH_SIZE } + ) + + try: + q_items: ActionsQueueItems = ActionsQueueItems.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching updates to process, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching updates to process:', exc_info=e) + sleep(5) + continue + + if not q_items.actions: + LOGGER.debug('No updates to process') + sleep(POLLING_COOLDOWN) + continue + + processed_event_ids = [] + errored_events = {} + for i, (db_id, action_item) in enumerate(q_items.actions.items()): + try: + match action_item.type: + case ActionType.DELETE_SOURCE_IDS: + exec_in_proc(target=delete_by_source, args=(vectordb_loader, action_item.payload.sourceIds)) + + case ActionType.DELETE_PROVIDER_ID: + exec_in_proc(target=delete_by_provider, args=(vectordb_loader, action_item.payload.providerId)) + + case ActionType.DELETE_USER_ID: + exec_in_proc(target=delete_user, args=(vectordb_loader, action_item.payload.userId)) + + case ActionType.UPDATE_ACCESS_SOURCE_ID: + exec_in_proc( + target=update_access, + args=( + vectordb_loader, + action_item.payload.op, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case ActionType.UPDATE_ACCESS_PROVIDER_ID: + exec_in_proc( + target=update_access_provider, + args=( + vectordb_loader, + 
action_item.payload.op, + action_item.payload.userIds, + action_item.payload.providerId, + ), + ) + + case ActionType.UPDATE_ACCESS_DECL_SOURCE_ID: + exec_in_proc( + target=decl_update_access, + args=( + vectordb_loader, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case _: + LOGGER.warning( + f'Unknown action type {action_item.type} for action id {db_id},' + f' type {action_item.type}, skipping and marking as processed', + extra={ 'action_item': action_item }, + ) + continue + + processed_event_ids.append(db_id) + except SafeDbException as e: + LOGGER.debug( + f'Safe DB error thrown while processing action id {db_id}, type {action_item.type},' + " it's safe to ignore and mark as processed.", + exc_info=e, + extra={ 'action_item': action_item }, + ) + processed_event_ids.append(db_id) + continue + + except (LoaderException, DbException) as e: + LOGGER.error( + f'Error deleting source for action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = str(e) + continue + + except Exception as e: + LOGGER.error( + f'Unexpected error processing action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = f'Unexpected error: {e}' + continue + + if (i + 1) % 20 == 0: + LOGGER.debug(f'Processed {i + 1} updates, sleeping for a bit to allow other operations to proceed') + sleep(2) + + LOGGER.info(f'Processed {len(processed_event_ids)} updates with {len(errored_events)} errors', extra={ + 'errored_events': errored_events, + }) + + if len(processed_event_ids) == 0: + LOGGER.debug('No updates processed, skipping reporting to the server') + continue + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'actions': processed_event_ids }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting processed 
updates, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'ids': processed_event_ids }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting processed updates:', exc_info=e) + sleep(5) + continue def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 97d48ce..849c2e3 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -4,12 +4,13 @@ # from enum import Enum from io import BytesIO -from typing import Self +from typing import Annotated, Literal, Self -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, Discriminator, field_validator from .mimetype_list import SUPPORTED_MIMETYPES from .utils import is_valid_provider_id, is_valid_source_id +from .vectordb.types import UpdateAccessOp __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -182,7 +183,7 @@ def validate_content(cls, v): raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') -class FilesQueueItem(BaseModel): +class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem content_providers: dict[int, SourceItem] # [db id]: SourceItem @@ -198,3 +199,179 @@ def __init__(self, message: str, retryable: bool = False): class IndexingError(BaseModel): error: str retryable: bool = False + + +# PHP equivalent for reference: + +# class ActionType { +# // { sourceIds: array } +# public const DELETE_SOURCE_IDS = 'delete_source_ids'; +# // { providerId: string } +# public const DELETE_PROVIDER_ID = 'delete_provider_id'; +# // { userId: string } +# public const DELETE_USER_ID = 'delete_user_id'; +# // { op: string, userIds: array, sourceId: string } +# public const UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id'; +# // { op: string, userIds: array, providerId: string } +# public const 
UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id'; +# // { userIds: array, sourceId: string } +# public const UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id'; +# } + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +class ActionPayloadDeleteSourceIds(BaseModel): + sourceIds: list[str] + + @field_validator('sourceIds', mode='after') + def validate_source_ids(self) -> Self: + self.sourceIds = _validate_source_ids(self.sourceIds) + return self + + +class ActionPayloadDeleteProviderId(BaseModel): + providerId: str + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadDeleteUserId(BaseModel): + userId: str + + @field_validator('userId') + def validate_user_id(self) -> Self: + self.userId = _validate_user_ids([self.userId])[0] + return self + + +class ActionPayloadUpdateAccessSourceId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + 
+ @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionPayloadUpdateAccessProviderId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + providerId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadUpdateAccessDeclSourceId(BaseModel): + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionType(str, Enum): + DELETE_SOURCE_IDS = 'delete_source_ids' + DELETE_PROVIDER_ID = 'delete_provider_id' + DELETE_USER_ID = 'delete_user_id' + UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id' + UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id' + UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id' + + +class CommonActionsQueueItem(BaseModel): + id: int + + +class ActionsQueueItemDeleteSourceIds(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_SOURCE_IDS] + payload: ActionPayloadDeleteSourceIds + + +class ActionsQueueItemDeleteProviderId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_PROVIDER_ID] + payload: ActionPayloadDeleteProviderId + + +class ActionsQueueItemDeleteUserId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_USER_ID] + payload: ActionPayloadDeleteUserId + + +class ActionsQueueItemUpdateAccessSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_SOURCE_ID] + payload: ActionPayloadUpdateAccessSourceId + + +class 
ActionsQueueItemUpdateAccessProviderId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_PROVIDER_ID] + payload: ActionPayloadUpdateAccessProviderId + + +class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_DECL_SOURCE_ID] + payload: ActionPayloadUpdateAccessDeclSourceId + + +ActionsQueueItem = Annotated[ + ActionsQueueItemDeleteSourceIds + | ActionsQueueItemDeleteProviderId + | ActionsQueueItemDeleteUserId + | ActionsQueueItemUpdateAccessSourceId + | ActionsQueueItemUpdateAccessProviderId + | ActionsQueueItemUpdateAccessDeclSourceId, + Discriminator('type'), +] + + +class ActionsQueueItems(BaseModel): + actions: dict[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index f5879fe..8bcc6f4 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -338,7 +338,7 @@ def update_access( ) match op: - case UpdateAccessOp.allow: + case UpdateAccessOp.ALLOW: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -355,7 +355,7 @@ def update_access( session.execute(stmt) session.commit() - case UpdateAccessOp.deny: + case UpdateAccessOp.DENY: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -448,15 +448,17 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None # entry from "AccessListStore" is deleted automatically due to the foreign key constraint # batch the deletion to avoid hitting the query parameter limit chunks_to_delete = [] + deleted_source_ids = [] for i in range(0, len(source_ids), PG_BATCH_SIZE): batched_ids = source_ids[i:i+PG_BATCH_SIZE] stmt_doc = ( sa.delete(DocumentsStore) .filter(DocumentsStore.source_id.in_(batched_ids)) - .returning(DocumentsStore.chunks) + .returning(DocumentsStore.chunks, DocumentsStore.source_id) ) doc_result = 
session.execute(stmt_doc) chunks_to_delete.extend(str(c) for res in doc_result for c in res.chunks) + deleted_source_ids.extend(str(res.source_id) for res in doc_result) for i in range(0, len(chunks_to_delete), PG_BATCH_SIZE): batched_chunks = chunks_to_delete[i:i+PG_BATCH_SIZE] @@ -476,6 +478,14 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None if session_ is None: session.close() + undeleted_source_ids = set(source_ids) - set(deleted_source_ids) + if len(undeleted_source_ids) > 0: + logger.info( + f'Source ids {undeleted_source_ids} were not deleted from documents store.' + ' This can be due to the source ids not existing in the documents store due to' + ' already being deleted or not having been added yet.' + ) + def delete_provider(self, provider_key: str): with self.session_maker() as session: try: @@ -519,7 +529,16 @@ def delete_user(self, user_id: str): session.rollback() raise DbException('Error: deleting user from access list') from e - self._cleanup_if_orphaned(list(source_ids), session) + try: + self._cleanup_if_orphaned(list(source_ids), session) + except Exception as e: + session.rollback() + logger.error( + 'Error cleaning up orphaned source ids after deleting user, manual cleanup might be required', + exc_info=e, + extra={ 'source_ids': list(source_ids) }, + ) + raise DbException('Error: cleaning up orphaned source ids after deleting user') from e def count_documents_by_provider(self) -> dict[str, int]: try: diff --git a/context_chat_backend/vectordb/service.py b/context_chat_backend/vectordb/service.py index 620a0b3..06a8e19 100644 --- a/context_chat_backend/vectordb/service.py +++ b/context_chat_backend/vectordb/service.py @@ -6,27 +6,42 @@ from ..dyn_loader import VectorDBLoader from .base import BaseVectorDB -from .types import DbException, UpdateAccessOp +from .types import UpdateAccessOp logger = logging.getLogger('ccb.vectordb') -# todo: return source ids that were successfully deleted + def 
delete_by_source(vectordb_loader: VectorDBLoader, source_ids: list[str]): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('deleting sources by id', extra={ 'source_ids': source_ids }) - try: - db.delete_source_ids(source_ids) - except Exception as e: - raise DbException('Error: Vectordb delete_source_ids error') from e + db.delete_source_ids(source_ids) def delete_by_provider(vectordb_loader: VectorDBLoader, provider_key: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting sources by provider: {provider_key}') db.delete_provider(provider_key) def delete_user(vectordb_loader: VectorDBLoader, user_id: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting user from db: {user_id}') db.delete_user(user_id) @@ -38,6 +53,13 @@ def update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access', extra={ 'op': op, 'user_ids': user_ids, 'source_id': source_id }) db.update_access(op, user_ids, source_id) @@ -49,6 +71,13 @@ def update_access_provider( user_ids: list[str], provider_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access by provider', extra={ 'op': op, 'user_ids': user_ids, 'provider_id': provider_id }) db.update_access_provider(op, user_ids, provider_id) @@ -59,11 +88,24 @@ def decl_update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('decl update access', extra={ 'user_ids': user_ids, 'source_id': source_id }) db.decl_update_access(user_ids, source_id) 
def count_documents_by_provider(vectordb_loader: VectorDBLoader): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('counting documents by provider') return db.count_documents_by_provider() diff --git a/context_chat_backend/vectordb/types.py b/context_chat_backend/vectordb/types.py index df5c6dd..3081179 100644 --- a/context_chat_backend/vectordb/types.py +++ b/context_chat_backend/vectordb/types.py @@ -14,5 +14,5 @@ class SafeDbException(Exception): class UpdateAccessOp(Enum): - allow = 'allow' - deny = 'deny' + ALLOW = 'allow' + DENY = 'deny' From b09a93cafda6726b706f11c8e7815b4a91acfc43 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:33:39 +0530 Subject: [PATCH 08/96] fix pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/types.py | 180 ++++++++++++---------------------- context_chat_backend/utils.py | 10 -- 2 files changed, 64 insertions(+), 126 deletions(-) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 849c2e3..8577c93 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,14 +2,14 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import re from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import BaseModel, Discriminator, field_validator +from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES -from .utils import is_valid_provider_id, is_valid_source_id from .vectordb.types import UpdateAccessOp __all__ = [ @@ -26,6 +26,49 @@ FILES_PROVIDER_ID = 'files__default' +def is_valid_source_id(source_id: str) -> bool: + # note the ":" in the item id part + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None + + +def is_valid_provider_id(provider_id: 
str) -> bool: + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_source_id(source_id: str) -> str: + return _validate_source_ids([source_id])[0] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +def _validate_user_id(user_id: str) -> str: + return _validate_user_ids([user_id])[0] + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -89,12 +132,13 @@ class AppRole(str, Enum): class CommonSourceItem(BaseModel): - userIds: list[str] - reference: str # source_id of the form "appId__providerId: itemId" + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + # source_id of the form "appId__providerId: itemId" + reference: Annotated[str, AfterValidator(_validate_source_id)] title: str modified: int | str # todo: int/string? 
type: str - provider: str + provider: Annotated[str, AfterValidator(_validate_provider_id)] size: int @field_validator('modified', mode='before') @@ -116,42 +160,13 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - if ( - not isinstance(self.userIds, list) - or not all( - isinstance(uid, str) - and uid.strip() != '' - for uid in self.userIds - ) - or len(self.userIds) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - self.userIds = [uid.strip() for uid in self.userIds] - return self - - @field_validator('reference', mode='after') - def validate_reference_format(self) -> Self: - # validate reference format: "appId__providerId: itemId" - if not is_valid_source_id(self.reference): - raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') - return self - - @field_validator('provider', mode='after') - def validate_provider_format(self) -> Self: - # validate provider format: "appId__providerId" - if not is_valid_provider_id(self.provider): - raise ValueError('Invalid provider format, must be "appId__providerId"') - return self - - @field_validator('type', mode='after') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @field_validator('size', mode='after') + @model_validator(mode='after') def validate_size(self) -> Self: if not isinstance(self.size, int) or self.size < 0: raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') @@ -182,6 +197,10 @@ def validate_content(cls, v): return v raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + class Config: + # to allow BytesIO in content field + 
arbitrary_types_allowed = True + class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem @@ -219,104 +238,33 @@ class IndexingError(BaseModel): # } -def _validate_source_ids(source_ids: list[str]) -> list[str]: - if ( - not isinstance(source_ids, list) - or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) - or len(source_ids) == 0 - ): - raise ValueError('sourceIds must be a non-empty list of non-empty strings') - return [sid.strip() for sid in source_ids] - - -def _validate_provider_id(provider_id: str) -> str: - if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): - raise ValueError('providerId must be a valid provider ID string') - return provider_id - - -def _validate_user_ids(user_ids: list[str]) -> list[str]: - if ( - not isinstance(user_ids, list) - or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) - or len(user_ids) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - return [uid.strip() for uid in user_ids] - - class ActionPayloadDeleteSourceIds(BaseModel): - sourceIds: list[str] - - @field_validator('sourceIds', mode='after') - def validate_source_ids(self) -> Self: - self.sourceIds = _validate_source_ids(self.sourceIds) - return self + sourceIds: Annotated[list[str], AfterValidator(_validate_source_ids)] class ActionPayloadDeleteProviderId(BaseModel): - providerId: str - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadDeleteUserId(BaseModel): - userId: str - - @field_validator('userId') - def validate_user_id(self) -> Self: - self.userId = _validate_user_ids([self.userId])[0] - return self + userId: Annotated[str, AfterValidator(_validate_user_id)] class ActionPayloadUpdateAccessSourceId(BaseModel): op: UpdateAccessOp - userIds: 
list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionPayloadUpdateAccessProviderId(BaseModel): op: UpdateAccessOp - userIds: list[str] - providerId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadUpdateAccessDeclSourceId(BaseModel): - userIds: list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionType(str, Enum): diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 224f466..c7e588b 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -5,7 +5,6 @@ import logging import multiprocessing as mp import os -import re import traceback from collections.abc import Callable from functools import partial, wraps @@ -102,15 +101,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 
return result['value'] -def is_valid_source_id(source_id: str) -> bool: - # note the ":" in the item id part - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None - - -def is_valid_provider_id(provider_id: str) -> bool: - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None - - def timed(func: Callable): ''' Decorator to time a function From 11b436c8ce43778dbf6beda8a7e3978626e7aee5 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:34:55 +0530 Subject: [PATCH 09/96] fix: use a dedicated event to allow app halt without app being disabled Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 1 + context_chat_backend/task_fetcher.py | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 580416f..55206ca 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -88,6 +88,7 @@ async def lifespan(app: FastAPI): nc = NextcloudApp() if nc.enabled_state: app_enabled.set() + start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') t = Thread(target=background_thread_task, args=()) t.start() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 84b974b..e93eac3 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -44,6 +44,7 @@ APP_ROLE = get_app_role() THREADS = {} +THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? 
# divides the batch into these many chunks @@ -199,8 +200,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Files indexing thread is stopping due to stop event being set') return try: @@ -329,8 +330,8 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: return while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') return try: @@ -490,6 +491,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, args=(app_config, app_enabled), @@ -502,7 +511,13 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() + case AppRole.RP | AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, args=(app_config, app_enabled), @@ -516,12 +531,17 @@ def wait_for_bg_threads(): case AppRole.INDEXING | AppRole.NORMAL: if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() 
THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) + case AppRole.RP | AppRole.NORMAL: if (ThreadType.REQUEST_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) From c88e15364d53764257f7fddaca76505cf27c80d9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 17:54:48 +0530 Subject: [PATCH 10/96] fix fetch url and pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 14 +++++++------- context_chat_backend/types.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index e93eac3..5784d12 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -80,7 +80,7 @@ async def __fetch_file_content( # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( - url_path=f'/apps/context_chat/files/{file_id}', + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', fp=fp, dav=False, params={ 'userId': user_id }, @@ -209,7 +209,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro # todo: add the 'size' param to the return of this call. 
q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/documents', + '/ocs/v2.php/apps/context_chat/queues/documents', params={ 'n': FILES_INDEXING_BATCH_SIZE } ) @@ -292,7 +292,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -308,7 +308,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -338,7 +338,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/actions', + '/ocs/v2.php/apps/context_chat/queues/actions', params={ 'n': ACTIONS_BATCH_SIZE } ) @@ -461,7 +461,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'actions': processed_event_ids }, ) except ( @@ -474,7 +474,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'ids': processed_event_ids }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 8577c93..972756f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -136,10 +136,10 @@ class CommonSourceItem(BaseModel): # source_id of the form "appId__providerId: itemId" reference: Annotated[str, AfterValidator(_validate_source_id)] title: str - modified: int | str # todo: 
int/string? + modified: int type: str provider: Annotated[str, AfterValidator(_validate_provider_id)] - size: int + size: float @field_validator('modified', mode='before') @classmethod @@ -160,18 +160,19 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() + @field_validator('size') + @classmethod + def validate_size(cls, v): + if isinstance(v, int | float) and v >= 0: + return float(v) + raise ValueError(f'Invalid size value: {v}, must be a non-negative number') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @model_validator(mode='after') - def validate_size(self) -> Self: - if not isinstance(self.size, int) or self.size < 0: - raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') - return self - class ReceivedFileItem(CommonSourceItem): content: None From cd5241e199a2ae2316d4f8f3841aa27bb7c12842 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 18:52:35 +0530 Subject: [PATCH 11/96] fix: use the correct file id Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 9 ++-- context_chat_backend/task_fetcher.py | 79 +++++++++++++++++----------- context_chat_backend/types.py | 22 +++++++- 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 55206ca..797ba20 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from time import sleep from typing import Any from fastapi import FastAPI, Request @@ -130,9 +129,11 @@ async def lifespan(app: FastAPI): # logger background thread def background_thread_task(): - 
while(True): - logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - sleep(10) + # todo + # while(True): + # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) + # sleep(10) + ... # exception handlers diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5784d12..0442cd5 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -125,15 +125,29 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] - for file_id, file_item in files.items(): - if file_item.size > MAX_FILE_SIZE: + for db_id, file in files.items(): + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + LOGGER.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + source_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: LOGGER.info( - f'Skipping file id {file_id}, source id {file_item.reference} due to size' - f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=( - f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f'File size {(file.size/(1024*1024)):.2f} MiB' f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' ), retryable=False, @@ -141,39 +155,44 @@ async def __fetch_files_content( continue # todo: perform the existing file check before fetching the 
content to avoid unnecessary fetches # any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) results = await asyncio.gather(*tasks, return_exceptions=True) - for (file_id, file_item), result in zip(files.items(), results, strict=True): + for (db_id, file), result in zip(files.items(), results, strict=True): if isinstance(result, IndexingException): LOGGER.error( - f'Error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=str(result), retryable=result.retryable, ) elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[file_id] = SourceItem( - **file_item.model_dump(), - content=result, + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } ) elif isinstance(result, BaseException): LOGGER.error( - f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=f'Unexpected error: {result}', retryable=True, ) else: LOGGER.error( - f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=True, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error='Unknown error', retryable=True, ) @@ 
-232,11 +251,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro if q_items.files: fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - for file_id, result in fetched_files.items(): + for db_id, result in fetched_files.items(): if isinstance(result, SourceItem): - source_files[file_id] = result + source_files[db_id] = result else: - source_errors[file_id] = result + source_errors[db_id] = result files_result = {} providers_result = {} @@ -257,8 +276,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ): LOGGER.error('Some sources failed to index', extra={ 'file_errors': { - file_id: error - for file_id, error in files_result.items() + db_id: error + for db_id, error in files_result.items() if isinstance(error, IndexingError) }, 'provider_errors': { @@ -280,12 +299,12 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro continue # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable - to_delete_file_ids = [ - file_id for file_id, result in files_result.items() + to_delete_files_db_ids = [ + db_id for db_id, result in files_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] - to_delete_provider_ids = [ - provider_id for provider_id, result in providers_result.items() + to_delete_provider_db_ids = [ + db_id for db_id, result in providers_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] @@ -294,8 +313,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) except ( @@ -310,8 +329,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, 
IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 972756f..9f23e14 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -7,7 +7,7 @@ from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator +from pydantic import AfterValidator, BaseModel, Discriminator, computed_field, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES from .vectordb.types import UpdateAccessOp @@ -69,6 +69,21 @@ def _validate_user_id(user_id: str) -> str: return _validate_user_ids([user_id])[0] +def _get_file_id_from_source_ref(source_ref: str) -> int: + ''' + source reference is in the format "FILES_PROVIDER_ID: ". 
+ ''' + if not source_ref.startswith(f'{FILES_PROVIDER_ID}: '): + raise ValueError(f'Source reference does not start with expected prefix: {source_ref}') + + try: + return int(source_ref[len(f'{FILES_PROVIDER_ID}: '):]) + except ValueError as e: + raise ValueError( + f'Invalid source reference format for extracting file_id: {source_ref}' + ) from e + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -177,6 +192,11 @@ def validate_type(self) -> Self: class ReceivedFileItem(CommonSourceItem): content: None + @computed_field + @property + def file_id(self) -> int: + return _get_file_id_from_source_ref(self.reference) + class SourceItem(CommonSourceItem): ''' From 4958d1d980b0d0741762ffc9c3eac3ff91e5c2b0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:24:51 +0530 Subject: [PATCH 12/96] fix: wip: improve embeddings exception handling Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 13 +++++++++---- context_chat_backend/task_fetcher.py | 1 + context_chat_backend/vectordb/pgvector.py | 17 ++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 18bb11f..d39ea56 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -79,6 +79,7 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] raise FatalEmbeddingException(response.text) if response.status_code // 100 != 2: raise EmbeddingException(response.text) + # todo: rework exception handling and their downstream interpretation except FatalEmbeddingException as e: logger.error('Fatal error while getting embeddings: %s', str(e), exc_info=e) raise e @@ -108,10 +109,14 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] logger.error('Unexpected error while getting embeddings', exc_info=e) raise EmbeddingException('Error: unexpected error while getting embeddings') from e - # converts 
TypedDict to a pydantic model - resp = CreateEmbeddingResponse(**response.json()) - if isinstance(input_, str): - return resp['data'][0]['embedding'] + try: + # converts TypedDict to a pydantic model + resp = CreateEmbeddingResponse(**response.json()) + if isinstance(input_, str): + return resp['data'][0]['embedding'] + except Exception as e: + logger.error('Error parsing embedding response', exc_info=e) + raise EmbeddingException('Error: failed to parse embedding response') from e # only one embedding in d['embedding'] since truncate is True return [d['embedding'] for d in resp['data']] # pyright: ignore[reportReturnType] diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 0442cd5..51f98e7 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -261,6 +261,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro providers_result = {} chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism for i in range(0, len(source_files), chunk_size): chunk = dict(list(source_files.items())[i:i+chunk_size]) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 8bcc6f4..bfca0bb 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -17,7 +17,7 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -181,7 +181,11 @@ def add_indocuments(self, indocuments: 
dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except RetryableEmbeddingException as e: + except FatalEmbeddingException as e: + raise EmbeddingException( + f'Fatal error while embedding documents for source {indoc.source_id}: {e}' + ) from e + except (RetryableEmbeddingException, EmbeddingException) as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, @@ -191,15 +195,6 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except EmbeddingException as e: - logger.exception('Error adding documents to vectordb', exc_info=e, extra={ - 'source_id': indoc.source_id, - }) - results[php_db_id] = IndexingError( - error=str(e), - retryable=False, - ) - continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, From a04912120965d8ff9a285eac559794b716a595ce Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:44:06 +0530 Subject: [PATCH 13/96] fix(ci): update to the latest changes Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 104 ++++++++++++++++++------- 1 file changed, 76 insertions(+), 28 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index fb06baf..9563bcd 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -199,26 +199,87 @@ jobs: ls -la context_chat_backend/persistent_storage/* sleep 30 # Wait for the em server to get ready - - name: Scan files, baseline - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/plain - - - name: Check python memory usage + - name: Initial memory usage check run: | ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt - - 
name: Scan files - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/markdown & - ./occ context_chat:scan admin -m text/x-rst - - - name: Check python memory usage + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | - ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem - ps -p $(cat pid.txt) -o %mem --no-headers > after_scan_mem.txt + success=0 + for i in {1..90}; do + echo "Checking stats, attempt $i..." + + mkfifo error_pipe + stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + echo "Stats output:" + echo "$stats" + echo "---" + + # Check for critical errors in output + if echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected, retrying..." + rm -f error_pipe + sleep 10 + continue + fi + + # Extract Total eligible files + total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + + # Extract Indexed documents count (files__default) + indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") + + # Validate parsed values + if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then + echo "Error: Could not parse stats output properly" + if echo "$stats" | grep -q "Indexed documents:"; then + echo " Indexed documents section found but could not extract count" + fi + rm -f error_pipe + sleep 10 + continue + fi + + echo "Total eligible files: $total_files" + echo "Indexed documents (files__default): $indexed_count" + + # Calculate absolute difference + diff=$((total_files - indexed_count)) + if [ $diff -lt 0 ]; then + diff=$((-diff)) + fi + + # Calculate 2% threshold using bc for floating point support + threshold=$(echo "scale=4; $total_files * 0.02" | bc) + + # Check if difference is within tolerance + if (( $(echo "$diff <= $threshold" | bc -l) )); then + echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + rm -f error_pipe + success=1 + break + else + pct=$(echo 
"scale=2; ($diff / $total_files) * 100" | bc) + echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + fi + + # Check if backend is still alive + ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") + if [ "$ccb_alive" -eq 0 ]; then + echo "Error: Context Chat Backend process is not running. Exiting." + rm -f error_pipe + exit 1 + fi + + rm -f error_pipe + sleep 10 + done + + if [ $success -ne 1 ]; then + echo "Max attempts reached" + exit 1 + fi - name: Run the prompts run: | @@ -252,19 +313,6 @@ jobs: echo "Memory usage during scan is stable. No memory leak detected." fi - - name: Compare memory usage and detect leak - run: | - initial_mem=$(cat after_scan_mem.txt | tr -d ' ') - final_mem=$(cat after_prompt_mem.txt | tr -d ' ') - echo "Initial Memory Usage: $initial_mem%" - echo "Memory Usage after prompt: $final_mem%" - - if (( $(echo "$final_mem > $initial_mem" | bc -l) )); then - echo "Memory usage has increased during prompt. Possible memory leak detected!" - else - echo "Memory usage during prompt is stable. No memory leak detected." - fi - - name: Show server logs if: always() run: | From 795380c7c62ce5f60f80aa16ffa1e7568133f03e Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 16:10:58 +0530 Subject: [PATCH 14/96] fix(ci): use file to store stderr Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9563bcd..de0f465 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -210,16 +210,21 @@ jobs: for i in {1..90}; do echo "Checking stats, attempt $i..." - mkfifo error_pipe - stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + stats_err=$(mktemp) + stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats_exit=$? 
echo "Stats output:" echo "$stats" + if [ -s "$stats_err" ]; then + echo "Stderr:" + cat "$stats_err" + fi echo "---" + rm -f "$stats_err" # Check for critical errors in output - if echo "$stats" | grep -q "Error during request"; then - echo "Backend connection error detected, retrying..." - rm -f error_pipe + if [ $stats_exit -ne 0 ] || echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected (exit=$stats_exit), retrying..." sleep 10 continue fi @@ -236,7 +241,6 @@ jobs: if echo "$stats" | grep -q "Indexed documents:"; then echo " Indexed documents section found but could not extract count" fi - rm -f error_pipe sleep 10 continue fi @@ -256,7 +260,6 @@ jobs: # Check if difference is within tolerance if (( $(echo "$diff <= $threshold" | bc -l) )); then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" - rm -f error_pipe success=1 break else @@ -268,11 +271,9 @@ jobs: ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") if [ "$ccb_alive" -eq 0 ]; then echo "Error: Context Chat Backend process is not running. Exiting." 
- rm -f error_pipe exit 1 fi - rm -f error_pipe sleep 10 done From 7bc0ed7c3c535f930f03cc38c4dd884b5370696c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:17:38 +0530 Subject: [PATCH 15/96] fix(ci): add cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index de0f465..0d8e422 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -204,9 +204,18 @@ jobs: ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt + - name: Run cron jobs + run: | + # every 10 seconds indefinitely + while true; do + php cron.php + sleep 10 + done & + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" for i in {1..90}; do echo "Checking stats, attempt $i..." @@ -277,6 +286,10 @@ jobs: sleep 10 done + echo "::endgroup::" + + ./occ context_chat:stats + if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 From d94c687e057a7049e6b0f1f32b580f326692acd3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:35:47 +0530 Subject: [PATCH 16/96] fix(ci): do a occ files scan before cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 0d8e422..58f9f50 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -169,6 +169,10 @@ jobs: cd .. 
rm -rf documentation + - name: Run files scan + run: | + ./occ files:scan --all + - name: Setup python 3.11 uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5 with: From dadc8fa7d193f40ddacffecf6266d8a2b37a6817 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 16 Mar 2026 20:09:30 +0530 Subject: [PATCH 17/96] feat: record indexing errors in content decode function Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 44 +++++++++---------- context_chat_backend/chain/ingest/injest.py | 20 ++++++--- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index d26f74b..832c833 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import logging import re import tempfile from collections.abc import Callable @@ -18,9 +17,8 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf -from ...types import SourceItem +from ...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -75,10 +73,10 @@ def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: - return None + raise IndexingException(f'Unsupported email format: {ext}') # TODO: implement attachment partitioner using unstructured.partition.partition_{email,msg} # since langchain does not pass through the attachment_partitioner kwarg @@ -116,34 +114,36 @@ def attachment_partitioner( } -def decode_source(source: SourceItem) -> 
str | None: +def decode_source(source: SourceItem) -> str: + ''' + Raises + ------ + IndexingException + ''' + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors if source.title.endswith('.pot'): - return None - - mimetype = source.type - if mimetype is None: - return None + raise IndexingException('PowerPoint template files (.pot) are not supported') if isinstance(source.content, str): io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) else: io_obj = source.content - if _loader_map.get(mimetype): - result = _loader_map[mimetype](io_obj) - return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - - return io_obj.read().decode('utf-8', 'ignore') - except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') - return None - except Exception: - logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) - return None + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) + return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + + return io_obj.read().decode('utf-8', 'ignore').strip() + except IndexingException: + raise + except PdfFileNotDecryptedError as e: + raise IndexingException('PDF file is encrypted and cannot be read') from e + except Exception as e: + raise IndexingException(f'Error decoding source file: {e}') from e finally: if io_obj is not None: io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7369f45..d9ea543 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -8,7 +8,7 @@ from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, SourceItem, TConfig +from ...types import IndexingError, IndexingException, SourceItem, TConfig from 
...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -59,9 +59,17 @@ def _sources_to_indocuments( # todo: maybe fetch the content of the files here # transform the source to have text data - content = decode_source(source) + try: + content = decode_source(source) + except IndexingException as e: + logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) + errored_docs[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue - if content is None or (content := content.strip()) == '': + if content == '': logger.debug('decoded empty source', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( error='Decoded content is empty', @@ -74,12 +82,12 @@ def _sources_to_indocuments( # NOTE: do not use this with all docs when programming files are added content = re.sub(r'(\s){5,}', r'\g<1>', content) # filter out null bytes - content = content.replace('\0', '') + content = content.replace('\0', '').strip() - if content is None or content == '': + if content == '': logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( - error='Decoded content is empty', + error='Cleaned up content is empty', retryable=False, ) continue From f9d86dcf1ddac21e61edcc3698b79e0a69475a24 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 17 Mar 2026 20:27:10 +0530 Subject: [PATCH 18/96] chore: move file fetch inside injest Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 197 ++++++++++++++++++-- context_chat_backend/task_fetcher.py | 173 +---------------- context_chat_backend/types.py | 7 +- context_chat_backend/vectordb/base.py | 11 +- context_chat_backend/vectordb/pgvector.py | 14 +- 5 files changed, 208 insertions(+), 194 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py 
index d9ea543..18a37b4 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -2,13 +2,18 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio import logging import re +from collections.abc import Mapping +from io import BytesIO +import niquests from langchain.schema import Document +from nc_py_api import AsyncNextcloudApp from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, IndexingException, SourceItem, TConfig +from ...types import IndexingError, IndexingException, ReceivedFileItem, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -17,15 +22,165 @@ logger = logging.getLogger('ccb.injest') +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, all loaded in RAM at once, todo: config? + + +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + logger.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + logger.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem], Mapping[int, IndexingError]]: + source_items = {} + error_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for db_id, file in sources.items(): + if isinstance(file, SourceItem): + continue + + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + logger.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + error_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: + logger.info( + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + error_items[db_id] = IndexingError( + error=( + f'File size {(file.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + 
continue + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + + results = await asyncio.gather(*tasks, return_exceptions=True) + for (db_id, file), result in zip(sources.items(), results, strict=True): + if isinstance(file, SourceItem): + continue + + if isinstance(result, IndexingException): + logger.error( + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, BaseException): + logger.error( + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + logger.error( + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=True, + ) + error_items[db_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + + # add the content providers from the orginal "sources" to the result unprocessed + for db_id, source in sources.items(): + if isinstance(source, SourceItem): + source_items[db_id] = source + + return source_items, error_items + def _filter_sources( vectordb: BaseVectorDB, - sources: dict[int, SourceItem] -) -> tuple[dict[int, SourceItem], dict[int, SourceItem]]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem | ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: ''' Returns ------- - tuple[list[str], list[UploadFile]] + tuple[Mapping[int, SourceItem | 
ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: First value is a list of sources that already exist in the vectordb. Second value is a list of sources that are new and should be embedded. ''' @@ -49,15 +204,14 @@ def _filter_sources( def _sources_to_indocuments( config: TConfig, - sources: dict[int, SourceItem] -) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + sources: Mapping[int, SourceItem] +) -> tuple[Mapping[int, InDocument], Mapping[int, IndexingError]]: indocuments = {} errored_docs = {} for db_id, source in sources.items(): logger.debug('processing source', extra={ 'source_id': source.reference }) - # todo: maybe fetch the content of the files here # transform the source to have text data try: content = decode_source(source) @@ -121,8 +275,8 @@ def _sources_to_indocuments( def _increase_access_for_existing_sources( vectordb: BaseVectorDB, - existing_sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + existing_sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' update userIds for existing sources allow the userIds as additional users, not as the only users @@ -162,8 +316,8 @@ def _increase_access_for_existing_sources( def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
@@ -178,18 +332,21 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) - if len(to_embed_sources) == 0: + populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] + + if len(populated_to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.reference for source in to_embed_sources.values()] + 'source_ids': [source.reference for source in populated_to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) + indocuments, errored_docs = _sources_to_indocuments(config, populated_to_embed_sources) - source_proc_results.update(errored_docs) + source_proc_results.update(errored_docs) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Converted sources to documents') if len(indocuments) == 0: @@ -197,8 +354,12 @@ def _process_sources( logger.debug('All documents were found empty after being processed') return source_proc_results + logger.debug('Adding documents to vectordb', extra={ + 'source_ids': [indoc.source_id for indoc in indocuments.values()] + }) + doc_add_results = vectordb.add_indocuments(indocuments) - source_proc_results.update(doc_add_results) + source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Added documents to vectordb') return source_proc_results @@ -215,8 +376,8 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | 
None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ f'{source.reference} ({_decode_latin_1(source.title)})' diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 51f98e7..28aff6a 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,17 +3,16 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import asyncio import logging import os +from collections.abc import Mapping from contextlib import suppress from enum import Enum -from io import BytesIO from threading import Event, Thread from time import sleep import niquests -from nc_py_api import AsyncNextcloudApp, NextcloudApp +from nc_py_api import NextcloudApp from pydantic import ValidationError from .chain.ingest.injest import embed_sources @@ -25,7 +24,6 @@ EmbeddingException, FilesQueueItems, IndexingError, - IndexingException, LoaderException, ReceivedFileItem, SourceItem, @@ -46,12 +44,10 @@ THREADS = {} THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') -FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? -# max concurrent fetches to avoid overloading the NC server or hitting rate limits -CONCURRENT_FILE_FETCHES = 10 # todo: config? -MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 @@ -62,143 +58,6 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -async def __fetch_file_content( - semaphore: asyncio.Semaphore, - file_id: int, - user_id: str, - _rlimit = 3, -) -> BytesIO: - ''' - Raises - ------ - IndexingException - ''' - - async with semaphore: - nc = AsyncNextcloudApp() - try: - # a file pointer for storing the stream in memory until it is consumed - fp = BytesIO() - await nc._session.download2fp( - url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', - fp=fp, - dav=False, - params={ 'userId': user_id }, - ) - return fp - except niquests.exceptions.RequestException as e: - # todo: raise IndexingException with retryable=True for rate limit errors, - # todo: and handle it in the caller to not delete the source from the queue and retry later through - # todo: the normal lock expiry mechanism - if e.response is None: - raise - - if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] - # todo: implement rate limits in php CC? 
- wait_for = int(e.response.headers.get('Retry-After', '30')) - if _rlimit <= 0: - raise IndexingException( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - ' max retries exceeded', - retryable=True, - ) from e - LOGGER.warning( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - f' waiting {wait_for} before retrying', - exc_info=e, - ) - await asyncio.sleep(wait_for) - return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) - - raise - except IndexingException: - raise - except Exception as e: - LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) - raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e - - -async def __fetch_files_content( - files: dict[int, ReceivedFileItem] -) -> dict[int, SourceItem | IndexingError]: - source_items = {} - semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) - tasks = [] - - for db_id, file in files.items(): - try: - # to detect any validation errors but it should not happen since file.reference is validated - file.file_id # noqa: B018 - except ValueError as e: - LOGGER.error( - f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', - exc_info=e, - ) - source_items[db_id] = IndexingError( - error=f'Invalid file reference format: {file.reference}', - retryable=False, - ) - continue - - if file.size > MAX_FILE_SIZE: - LOGGER.info( - f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' - f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', - ) - source_items[db_id] = IndexingError( - error=( - f'File size {(file.size/(1024*1024)):.2f} MiB' - f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' - ), - retryable=False, - ) - continue - # todo: perform the existing file check before fetching the content to avoid unnecessary fetches - 
# any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - - results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(files.items(), results, strict=True): - if isinstance(result, IndexingException): - LOGGER.error( - f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=str(result), - retryable=result.retryable, - ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) - elif isinstance(result, BaseException): - LOGGER.error( - f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' - f' reference {file.reference}: {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=f'Unexpected error: {result}', - retryable=True, - ) - else: - LOGGER.error( - f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=True, - ) - source_items[db_id] = IndexingError( - error='Unknown error', - retryable=True, - ) - return source_items - - def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) @@ -206,7 +65,7 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: try: return exec_in_proc( target=embed_sources, @@ -225,7 +84,6 @@ def _load_sources(source_items: dict[int, 
SourceItem]) -> dict[int, IndexingErro try: nc = NextcloudApp() - # todo: add the 'size' param to the return of this call. q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/documents', @@ -242,29 +100,14 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro sleep(POLLING_COOLDOWN) continue - # populate files content and convert to source items - fetched_files = {} - source_files = {} - # unified error structure for files and content providers - source_errors = {} - - if q_items.files: - fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - - for db_id, result in fetched_files.items(): - if isinstance(result, SourceItem): - source_files[db_id] = result - else: - source_errors[db_id] = result - files_result = {} providers_result = {} - chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in range(0, len(source_files), chunk_size): - chunk = dict(list(source_files.items())[i:i+chunk_size]) + for i in range(0, len(q_items.files), chunk_size): + chunk = dict(list(q_items.files.items())[i:i+chunk_size]) files_result.update(_load_sources(chunk)) for i in range(0, len(q_items.content_providers), chunk_size): diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 9f23e14..59d2568 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import re +from collections.abc import Mapping from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self @@ -224,8 +225,8 @@ class Config: class FilesQueueItems(BaseModel): - files: dict[int, ReceivedFileItem] # [db id]: FileItem - content_providers: dict[int, SourceItem] # [db id]: SourceItem + files: Mapping[int, 
ReceivedFileItem] # [db id]: FileItem + content_providers: Mapping[int, SourceItem] # [db id]: SourceItem class IndexingException(Exception): @@ -343,4 +344,4 @@ class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): class ActionsQueueItems(BaseModel): - actions: dict[int, ActionsQueueItem] + actions: Mapping[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index ebd5407..2b4aa35 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from abc import ABC, abstractmethod +from collections.abc import Mapping from typing import Any from langchain.schema import Document @@ -10,7 +11,7 @@ from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType -from ..types import IndexingError, SourceItem +from ..types import IndexingError, ReceivedFileItem, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +63,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,7 +80,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index @timed @abstractmethod - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. @@ -88,8 +89,8 @@ def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list Args ---- - sources: list[UploadFile] - List of source ids to check. 
+ sources: Mapping[int, SourceItem | ReceivedFileItem] + Dict of sources to check. Returns ------- diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index bfca0bb..86f636b 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -4,6 +4,7 @@ # import logging import os +from collections.abc import Mapping from datetime import datetime import psycopg @@ -17,7 +18,14 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import ( + EmbeddingException, + FatalEmbeddingException, + IndexingError, + ReceivedFileItem, + RetryableEmbeddingException, + SourceItem, +) from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -129,7 +137,7 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails @@ -208,7 +216,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index return results @timed - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' returns a tuple of (existing_source_ids, to_embed_source_ids) ''' From 1ade19186593193a5005d2aadc97a83b25f601b8 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 18 Mar 2026 16:49:09 +0530 Subject: [PATCH 19/96] fix: truly parallel file parsing and 
indexing Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 48 ++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 28aff6a..f07f501 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -4,8 +4,10 @@ # import logging +import math import os from collections.abc import Mapping +from concurrent.futures import ThreadPoolExecutor from contextlib import suppress from enum import Enum from threading import Event, Thread @@ -47,7 +49,7 @@ FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks -PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? +PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 @@ -71,10 +73,14 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> target=embed_sources, args=(vectordb_loader, app_config, source_items), ) - except (DbException, EmbeddingException): - raise except Exception as e: - raise DbException('Error: failed to load sources') from e + err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") + source_ids = (s.reference for s in source_items.values()) + err = IndexingError( + error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + retryable=True, + ) + return dict.fromkeys(source_items, err) while True: @@ -102,17 +108,33 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> files_result = {} providers_result = {} - chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) - # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in 
range(0, len(q_items.files), chunk_size): - chunk = dict(list(q_items.files.items())[i:i+chunk_size]) - files_result.update(_load_sources(chunk)) - - for i in range(0, len(q_items.content_providers), chunk_size): - chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) - providers_result.update(_load_sources(chunk)) + file_chunk_size = max(MIN_FILES_PER_CPU, math.ceil(len(q_items.files) / PARALLEL_FILE_PARSING_COUNT)) + file_chunks = [ + dict(list(q_items.files.items())[i:i+file_chunk_size]) + for i in range(0, len(q_items.files), file_chunk_size) + ] + provider_chunk_size = max( + MIN_FILES_PER_CPU, + math.ceil(len(q_items.content_providers) / PARALLEL_FILE_PARSING_COUNT), + ) + provider_chunks = [ + dict(list(q_items.content_providers.items())[i:i+provider_chunk_size]) + for i in range(0, len(q_items.content_providers), provider_chunk_size) + ] + + with ThreadPoolExecutor( + max_workers=PARALLEL_FILE_PARSING_COUNT, + thread_name_prefix='IndexingPool', + ) as executor: + file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] + provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] + + for future in file_futures: + files_result.update(future.result()) + for future in provider_futures: + providers_result.update(future.result()) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 12fd1ca00fc6d3fab6e91b8bb4dbc6c11488ca74 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Tue, 24 Mar 2026 10:36:04 +0100 Subject: [PATCH 20/96] initial pass at request processing --- context_chat_backend/controller.py | 4 +- context_chat_backend/task_fetcher.py | 362 +++++++++++++++++++++++++-- 2 files changed, 350 insertions(+), 16 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 797ba20..3ebdc8a 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -40,7 +40,7 @@ from .models.types import 
LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, value_of -from .task_fetcher import start_bg_threads, wait_for_bg_threads +from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider # setup @@ -83,7 +83,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): - set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch) + set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) nc = NextcloudApp() if nc.enabled_state: app_enabled.set() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index f07f501..a502802 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -12,26 +12,25 @@ from enum import Enum from threading import Event, Thread from time import sleep +from typing import Any import niquests -from nc_py_api import NextcloudApp +from langchain.llms.base import LLM +from langchain.schema import Document +from nc_py_api import NextcloudApp, NextcloudException +from niquests import JSONDecodeError, RequestException from pydantic import ValidationError +from .chain.context import get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.query_proc import get_pruned_query +from .chain.types import ContextException, LLMOutput, ScopeType +from .controller import llm_loader from .dyn_loader import VectorDBLoader -from .types import ( - ActionsQueueItems, - ActionType, - AppRole, - EmbeddingException, - FilesQueueItems, - IndexingError, - LoaderException, - ReceivedFileItem, - SourceItem, - TConfig, -) +from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ + LoaderException, ReceivedFileItem, SourceItem, TConfig from .utils 
import exec_in_proc, get_app_role +from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -52,6 +51,10 @@ PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 +TRIGGER = Event() +CHECK_INTERVAL = 5 +CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 +CHECK_INTERVAL_ON_ERROR = 15 class ThreadType(Enum): @@ -370,7 +373,78 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... + logger.info('Starting task fetcher loop') + + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + nc = NextcloudApp() + llm: LLM = llm_loader.load() + + while True: + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') + return + + try: + # Fetch pending task + try: + response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + if not response: + wait_for_tasks() + continue + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error fetching the next task {e}", exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) + continue + + # Process task + task = response["task"] + provider = response["provider"] + + try: + logger.debug(f'Processing task {task["id"]}') + result = process_task(task, vectordb_loader, llm, app_config) + + # Return result to Nextcloud + success = return_result_to_nextcloud(task_id, result) + + if success: + LOGGER.info(f'Task {task["id"]} completed successfully') + else: + LOGGER.error(f'Failed to return result for task {task["id"]}') + + except ContextException as e: + LOGGER.warning(f'Context error for task {task["id"]}: {e}') + # TODO: Return 
error to Nextcloud + except ValueError as e: + LOGGER.warning(f'Validation error for task {task["id"]}: {e}') + # TODO: Return error to Nextcloud + except Exception as e: + LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) + # TODO: Return error to Nextcloud + + except Exception as e: + logger.exception('Error in task fetcher loop', exc_info=e) + # TODO: Add appropriate error handling and backoff + +def trigger_handler(providerId: str): + global TRIGGER + print('TRIGGER called') + TRIGGER.set() + +def wait_for_tasks(interval = None): + global TRIGGER + global CHECK_INTERVAL + global CHECK_INTERVAL_WITH_TRIGGER + actual_interval = CHECK_INTERVAL if interval is None else interval + if TRIGGER.wait(timeout=actual_interval): + CHECK_INTERVAL = CHECK_INTERVAL_WITH_TRIGGER + TRIGGER.clear() + def start_bg_threads(app_config: TConfig, app_enabled: Event): @@ -430,3 +504,263 @@ def wait_for_bg_threads(): THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) + + +# Default LLM template for context-based queries +_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. +{context} + +{question} +''' + +def query_vector_database( + user_id: str, + query: str, + vectordb: BaseVectorDB, + ctx_limit: int, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, +) -> list[Document]: + """ + Query the vector database to retrieve relevant documents. 
+ + Args: + user_id: User ID for scoping the search + query: The search query text + vectordb: Vector database instance + ctx_limit: Maximum number of documents to return + scope_type: Optional scope type (PROVIDER or SOURCE) + scope_list: Optional list of scope identifiers + + Returns: + List of relevant Document objects + + Raises: + ContextException: If scope type is provided without scope list + """ + context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) + logger.debug('Retrieved context documents', extra={ + 'user_id': user_id, + 'num_docs': len(context_docs), + 'ctx_limit': ctx_limit, + }) + return context_docs + + +def prepare_context_chunks(context_docs: list[Document]) -> list[str]: + """ + Extract and format text chunks from documents for LLM context. + + Args: + context_docs: List of Document objects from vector DB + + Returns: + List of formatted text chunks including titles and content + """ + return get_context_chunks(context_docs) + + +def generate_llm_response( + llm: LLM, + app_config: TConfig, + user_id: str, + query: str, + template: str, + context_chunks: list[str], + end_separator: str = '', +) -> str: + """ + Generate LLM response using the pruned query and context. 
+ + Args: + llm: Language model instance + app_config: Application configuration + user_id: User ID for the request + query: The original query text + template: Template for formatting the prompt + context_chunks: Context chunks to include in the prompt + end_separator: Optional separator to stop generation + + Returns: + Generated LLM output text + + Raises: + ValueError: If context length is too small to fit the query + """ + pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) + + stop = [end_separator] if end_separator else None + output = llm.invoke( + pruned_query_text, + stop=stop, + userid=user_id, + ).strip() + + logger.debug('Generated LLM response', extra={ + 'user_id': user_id, + 'output_length': len(output), + }) + return output + + +def extract_unique_sources(context_docs: list[Document]) -> list[str]: + """ + Extract unique source IDs from context documents. + + Args: + context_docs: List of Document objects + + Returns: + List of unique source IDs + """ + unique_sources: list[str] = list({ + source for d in context_docs if (source := d.metadata.get('source')) + }) + return unique_sources + +def execute_context_query( + user_id: str, + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, + query: str, + ctx_limit: int = 20, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, + template: str | None = None, + end_separator: str = '', +) -> LLMOutput: + """ + Execute a RAG query with context retrieval from vector database. + + This is the main function for processing queries that require context + from the vector database. It orchestrates the entire RAG pipeline: + 1. Query vector database for relevant documents + 2. Extract and format context chunks + 3. Generate LLM response with context + 4. 
Return output with source references + + Args: + user_id: User ID for the request + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + query: The query text + ctx_limit: Maximum number of context documents (default: 20) + scope_type: Optional scope type for filtering + scope_list: Optional list of scope identifiers + template: Optional custom prompt template + end_separator: Optional separator to stop generation + + Returns: + LLMOutput with generated text and source references + + Raises: + ContextException: If no documents are retrieved + ValueError: If context length is too small to fit the query + """ + logger.info('Executing context query', extra={ + 'user_id': user_id, + 'query_length': len(query), + 'ctx_limit': ctx_limit, + }) + + # Step 1: Load vector database and retrieve relevant documents + db = vectordb_loader.load() + context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + + if len(context_docs) == 0: + raise ContextException('No documents retrieved, please index a few documents first') + + # Step 2: Prepare context chunks for LLM + context_chunks = prepare_context_chunks(context_docs) + logger.debug('Prepared context chunks', extra={ + 'num_docs': len(context_docs), + 'num_chunks': len(context_chunks), + }) + + # Step 3: Generate LLM response + output = generate_llm_response( + llm, + app_config, + user_id, + query, + template or _LLM_TEMPLATE, + context_chunks, + end_separator, + ) + + # Step 4: Extract unique sources for citation + unique_sources = extract_unique_sources(context_docs) + + logger.info('Context query completed', extra={ + 'user_id': user_id, + 'num_sources': len(unique_sources), + }) + + return LLMOutput(output=output, sources=unique_sources) + +# ============================================================================ +# Task Queue Processing +# ============================================================================ 
+ + +def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: + """ + Return query result back to Nextcloud. + + STUB: This function should be implemented to send results back + to Nextcloud's task queue or API endpoint. + + Args: + task_id: Unique task identifier + result: The LLMOutput result to return + + Returns: + True if successful, False otherwise + """ + logger.debug('Returning result to Nextcloud (STUB)', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), + }) + # TODO: Implement actual Nextcloud result submission + return True + + +def process_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, +) -> LLMOutput: + """ + Process a single query task. + + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + + Returns: + LLMOutput with generated text and sources + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + query = task['query'] + + return execute_context_query( + user_id=user_id, + vectordb_loader=vectordb_loader, + llm=llm, + app_config=app_config, + query=query, + ctx_limit=task.get('ctx_limit', 20), + scope_type=task.get('scope_type'), + scope_list=task.get('scope_list'), + template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it + end_separator=task.get('end_separator', ''), # TODO: same here + ) \ No newline at end of file From 8aa2471080c10ea7b0a97a9d2dac4023e005464c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 25 Mar 2026 10:42:40 +0100 Subject: [PATCH 21/96] implement request processing --- context_chat_backend/chain/one_shot.py | 1 + context_chat_backend/chain/types.py | 12 + context_chat_backend/controller.py | 19 +- context_chat_backend/task_fetcher.py | 292 +++++++++++++++---------- 4 files changed, 
201 insertions(+), 123 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index 1c0521b..d0f5bbe 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -20,6 +20,7 @@ logger = logging.getLogger('ccb.chain') +# todo: remove this maybe def process_query( user_id: str, llm: LLM, diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index b006ad1..c527756 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -42,3 +42,15 @@ class LLMOutput(TypedDict): class SearchResult(TypedDict): source_id: str title: str + +class EnrichedSource(BaseModel): + id: str + label: str + icon: str + url: str + +class EnrichedSourceList(BaseModel): + sources: list[EnrichedSource] + +class ScopeList(BaseModel): + source_ids: list[str] \ No newline at end of file diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3ebdc8a..1e0d277 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult @@ -65,9 +66,23 @@ } if __download_models_from_hf else {} app_enabled = Event() -def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: +def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: + provider = TaskProcessingProvider( + id="context_chat-context_chat_search", + name="Context Chat", + task_type="context_chat:context_chat_search", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) + provider = TaskProcessingProvider( + id="context_chat-context_chat", + name="Context 
Chat", + task_type="context_chat:context_chat", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) app_enabled.set() start_bg_threads(app_config, app_enabled) else: @@ -383,7 +398,7 @@ def download_logs() -> FileResponse: # 'title': source.headers.get('title'), # 'headers': source.headers, # }) -# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) +# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) # # wait for 10 minutes before failing the request # semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a502802..7951f06 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # - +import json import logging import math import os @@ -21,11 +21,13 @@ from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import get_context_chunks, get_context_docs +from .chain.context import do_doc_search, get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, LLMOutput, ScopeType -from .controller import llm_loader +from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ + SearchResult +from .controller import Query, execute_query, llm_loader from .dyn_loader import VectorDBLoader from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ LoaderException, ReceivedFileItem, SourceItem, TConfig @@ -55,6 +57,7 @@ CHECK_INTERVAL = 5 CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 CHECK_INTERVAL_ON_ERROR = 
15 +CONTEXT_LIMIT=20 class ThreadType(Enum): @@ -372,8 +375,25 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: continue +def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: + """ + + Parameters + ---------- + source_ids + + Returns + ------- + source_ids with only files, no folders (or source_ids in case of non-file provider) + """ + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) + sources = ScopeList.model_validate(data).source_ids + return sources + + def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - logger.info('Starting task fetcher loop') + LOGGER.info('Starting task fetcher loop') try: vectordb_loader = VectorDBLoader(app_config) @@ -392,7 +412,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) if not response: wait_for_tasks() continue @@ -403,14 +423,26 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Process task task = response["task"] - provider = response["provider"] + userId = task['userId'] try: - logger.debug(f'Processing task {task["id"]}') - result = process_task(task, vectordb_loader, llm, app_config) - - # Return result to Nextcloud - success = return_result_to_nextcloud(task_id, result) + LOGGER.debug(f'Processing task {task["id"]}') + + if task['input'].get('scopeType') == 'source': + # Resolve scope list to only files, no folders + task['input']['scopeList'] = resolve_scope_list(task['input'].get('scopeList'), userId) + + if task['type'] == 'context_chat:context_chat': + result: 
LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) + # Return result to Nextcloud + success = return_normal_result_to_nextcloud(task['id'], userId, result) + elif task['type'] == 'context_chat:context_chat_search': + result: list[SearchResult] = process_search_task(task, vectordb_loader) + # Return result to Nextcloud + success = return_search_result_to_nextcloud(task['id'], userId, result) + else: + LOGGER.error(f'Unknown task type {task["type"]}') + success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) if success: LOGGER.info(f'Task {task["id"]} completed successfully') @@ -419,17 +451,17 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: except ContextException as e: LOGGER.warning(f'Context error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except ValueError as e: LOGGER.warning(f'Validation error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: - logger.exception('Error in task fetcher loop', exc_info=e) - # TODO: Add appropriate error handling and backoff + LOGGER.exception('Error in task fetcher loop', exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) def trigger_handler(providerId: str): global TRIGGER @@ -506,13 +538,6 @@ def wait_for_bg_threads(): THREADS.pop(ThreadType.REQUEST_PROCESSING) -# Default LLM template for context-based queries -_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
-{context} - -{question} -''' - def query_vector_database( user_id: str, query: str, @@ -539,7 +564,7 @@ def query_vector_database( ContextException: If scope type is provided without scope list """ context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - logger.debug('Retrieved context documents', extra={ + LOGGER.debug('Retrieved context documents', extra={ 'user_id': user_id, 'num_docs': len(context_docs), 'ctx_limit': ctx_limit, @@ -596,7 +621,7 @@ def generate_llm_response( userid=user_id, ).strip() - logger.debug('Generated LLM response', extra={ + LOGGER.debug('Generated LLM response', extra={ 'user_id': user_id, 'output_length': len(output), }) @@ -618,117 +643,112 @@ def extract_unique_sources(context_docs: list[Document]) -> list[str]: }) return unique_sources -def execute_context_query( - user_id: str, - vectordb_loader: VectorDBLoader, - llm: LLM, - app_config: TConfig, - query: str, - ctx_limit: int = 20, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, - template: str | None = None, - end_separator: str = '', -) -> LLMOutput: +def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: """ - Execute a RAG query with context retrieval from vector database. - - This is the main function for processing queries that require context - from the vector database. It orchestrates the entire RAG pipeline: - 1. Query vector database for relevant documents - 2. Extract and format context chunks - 3. Generate LLM response with context - 4. Return output with source references + Return query result back to Nextcloud. 
Args: - user_id: User ID for the request - vectordb_loader: Vector database loader instance - llm: Language model instance - app_config: Application configuration - query: The query text - ctx_limit: Maximum number of context documents (default: 20) - scope_type: Optional scope type for filtering - scope_list: Optional list of scope identifiers - template: Optional custom prompt template - end_separator: Optional separator to stop generation + task_id: Unique task identifier + result: The LLMOutput result to return Returns: - LLMOutput with generated text and source references - - Raises: - ContextException: If no documents are retrieved - ValueError: If context length is too small to fit the query + True if successful, False otherwise """ - logger.info('Executing context query', extra={ - 'user_id': user_id, - 'query_length': len(query), - 'ctx_limit': ctx_limit, + LOGGER.debug('Returning result to Nextcloud', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), }) - # Step 1: Load vector database and retrieve relevant documents - db = vectordb_loader.load() - context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + nc = NextcloudApp() - if len(context_docs) == 0: - raise ContextException('No documents retrieved, please index a few documents first') + try: + nc.providers.task_processing.report_result(task_id, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False - # Step 2: Prepare context chunks for LLM - context_chunks = prepare_context_chunks(context_docs) - logger.debug('Prepared context chunks', extra={ - 'num_docs': len(context_docs), - 'num_chunks': len(context_chunks), - }) + return True - # Step 3: Generate LLM response - output = generate_llm_response( - llm, - 
app_config, - user_id, - query, - template or _LLM_TEMPLATE, - context_chunks, - end_separator, - ) +def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + # todo: refactor to include title here + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - # Step 4: Extract unique sources for citation - unique_sources = extract_unique_sources(context_docs) +def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - logger.info('Context query completed', extra={ - 'user_id': user_id, - 'num_sources': len(unique_sources), + +def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: + """ + Return search result back to Nextcloud. 
+ + Args: + task_id: Unique task identifier + result: The list of search results to return + + Returns: + True if successful, False otherwise + """ + LOGGER.debug('Returning search result to Nextcloud', extra={ + 'task_id': task_id, + 'num_sources': len(result), }) - return LLMOutput(output=output, sources=unique_sources) + nc = NextcloudApp() -# ============================================================================ -# Task Queue Processing -# ============================================================================ + try: + sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] + nc.providers.task_processing.report_result(task_id, { + 'sources': sources, + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + return False -def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. + return True - STUB: This function should be implemented to send results back - to Nextcloud's task queue or API endpoint. +def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: + """ + Return error result back to Nextcloud. 
Args: task_id: Unique task identifier - result: The LLMOutput result to return + e: error object Returns: True if successful, False otherwise """ - logger.debug('Returning result to Nextcloud (STUB)', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - # TODO: Implement actual Nextcloud result submission + LOGGER.debug('Returning error to Nextcloud', exc_info=e) + + nc = NextcloudApp() + + if isinstance(e, ValueError): + message = "Validation error: " + str(e) + elif isinstance(e, ContextException): + message = "Context error" + str(e) + else: + message = "Unexpected error" + str(e) + + try: + nc.providers.task_processing.report_result(task_id, None, message) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False + return True -def process_task( +def process_normal_task( task: dict[str, Any], vectordb_loader: VectorDBLoader, llm: LLM, @@ -750,17 +770,47 @@ def process_task( Various exceptions from query execution """ user_id = task['user_id'] - query = task['query'] - - return execute_context_query( - user_id=user_id, - vectordb_loader=vectordb_loader, - llm=llm, - app_config=app_config, - query=query, - ctx_limit=task.get('ctx_limit', 20), - scope_type=task.get('scope_type'), - scope_list=task.get('scope_list'), - template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it - end_separator=task.get('end_separator', ''), # TODO: same here + task_input = task['input'] + + return exec_in_proc(target=process_context_query, + args=( + user_id, + vectordb_loader, + llm, + app_config, + task_input.get('prompt'), + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) + ) + +def process_search_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, +) -> list[SearchResult]: + """ + Process a single search task. 
+ + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + + Returns: + list of Search results + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + task_input = task['input'] + return exec_in_proc(target=do_doc_search, + args=( + user_id, + task_input.get('prompt'), + vectordb_loader, + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) ) \ No newline at end of file From 2093936913c08e55c5aca01b559314df731b4bb4 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 22:43:48 +0530 Subject: [PATCH 22/96] request processing fixes Signed-off-by: Anupam Kumar --- context_chat_backend/chain/one_shot.py | 7 +- context_chat_backend/chain/types.py | 14 +- context_chat_backend/controller.py | 179 ++++++++++++------------- context_chat_backend/dyn_loader.py | 16 +-- context_chat_backend/task_fetcher.py | 164 +++++++++++----------- 5 files changed, 185 insertions(+), 195 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index d0f5bbe..c79f272 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -10,7 +10,7 @@ from ..types import TConfig from .context import get_context_chunks, get_context_docs from .query_proc import get_pruned_query -from .types import ContextException, LLMOutput, ScopeType +from .types import ContextException, LLMOutput, ScopeType, SearchResult _LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
{context} @@ -79,6 +79,9 @@ def process_context_query( stop=[end_separator], userid=user_id, ).strip() - unique_sources: list[str] = list({source for d in context_docs if (source := d.metadata.get('source'))}) + unique_sources = [SearchResult( + source_id=source, + title=d.metadata.get('title', ''), + ) for d in context_docs if (source := d.metadata.get('source'))] return LLMOutput(output=output, sources=unique_sources) diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index c527756..3afdf29 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -33,16 +33,16 @@ class ContextException(Exception): ... -class LLMOutput(TypedDict): - output: str - sources: list[str] - # todo: add "titles" field - - class SearchResult(TypedDict): source_id: str title: str + +class LLMOutput(TypedDict): + output: str + sources: list[SearchResult] + + class EnrichedSource(BaseModel): id: str label: str @@ -53,4 +53,4 @@ class EnrichedSourceList(BaseModel): sources: list[EnrichedSource] class ScopeList(BaseModel): - source_ids: list[str] \ No newline at end of file + source_ids: list[str] diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 1e0d277..33e3cad 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -5,7 +5,7 @@ from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off -from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult +from .chain.types import ContextException from .types import LoaderException, EmbeddingException from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars @@ -25,22 +25,17 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from typing import Any from fastapi import FastAPI, Request -from langchain.llms.base import LLM 
from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers -from pydantic import BaseModel, ValidationInfo, field_validator from starlette.responses import FileResponse -from .chain.context import do_doc_search -from .chain.one_shot import process_context_query, process_query from .config_parser import get_config -from .dyn_loader import LLMModelLoader, VectorDBLoader +from .dyn_loader import VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, value_of +from .utils import JSONResponse, exec_in_proc from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider @@ -108,7 +103,6 @@ async def lifespan(app: FastAPI): t.start() yield vectordb_loader.offload() - llm_loader.offload() wait_for_bg_threads() @@ -120,7 +114,6 @@ async def lifespan(app: FastAPI): # loaders vectordb_loader = VectorDBLoader(app_config) -llm_loader = LLMModelLoader(app, app_config) # locks and semaphores @@ -438,90 +431,90 @@ def download_logs() -> FileResponse: # return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) -class Query(BaseModel): - userId: str - query: str - useContext: bool = True - scopeType: ScopeType | None = None - scopeList: list[str] | None = None - ctxLimit: int = 20 - - @field_validator('userId', 'query', 'ctxLimit') - @classmethod - def check_empty_values(cls, value: Any, info: ValidationInfo): - if value_of(value) is None: - raise ValueError('Empty value for field', info.field_name) - - return value - - @field_validator('ctxLimit') - @classmethod - def at_least_one_context(cls, value: int): - if value < 1: - raise ValueError('Invalid context chunk limit') - - return value - - -def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: - llm: LLM = llm_loader.load() - template = 
app.extra.get('LLM_TEMPLATE') - no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] - # todo: array - end_separator = app.extra.get('LLM_END_SEPARATOR', '') - - if query.useContext: - target = process_context_query - args=( - query.userId, - vectordb_loader, - llm, - app_config, - query.query, - query.ctxLimit, - query.scopeType, - query.scopeList, - template, - end_separator, - ) - else: - target=process_query - args=( - query.userId, - llm, - app_config, - query.query, - no_ctx_template, - end_separator, - ) - - if in_proc: - return exec_in_proc(target=target, args=args) - - return target(*args) # pyright: ignore - - -@app.post('/query') -@enabled_guard(app) -def _(query: Query) -> LLMOutput: - logger.debug('received query request', extra={ 'query': query.dict() }) +# class Query(BaseModel): +# userId: str +# query: str +# useContext: bool = True +# scopeType: ScopeType | None = None +# scopeList: list[str] | None = None +# ctxLimit: int = 20 + +# @field_validator('userId', 'query', 'ctxLimit') +# @classmethod +# def check_empty_values(cls, value: Any, info: ValidationInfo): +# if value_of(value) is None: +# raise ValueError('Empty value for field', info.field_name) + +# return value + +# @field_validator('ctxLimit') +# @classmethod +# def at_least_one_context(cls, value: int): +# if value < 1: +# raise ValueError('Invalid context chunk limit') + +# return value + + +# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: +# llm: LLM = llm_loader.load() +# template = app.extra.get('LLM_TEMPLATE') +# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] +# # todo: array +# end_separator = app.extra.get('LLM_END_SEPARATOR', '') + +# if query.useContext: +# target = process_context_query +# args=( +# query.userId, +# vectordb_loader, +# llm, +# app_config, +# query.query, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# template, +# end_separator, +# ) +# else: +# target=process_query +# args=( +# query.userId, +# llm, +# app_config, +# 
query.query, +# no_ctx_template, +# end_separator, +# ) - if app_config.llm[0] == 'nc_texttotext': - return execute_query(query) +# if in_proc: +# return exec_in_proc(target=target, args=args) - with llm_lock: - return execute_query(query, in_proc=False) +# return target(*args) # pyright: ignore -@app.post('/docSearch') -@enabled_guard(app) -def _(query: Query) -> list[SearchResult]: - # useContext from Query is not used here - return exec_in_proc(target=do_doc_search, args=( - query.userId, - query.query, - vectordb_loader, - query.ctxLimit, - query.scopeType, - query.scopeList, - )) +# @app.post('/query') +# @enabled_guard(app) +# def _(query: Query) -> LLMOutput: +# logger.debug('received query request', extra={ 'query': query.dict() }) + +# if app_config.llm[0] == 'nc_texttotext': +# return execute_query(query) + +# with llm_lock: +# return execute_query(query, in_proc=False) + + +# @app.post('/docSearch') +# @enabled_guard(app) +# def _(query: Query) -> list[SearchResult]: +# # useContext from Query is not used here +# return exec_in_proc(target=do_doc_search, args=( +# query.userId, +# query.query, +# vectordb_loader, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# )) diff --git a/context_chat_backend/dyn_loader.py b/context_chat_backend/dyn_loader.py index d67310f..47b1957 100644 --- a/context_chat_backend/dyn_loader.py +++ b/context_chat_backend/dyn_loader.py @@ -7,11 +7,9 @@ import gc import logging from abc import ABC, abstractmethod -from time import time from typing import Any import torch -from fastapi import FastAPI from langchain.llms.base import LLM from .models.loader import init_model @@ -54,19 +52,11 @@ def offload(self) -> None: class LLMModelLoader(Loader): - def __init__(self, app: FastAPI, config: TConfig) -> None: + def __init__(self, config: TConfig) -> None: self.config = config - self.app = app def load(self) -> LLM: - if self.app.extra.get('LLM_MODEL') is not None: - self.app.extra['LLM_LAST_ACCESSED'] = time() - return 
self.app.extra['LLM_MODEL'] - llm_name, llm_config = self.config.llm - self.app.extra['LLM_TEMPLATE'] = llm_config.pop('template', '') - self.app.extra['LLM_NO_CTX_TEMPLATE'] = llm_config.pop('no_ctx_template', '') - self.app.extra['LLM_END_SEPARATOR'] = llm_config.pop('end_separator', '') try: model = init_model('llm', (llm_name, llm_config)) @@ -75,13 +65,9 @@ def load(self) -> LLM: if not isinstance(model, LLM): raise LoaderException(f'Error: {model} does not implement "llm" type or has returned an invalid object') - self.app.extra['LLM_MODEL'] = model - self.app.extra['LLM_LAST_ACCESSED'] = time() return model def offload(self) -> None: - if self.app.extra.get('LLM_MODEL') is not None: - del self.app.extra['LLM_MODEL'] clear_cache() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 7951f06..634b51c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import json import logging import math import os @@ -25,12 +24,20 @@ from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ - SearchResult -from .controller import Query, execute_query, llm_loader -from .dyn_loader import VectorDBLoader -from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ - LoaderException, ReceivedFileItem, SourceItem, TConfig +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .dyn_loader import LLMModelLoader, VectorDBLoader +from .types import ( + ActionsQueueItems, + ActionType, + AppRole, + EmbeddingException, + FilesQueueItems, + IndexingError, + 
LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) from .utils import exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( @@ -387,9 +394,11 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: source_ids with only files, no folders (or source_ids in case of non-file provider) """ nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) - sources = ScopeList.model_validate(data).source_ids - return sources + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/resolve_scope_list', json={ + 'source_ids': source_ids, + 'userId': userId, + }) + return ScopeList.model_validate(data).source_ids def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: @@ -397,6 +406,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) + llm_loader = LLMModelLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return @@ -412,7 +422,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) + response = nc.providers.task_processing.next_task( + ['context_chat-context_chat', 'context_chat-context_chat_search'], + ['context_chat:context_chat', 'context_chat:context_chat_search'], + ) if not response: wait_for_tasks() continue @@ -437,9 +450,9 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Return result to Nextcloud success = return_normal_result_to_nextcloud(task['id'], userId, result) elif task['type'] == 'context_chat:context_chat_search': - 
result: list[SearchResult] = process_search_task(task, vectordb_loader) + search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, result) + success = return_search_result_to_nextcloud(task['id'], userId, search_result) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -480,62 +493,60 @@ def wait_for_tasks(interval = None): def start_bg_threads(app_config: TConfig, app_enabled: Event): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - case AppRole.RP | AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + 
target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() def wait_for_bg_threads(): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) - case AppRole.RP | AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + 
THREADS.pop(ThreadType.REQUEST_PROCESSING) def query_vector_database( @@ -673,18 +684,12 @@ def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutp return True -def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: - nc = NextcloudApp() - # todo: refactor to include title here - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) - sources = EnrichedSourceList.model_validate(data).sources - return sources -def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: +def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) sources = EnrichedSourceList.model_validate(data).sources - return sources + return [s.model_dump_json() for s in sources] def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: @@ -706,10 +711,8 @@ def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[Se nc = NextcloudApp() try: - sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] - nc.providers.task_processing.report_result(task_id, { - 'sources': sources, + 'sources': enrich_sources(result, userId), }) except (NextcloudException, RequestException, JSONDecodeError) as e: LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) @@ -769,8 +772,10 @@ def process_normal_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None return exec_in_proc(target=process_context_query, args=( 
@@ -802,8 +807,11 @@ def process_search_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None + return exec_in_proc(target=do_doc_search, args=( user_id, @@ -813,4 +821,4 @@ def process_search_task( task_input.get('scopeType'), task_input.get('scopeList'), ) - ) \ No newline at end of file + ) From 36b5f0211ee2da2123d220a312521afe204a559b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 23:01:56 +0530 Subject: [PATCH 23/96] chore: drop commented code Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 292 +---------------------------- 1 file changed, 1 insertion(+), 291 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 33e3cad..49d1d73 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from collections.abc import Callable from contextlib import asynccontextmanager from functools import wraps -from threading import Event, Thread from fastapi import FastAPI, Request from nc_py_api import AsyncNextcloudApp, NextcloudApp @@ -59,7 +58,7 @@ 'revision': '607a30d783dfa663caf39e06633721c8d4cfcd7e', } } if __download_models_from_hf else {} -app_enabled = Event() +app_enabled = threading.Event() def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: @@ -99,8 +98,6 @@ async def lifespan(app: FastAPI): app_enabled.set() start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') - t = Thread(target=background_thread_task, args=()) - t.start() yield vectordb_loader.offload() wait_for_bg_threads() @@ -134,15 +131,6 @@ async def lifespan(app: FastAPI): if not app_config.disable_aaa: app.add_middleware(AppAPIAuthMiddleware) -# logger background thread - -def background_thread_task(): - # todo - # 
while(True): - # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - # sleep(10) - ... - # exception handlers @app.exception_handler(DbException) @@ -240,281 +228,3 @@ def download_logs() -> FileResponse: if os.path.isfile(file_path): # Might be a folder (just skip it then) zip_file.write(file_path) return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - - -# @app.post('/updateAccessDeclarative') -# @enabled_guard(app) -# def _( -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access declarative request:', extra={ -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccess') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccessProvider') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# providerId: Annotated[str, Body()], -# ): -# logger.debug('Update access by provider request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'provider_id': providerId, -# }) - -# if 
len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_provider_id(providerId): -# return JSONResponse('Invalid provider id', 400) - -# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) - -# return JSONResponse('Access updated') - - -# @app.post('/deleteSources') -# @enabled_guard(app) -# def _(sourceIds: Annotated[list[str], Body(embed=True)]): -# logger.debug('Delete sources request', extra={ -# 'source_ids': sourceIds, -# }) - -# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - -# if len(sourceIds) == 0: -# return JSONResponse('No sources provided', 400) - -# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) -# if res is False: -# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteProvider') -# @enabled_guard(app) -# def _(providerKey: str = Body(embed=True)): -# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - -# if value_of(providerKey) is None: -# return JSONResponse('Invalid provider key provided', 400) - -# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteUser') -# @enabled_guard(app) -# def _(userId: str = Body(embed=True)): -# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) - -# if value_of(userId) is None: -# return JSONResponse('Invalid userId provided', 400) - -# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) - -# return JSONResponse('User deleted') - - -# @app.put('/loadSources') -# @enabled_guard(app) -# def _(sources: list[UploadFile]): -# global _indexing - -# if len(sources) == 0: -# return JSONResponse('No sources provided', 400) - -# for source in sources: -# if 
not value_of(source.filename): -# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) - -# with index_lock: -# if source.filename in _indexing: -# # this request will be retried by the client -# return JSONResponse( -# f'This source ({source.filename}) is already being processed in another request, try again later', -# 503, -# headers={'cc-retry': 'true'}, -# ) - -# if not ( -# value_of(source.headers.get('userIds')) -# and source.headers.get('title', None) is not None -# and value_of(source.headers.get('type')) -# and value_of(source.headers.get('modified')) -# and source.headers['modified'].isdigit() -# and value_of(source.headers.get('provider')) -# ): -# logger.error('Invalid/missing headers received', extra={ -# 'source_id': source.filename, -# 'title': source.headers.get('title'), -# 'headers': source.headers, -# }) -# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) - -# # wait for 10 minutes before failing the request -# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) -# if not semres: -# return JSONResponse( -# 'Document parser worker limit reached, try again in some time or consider increasing the limit', -# 503, -# headers={'cc-retry': 'true'} -# ) - -# with index_lock: -# for source in sources: -# _indexing[source.filename] = source.size - -# try: -# loaded_sources, not_added_sources = exec_in_proc( -# target=embed_sources, -# args=(vectordb_loader, app.extra['CONFIG'], sources) -# ) -# except (DbException, EmbeddingException): -# raise -# except Exception as e: -# raise DbException('Error: failed to load sources') from e -# finally: -# with index_lock: -# for source in sources: -# _indexing.pop(source.filename, None) -# doc_parse_semaphore.release() - -# if len(loaded_sources) != len(sources): -# logger.debug('Some sources were not loaded', extra={ -# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', -# 'source_ids': loaded_sources, -# }) - -# # 
loaded sources include the existing sources that may only have their access updated -# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) - - -# class Query(BaseModel): -# userId: str -# query: str -# useContext: bool = True -# scopeType: ScopeType | None = None -# scopeList: list[str] | None = None -# ctxLimit: int = 20 - -# @field_validator('userId', 'query', 'ctxLimit') -# @classmethod -# def check_empty_values(cls, value: Any, info: ValidationInfo): -# if value_of(value) is None: -# raise ValueError('Empty value for field', info.field_name) - -# return value - -# @field_validator('ctxLimit') -# @classmethod -# def at_least_one_context(cls, value: int): -# if value < 1: -# raise ValueError('Invalid context chunk limit') - -# return value - - -# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: -# llm: LLM = llm_loader.load() -# template = app.extra.get('LLM_TEMPLATE') -# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] -# # todo: array -# end_separator = app.extra.get('LLM_END_SEPARATOR', '') - -# if query.useContext: -# target = process_context_query -# args=( -# query.userId, -# vectordb_loader, -# llm, -# app_config, -# query.query, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# template, -# end_separator, -# ) -# else: -# target=process_query -# args=( -# query.userId, -# llm, -# app_config, -# query.query, -# no_ctx_template, -# end_separator, -# ) - -# if in_proc: -# return exec_in_proc(target=target, args=args) - -# return target(*args) # pyright: ignore - - -# @app.post('/query') -# @enabled_guard(app) -# def _(query: Query) -> LLMOutput: -# logger.debug('received query request', extra={ 'query': query.dict() }) - -# if app_config.llm[0] == 'nc_texttotext': -# return execute_query(query) - -# with llm_lock: -# return execute_query(query, in_proc=False) - - -# @app.post('/docSearch') -# @enabled_guard(app) -# def _(query: Query) -> list[SearchResult]: -# # useContext from Query 
is not used here -# return exec_in_proc(target=do_doc_search, args=( -# query.userId, -# query.query, -# vectordb_loader, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# )) From 85d29f1640eb2ff5daa89016ecbae8ee9d484d27 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 01:06:34 +0530 Subject: [PATCH 24/96] fix(ci): parse json output from the stats command Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 31 +++++++------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 58f9f50..589f885 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -224,7 +224,7 @@ jobs: echo "Checking stats, attempt $i..." stats_err=$(mktemp) - stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") stats_exit=$? echo "Stats output:" echo "$stats" @@ -243,41 +243,25 @@ jobs: fi # Extract Total eligible files - total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") - - # Validate parsed values - if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then - echo "Error: Could not parse stats output properly" - if echo "$stats" | grep -q "Indexed documents:"; then - echo " Indexed documents section found but could not extract count" - fi - sleep 10 - continue - fi + indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") echo "Total eligible files: $total_files" echo "Indexed documents (files__default): $indexed_count" - # Calculate absolute difference diff=$((total_files - indexed_count)) - if [ $diff -lt 0 ]; then - diff=$((-diff)) - fi - - # Calculate 
2% threshold using bc for floating point support - threshold=$(echo "scale=4; $total_files * 0.02" | bc) + threshold=$((total_files * 2 / 100)) # Check if difference is within tolerance - if (( $(echo "$diff <= $threshold" | bc -l) )); then + if [ $diff -le $threshold ]; then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" success=1 break else - pct=$(echo "scale=2; ($diff / $total_files) * 100" | bc) - echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + progress=$((diff * 100 / total_files)) + echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive @@ -293,6 +277,7 @@ jobs: echo "::endgroup::" ./occ context_chat:stats + ./occ context_chat:stats --json if [ $success -ne 1 ]; then echo "Max attempts reached" From 4c6d01b9e913de0a931345aeab7169b3029a5c9a Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:57:22 +0530 Subject: [PATCH 25/96] fix: seek to 0 to read the full buffer Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 18a37b4..0196f5d 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -50,6 +50,7 @@ async def __fetch_file_content( dav=False, params={ 'userId': user_id }, ) + fp.seek(0) return fp except niquests.exceptions.RequestException as e: if e.response is None: From 51774ff771944c5dffd46b3f33ed2c4a0d7f5bb6 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:59:46 +0530 Subject: [PATCH 26/96] fix(ci): 3% tolerance Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 589f885..73418e9 100644 --- 
a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -252,16 +252,16 @@ jobs: echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) - threshold=$((total_files * 2 / 100)) + threshold=$((total_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then - echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + echo "Indexing within 3% tolerance (diff=$diff, threshold=$threshold)" success=1 break else progress=$((diff * 100 / total_files)) - echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" + echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive From c81b6758600eae2f049deb7ec578ef5c7eeca41b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 04:38:36 +0530 Subject: [PATCH 27/96] fix(ci): wait longer for EM server Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 73418e9..5c50548 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,7 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la context_chat_backend/persistent_storage/* - sleep 30 # Wait for the em server to get ready + sleep 60 # Wait for the em server to get ready - name: Initial 
memory usage check run: | @@ -242,13 +242,13 @@ jobs: continue fi - # Extract Total eligible files - total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") + # Extract total queued files + total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") - # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract indexed documents count (files__default) + indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total eligible files: $total_files" + echo "Total queued files: $total_files" echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) From 6817f897e4ae14fdfeab0ad7b40a9a2de78cfe4b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 30 Mar 2026 15:57:44 +0530 Subject: [PATCH 28/96] fix: don't process files or requests until the EM server is healthy Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 1 - context_chat_backend/network_em.py | 14 +++++++++++--- context_chat_backend/task_fetcher.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 5c50548..8e6ca7d 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,6 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la 
context_chat_backend/persistent_storage/* - sleep 60 # Wait for the em server to get ready - name: Initial memory usage check run: | diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index d39ea56..43ced6c 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -8,7 +8,6 @@ import niquests from langchain_core.embeddings import Embeddings -from pydantic import BaseModel from .types import ( EmbeddingException, @@ -41,8 +40,17 @@ class CreateEmbeddingResponse(TypedDict): usage: EmbeddingUsage -class NetworkEmbeddings(Embeddings, BaseModel): - app_config: TConfig +class NetworkEmbeddings(Embeddings): + def __init__(self, app_config: TConfig): + self.app_config = app_config + + def check_connection(self) -> bool: + try: + self.embed_query('hello') + return True + except EmbeddingException as e: + logger.warning('Embedding server connection failed', exc_info=e) + return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: emconf = self.app_config.embedding diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 634b51c..92d2719 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -26,6 +26,7 @@ from .chain.query_proc import get_pruned_query from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader +from .network_em import NetworkEmbeddings from .types import ( ActionsQueueItems, ActionType, @@ -102,6 +103,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + nc = NextcloudApp() q_items_res = nc.ocs( 'GET', @@ -415,6 +420,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() while 
True: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping due to stop event being set') return @@ -822,3 +831,8 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def __check_em_server(app_config: TConfig) -> bool: + embedding_model = NetworkEmbeddings(app_config=app_config) + return embedding_model.check_connection() From 104a37a8a1b28878b98da5ce7b0eb520ebe73716 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 12:38:38 +0200 Subject: [PATCH 29/96] tests: Increase testing time to allow backend to injest more sources --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d..b937a14 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" - for i in {1..90}; do + echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" + for i in {1..180}; do echo "Checking stats, attempt $i..." 
stats_err=$(mktemp) From b3b461a2b3a88f2fd815be11c132a7174772aa3c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:17:13 +0200 Subject: [PATCH 30/96] fix: More log statements --- .../chain/ingest/doc_loader.py | 20 +++++++++-- context_chat_backend/chain/ingest/injest.py | 35 +++++++++++++++++++ context_chat_backend/task_fetcher.py | 29 +++++++++++++-- context_chat_backend/utils.py | 12 +++++++ context_chat_backend/vectordb/pgvector.py | 20 ++++++++++- 5 files changed, 110 insertions(+), 6 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 832c833..04c611d 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,6 +7,8 @@ import tempfile from collections.abc import Callable from io import BytesIO +import logging +from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -19,6 +21,8 @@ from ...types import IndexingException, SourceItem +logger = logging.getLogger('ccb.doc_loader') + def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -133,10 +137,22 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - if _loader_map.get(source.type): - result = _loader_map[source.type](io_obj) + loader_fn = _loader_map.get(source.type) + if loader_fn: + logger.debug( + 'Decoding source %r with loader %s (mime: %s) — may be slow or block', + source.title, loader_fn.__name__, source.type, + ) + t0 = perf_counter_ns() + result = loader_fn(io_obj) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Loader %s for %r finished in %.2f ms (%d chars)', + loader_fn.__name__, source.title, elapsed_ms, len(result), + ) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return 
io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0196f5d..7ede94a 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -7,6 +7,7 @@ import re from collections.abc import Mapping from io import BytesIO +from time import perf_counter_ns import niquests from langchain.schema import Document @@ -42,6 +43,8 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: + logger.debug('Downloading file id %d for user %s', file_id, user_id) + t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -51,6 +54,8 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ -89,6 +94,9 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) + logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) + for db_id, file in sources.items(): if isinstance(file, SourceItem): continue @@ -123,7 +131,11 @@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) + t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed_ms = (perf_counter_ns() - t0) 
/ 1e6 + logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -215,7 +227,14 @@ def _sources_to_indocuments( # transform the source to have text data try: + logger.debug( + 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', + source.reference, source.type, source.title, + ) + t0 = perf_counter_ns() content = decode_source(source) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Decoded source %s in %.2f ms (%d chars)', source.reference, elapsed_ms, len(content)) except IndexingException as e: logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) errored_docs[db_id] = IndexingError( @@ -333,7 +352,17 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + logger.debug( + 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + len(to_embed_sources), + ) + t0 = perf_counter_ns() populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'File content fetch complete in %.2f ms: %d fetched, %d errored', + elapsed_ms, len(populated_to_embed_sources), len(errored_sources), + ) source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] if len(populated_to_embed_sources) == 0: @@ -359,7 +388,13 @@ def _process_sources( 'source_ids': [indoc.source_id for indoc in indocuments.values()] }) + t0 = perf_counter_ns() doc_add_results = vectordb.add_indocuments(indocuments) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.info( + 'vectordb.add_indocuments completed in %.2f ms for %d document(s)', + elapsed_ms, len(indocuments), + ) source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Added 
documents to vectordb') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 92d2719..32673c8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -82,11 +82,22 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: return def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: + source_refs = [s.reference for s in source_items.values()] + LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) try: - return exec_in_proc( + result = exec_in_proc( target=embed_sources, args=(vectordb_loader, app_config, source_items), ) + errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} + LOGGER.info( + 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', + len(source_items), + len(result) - len(errors), + len(errors), + extra={'errors': errors} if errors else {}, + ) + return result except Exception as e: err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") source_ids = (s.reference for s in source_items.values()) @@ -94,6 +105,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', retryable=True, ) + LOGGER.error( + 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', + err_name, source_refs, exc_info=e, + ) return dict.fromkeys(source_items, err) @@ -146,13 +161,21 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> max_workers=PARALLEL_FILE_PARSING_COUNT, thread_name_prefix='IndexingPool', ) as executor: + LOGGER.info( + 'Dispatching %d file chunk(s) and %d provider chunk(s) to %d IndexingPool worker(s)', + len(file_chunks), len(provider_chunks), PARALLEL_FILE_PARSING_COUNT, + ) file_futures = 
[executor.submit(_load_sources, chunk) for chunk in file_chunks] provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] - for future in file_futures: + for i, future in enumerate(file_futures): + LOGGER.debug('Waiting for file chunk %d/%d future to complete', i + 1, len(file_futures)) files_result.update(future.result()) - for future in provider_futures: + LOGGER.debug('File chunk %d/%d future completed', i + 1, len(file_futures)) + for i, future in enumerate(provider_futures): + LOGGER.debug('Waiting for provider chunk %d/%d future to complete', i + 1, len(provider_futures)) providers_result.update(future.result()) + LOGGER.debug('Provider chunk %d/%d future completed', i + 1, len(provider_futures)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index c7e588b..d28fc58 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -90,8 +90,20 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem kwargs=kwargs, daemon=daemon, ) + target_name = getattr(target, '__name__', str(target)) + _logger.debug('Starting subprocess for %s', target_name) + start = perf_counter_ns() p.start() + _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() + elapsed_ms = (perf_counter_ns() - start) / 1e6 + _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + if p.exitcode != 0: + _logger.warning( + 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' + ' — possible OOM kill or unhandled signal', + p.pid, target_name, p.exitcode, elapsed_ms, + ) result = pconn.recv() if result['error'] is not None: diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 86f636b..33dfb03 100644 --- 
a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -6,6 +6,7 @@ import os from collections.abc import Mapping from datetime import datetime +from time import perf_counter_ns import psycopg import sqlalchemy as sa @@ -152,8 +153,25 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, # so we chunk the documents into (5 values * 10k) chunks # change the chunk size when there are more inserted values per document chunk_ids = [] - for i in range(0, len(indoc.documents), batch_size): + total_chunks = len(indoc.documents) + num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division + logger.debug( + 'Embedding source %s: %d chunk(s) in %d batch(es) — blocks on embedding model', + indoc.source_id, total_chunks, num_batches, + ) + for i in range(0, total_chunks, batch_size): + batch_num = i // batch_size + 1 + logger.debug( + 'Sending embedding batch %d/%d (%d chunk(s)) for source %s', + batch_num, num_batches, len(indoc.documents[i:i+batch_size]), indoc.source_id, + ) + t0 = perf_counter_ns() chunk_ids.extend(self.client.add_documents(indoc.documents[i:i+batch_size])) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Embedding batch %d/%d for source %s completed in %.2f ms', + batch_num, num_batches, indoc.source_id, elapsed_ms, + ) doc = DocumentsStore( source_id=indoc.source_id, From a4a88dae5f231732e448cefb9c0ea3e0da03aee5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:18:24 +0200 Subject: [PATCH 31/96] tests: Set wait time back to 90 --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index b937a14..8e6ca7d 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the 
backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" - for i in {1..180}; do + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" + for i in {1..90}; do echo "Checking stats, attempt $i..." stats_err=$(mktemp) From 0c52747375355e6e0338fd68599338f8bd644dc4 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:04:57 +0200 Subject: [PATCH 32/96] fix: Reduce worker count on github actions to prevent oom --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 32673c8..91d1991 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -59,6 +59,10 @@ MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? +if os.getenv('GITHUB_ACTIONS'): + # Keep CI memory usage predictable and avoid OOM-killed workers. + PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) +LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 TRIGGER = Event() From e676c329ca5a0c147ef0bfadbf5c372f4e25dd99 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:14:58 +0200 Subject: [PATCH 33/96] fix(exec_in_proc): Raise RuntimeError if exitcode is non-zero --- context_chat_backend/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index d28fc58..024e71c 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -104,6 +104,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) + raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') result = pconn.recv() if result['error'] is not None: From b027ff3234a50cf8eb5a1447bafbef8f147212b5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:46:42 +0200 Subject: [PATCH 34/96] fix(indexing): Reduce memory pressure on gh actions --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 91d1991..2a7e84f 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -56,7 +56,11 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +if os.getenv('GITHUB_ACTIONS'): + FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 +if os.getenv('GITHUB_ACTIONS'): + MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? 
if os.getenv('GITHUB_ACTIONS'): From 19b773fac97d3cf76fb581224df76d63e3c9a34d Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 15:19:06 +0200 Subject: [PATCH 35/96] fix(indexing): Fallback to batch_size=1 if embed_sources is killed and do not retry afterward if one these single item batches get killed --- context_chat_backend/task_fetcher.py | 51 +++++++++++++++++++++++++--- context_chat_backend/utils.py | 13 ++++++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 2a7e84f..edeabc1 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -39,7 +39,7 @@ SourceItem, TConfig, ) -from .utils import exec_in_proc, get_app_role +from .utils import SubprocessKilledError, exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, @@ -89,6 +89,29 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return + def _embed_one(db_id: int, item: SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: + """Run embed_sources for a single item in its own subprocess. 
Returns (db_id, error_or_None).""" + try: + result = exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, {db_id: item}), + ) + return db_id, result.get(db_id) + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess killed for individual source %s — marking as non-retryable' + ' to prevent infinite OOM retry loop', + item.reference, exc_info=e, + ) + return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + except Exception as e: + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') + LOGGER.error( + 'embed_sources raised a %s error for individual source %s, marking as retryable', + err_name, item.reference, exc_info=e, + ) + return db_id, IndexingError(error=str(e), retryable=True) + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -106,11 +129,31 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> extra={'errors': errors} if errors else {}, ) return result + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', + len(source_items), source_refs, exc_info=e, + ) + if len(source_items) == 1: + # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. + LOGGER.error( + 'Single-item subprocess killed for %s — marking as non-retryable', + source_refs, + ) + return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + for db_id in source_items} + + # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. 
+ LOGGER.warning( + 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + len(source_items), + ) + return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) + except Exception as e: - err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") - source_ids = (s.reference for s in source_items.values()) + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + error=f'{err_name} Error: {e}', retryable=True, ) LOGGER.error( diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 024e71c..4b9fad5 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -69,6 +69,17 @@ def JSONResponse( return FastAPIJSONResponse(content, status_code, **kwargs) +class SubprocessKilledError(RuntimeError): + """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + + def __init__(self, pid: int, target_name: str, exitcode: int): + super().__init__( + f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + ' — possible OOM kill or unhandled signal' + ) + self.exitcode = exitcode + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): try: if fun is None: @@ -104,7 +115,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') + raise SubprocessKilledError(p.pid, target_name, p.exitcode) result = pconn.recv() if result['error'] is not None: From bde0bc54e2dde254b37fe426418abbca295a27a0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:18:47 +0530 Subject: [PATCH 36/96] fix: 
log stdout and stderr from subprocesses Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 4b9fad5..068ffa8 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import io import logging import multiprocessing as mp import os +import sys import traceback from collections.abc import Callable from functools import partial, wraps @@ -80,7 +82,12 @@ def __init__(self, pid: int, target_name: str, exitcode: int): self.exitcode = exitcode -def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): +def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + stdout_capture = io.StringIO() + stderr_capture = io.StringIO() + sys.stdout = stdout_capture + sys.stderr = stderr_capture + try: if fun is None: return resconn.send({ 'value': None, 'error': None }) @@ -88,11 +95,15 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): except Exception as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + finally: + stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 pconn, cconn = mp.Pipe() + std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn + kwargs['stdconn'] = std_cconn p = mp.Process( group=group, target=partial(exception_wrap, target), @@ -108,20 +119,28 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() elapsed_ms = 
(perf_counter_ns() - start) / 1e6 - _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + _logger.debug( + 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', + p.pid, target_name, elapsed_ms, p.exitcode, + ) if p.exitcode != 0: _logger.warning( 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid, target_name, p.exitcode) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) result = pconn.recv() if result['error'] is not None: _logger.error('original traceback: %s', result['traceback']) raise result['error'] + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) return result['value'] From 4de591f79b29746c220cd0a268b9254a18fc424c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:57:16 +0530 Subject: [PATCH 37/96] fix: don't raise before std* is captured Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 068ffa8..3122a41 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -123,6 +123,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) + + result = pconn.recv() + if result['error'] is not None: + _logger.error('original traceback: %s', result['traceback']) + raise result['error'] + if p.exitcode != 0: _logger.warning( 'Subprocess 
PID %d for %s exited with non-zero exit code %d after %.2f ms' @@ -131,16 +142,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ) raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) - raise result['error'] - - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) return result['value'] From 4deda845f40dd3e3419253ec647d156a4c76e218 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 15:01:10 +0530 Subject: [PATCH 38/96] feat: log cpu count and memory info of the system Signed-off-by: Anupam Kumar --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index c4ffa1f..8d838d8 100755 --- a/main.py +++ b/main.py @@ -4,8 +4,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging -from os import getenv +from os import cpu_count, getenv +import psutil import uvicorn from nc_py_api.ex_app import run_app @@ -48,6 +49,7 @@ def _setup_log_levels(debug: bool): app_config: TConfig = app.extra['CONFIG'] _setup_log_levels(app_config.debug) + print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) uv_log_config = uvicorn.config.LOGGING_CONFIG # pyright: ignore[reportAttributeAccessIssue] From ad0eac70712600964f45e2401bed411945e148a7 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 17:41:39 +0530 Subject: [PATCH 39/96] fix: catch BaseException in subprocess Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 3122a41..02545d9 100644 --- a/context_chat_backend/utils.py +++ 
b/context_chat_backend/utils.py @@ -92,7 +92,7 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co if fun is None: return resconn.send({ 'value': None, 'error': None }) resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) - except Exception as e: + except BaseException as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) finally: From 36bcfb721364912bcca24c37bc30e357cebfe275 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Thu, 2 Apr 2026 14:19:49 +0200 Subject: [PATCH 40/96] fix(utils): Improve exec_in_proc to handle more failure modes --- context_chat_backend/utils.py | 170 +++++++++++++++++++++++++++++----- 1 file changed, 149 insertions(+), 21 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 02545d9..e994a3f 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -9,6 +9,7 @@ import sys import traceback from collections.abc import Callable +from contextlib import suppress from functools import partial, wraps from multiprocessing.connection import Connection from time import perf_counter_ns @@ -72,31 +73,95 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): - """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" def __init__(self, pid: int, target_name: str, exitcode: int): super().__init__( - f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' - ' — possible OOM kill or unhandled signal' + f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' + f'(raw exit code: {exitcode})' ) self.exitcode = exitcode +class SubprocessExecutionError(RuntimeError): + """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + + def __init__(self, pid: int, target_name: str, exitcode: int, 
details: str = ''): + msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + if details: + msg = f'{msg}: {details}' + super().__init__(msg) + self.exitcode = exitcode + + +_MAX_STD_CAPTURE_CHARS = 64 * 1024 + + +def _truncate_capture(text: str) -> tuple[str, bool]: + if len(text) <= _MAX_STD_CAPTURE_CHARS: + return text, False + + head = _MAX_STD_CAPTURE_CHARS // 2 + tail = _MAX_STD_CAPTURE_CHARS - head + omitted = len(text) - _MAX_STD_CAPTURE_CHARS + truncated = ( + f'[truncated {omitted} chars]\n' + f'{text[:head]}\n' + '[...snip...]\n' + f'{text[-tail:]}' + ) + return truncated, True + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): stdout_capture = io.StringIO() stderr_capture = io.StringIO() + orig_stdout = sys.stdout + orig_stderr = sys.stderr sys.stdout = stdout_capture sys.stderr = stderr_capture try: if fun is None: - return resconn.send({ 'value': None, 'error': None }) - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + resconn.send({ 'value': None, 'error': None }) + else: + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + payload = { + 'value': None, + 'error': e, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + } + try: + resconn.send(payload) + except Exception as send_err: + # Fallback for unpicklable exceptions. 
+ with suppress(Exception): + resconn.send({ + 'value': None, + 'error': None, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + 'send_error': str(send_err), + }) finally: - stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) + sys.stdout = orig_stdout + sys.stderr = orig_stderr + stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) + stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + with suppress(Exception): + stdconn.send({ + 'stdout': stdout_text, + 'stderr': stderr_text, + 'stdout_truncated': stdout_truncated, + 'stderr_truncated': stderr_truncated, + }) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 @@ -117,30 +182,93 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem start = perf_counter_ns() p.start() _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + + result = None + stdobj = { + 'stdout': '', + 'stderr': '', + 'stdout_truncated': False, + 'stderr_truncated': False, + } + got_result = False + got_std = False + + # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ while p.is_alive() and (not got_result or not got_std): + if not got_result and pconn.poll(0.1): + with suppress(EOFError, OSError, BrokenPipeError): + result = pconn.recv() + got_result = True + if not got_std and std_pconn.poll(): + with suppress(EOFError, OSError, BrokenPipeError): + stdobj = std_pconn.recv() + got_std = True + p.join() elapsed_ms = (perf_counter_ns() - start) / 1e6 _logger.debug( 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) - - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) + + if not got_std: + with suppress(EOFError, OSError, BrokenPipeError): + if std_pconn.poll(): + stdobj = std_pconn.recv() + got_std = True + if stdobj['stdout'] or stdobj['stderr']: + extra = { + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + } + if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): + extra['stdio_truncated'] = { + 'stdout': bool(stdobj.get('stdout_truncated')), + 'stderr': bool(stdobj.get('stderr_truncated')), + } + _logger.info('std info for %s', target_name, extra=extra) + + if not got_result: + with suppress(EOFError, OSError, BrokenPipeError): + if pconn.poll(): + result = pconn.recv() + got_result = True + + if result is not None and result.get('error') is not None: + _logger.error('original traceback: %s', result.get('traceback', '')) raise result['error'] - if p.exitcode != 0: + if result is not None and result.get('error_type'): + details = ( + f"{result.get('error_module', '')}.{result.get('error_type', '')}: " + f"{result.get('error_message', '')}" + ) + if result.get('traceback'): + _logger.error('remote traceback: %s', result['traceback']) + raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + + if 
p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' - ' — possible OOM kill or unhandled signal', - p.pid, target_name, p.exitcode, elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), elapsed_ms, + ) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + + if p.exitcode not in (None, 0): + raise SubprocessExecutionError( + p.pid or 0, + target_name, + p.exitcode, + 'No structured exception payload received from child process', + ) + + if result is None: + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) return result['value'] From 47eaf72daec83faec6d9a4a4ce9e23b231cfba31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:08:34 +0530 Subject: [PATCH 41/96] one more stab at a fix Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index e994a3f..b4e93c7 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import atexit +import faulthandler import io import logging import multiprocessing as mp @@ -114,6 +116,28 @@ def _truncate_capture(text: str) -> tuple[str, bool]: def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + # --- diagnostic probes: write directly to the real stderr FD so they survive + # Python's stdout/stderr redirection below and even os._exit() won't hide them + # from the parent process's stderr stream. 
+ _diag_fd = os.dup(2) # dup before we capture sys.stderr + + def _raw_diag(msg: str) -> None: + with suppress(Exception): + os.write(_diag_fd, (msg + '\n').encode()) + + # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + with suppress(Exception): + faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) + + # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) + # was called with Python's cleanup phase entirely skipped. + _fun_name = getattr(fun, '__name__', str(fun)) + atexit.register( + _raw_diag, + f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' + ': Python atexit reached (normal Python exit)', + ) + stdout_capture = io.StringIO() stderr_capture = io.StringIO() orig_stdout = sys.stdout @@ -124,10 +148,18 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co try: if fun is None: resconn.send({ 'value': None, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + result_value = fun(*args, **kwargs) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') + resconn.send({ 'value': result_value, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') except BaseException as e: tb = traceback.format_exc() + _raw_diag( + f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' + f': caught {type(e).__name__}: {e}' + ) payload = { 'value': None, 'error': e, @@ -162,6 +194,9 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co 'stdout_truncated': stdout_truncated, 'stderr_truncated': stderr_truncated, }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') + with suppress(Exception): + os.close(_diag_fd) def 
exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 From 309ab2bf19a54fb89c01f61550b07a9daf9d45d1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:43:38 +0530 Subject: [PATCH 42/96] do not throw away the valid result even with exitcode 1 Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index b4e93c7..fe4ee96 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -282,6 +282,23 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.error('remote traceback: %s', result['traceback']) raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + # If we received a valid result payload, return it even if the exit + # code is non-zero. The non-zero code typically comes from + # multiprocessing/C-extension cleanup (e.g. util._exit_function or + # a native atexit handler) that runs *after* exception_wrap has + # already sent the result over the pipe. + if result is not None and 'value' in result: + if p.exitcode not in (None, 0): + _logger.warning( + 'Subprocess PID %d for %s exited with code %s after %.2f ms' + ' but returned a valid result — accepting the result.' 
+ ' The non-zero exit likely originates from process' + ' cleanup (multiprocessing finalizers, C-extension' + ' atexit, etc.).', + p.pid, target_name, p.exitcode, elapsed_ms, + ) + return result['value'] + if p.exitcode and p.exitcode < 0: _logger.warning( 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', @@ -297,15 +314,12 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'No structured exception payload received from child process', ) - if result is None: - raise SubprocessExecutionError( - p.pid or 0, - target_name, - 0, - 'Subprocess exited successfully but returned no result payload', - ) - - return result['value'] + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', + ) def timed(func: Callable): From e1763acdcdfa590cee3c74f6ba1acadf1d9c6f9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 12:19:09 +0530 Subject: [PATCH 43/96] fix: use forkserver as process start method Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 4 ---- main.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 49d1d73..3a8e15a 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,7 +16,6 @@ # ruff: noqa: E402 import logging -import multiprocessing as mp import os import tempfile import threading @@ -122,9 +121,6 @@ async def lifespan(app: FastAPI): index_lock = threading.Lock() _indexing = {} -# limit the number of concurrent document parsing -doc_parse_semaphore = mp.Semaphore(app_config.doc_parser_worker_limit) - # middlewares diff --git a/main.py b/main.py index 8d838d8..4e88ee9 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + import logging from 
os import cpu_count, getenv @@ -44,6 +45,18 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': + import multiprocessing as mp + + # do forks from a clean process that doesn't have any threads or locks + mp.set_start_method('forkserver') + mp.set_forkserver_preload([ + 'langchain', + 'sqlalchemy', + 'numpy', + 'context_chat_backend.chain.ingest.injest', + 'context_chat_backend.vectordb.pgvector', + ]) + logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 330165205127524780038280854dacc19f552e9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 13:16:49 +0530 Subject: [PATCH 44/96] fix(ci): consider eligible files as the total files count Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d..8ec8eab 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -241,17 +241,17 @@ jobs: continue fi - # Extract total queued files - total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract total eligible files + total_eligible_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract indexed documents count (files__default) indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total queued files: $total_files" + echo "Total eligible files: $total_eligible_files" echo "Indexed documents (files__default): $indexed_count" - diff=$((total_files - indexed_count)) - threshold=$((total_files * 3 / 100)) + diff=$((total_eligible_files - indexed_count)) + threshold=$((total_eligible_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then @@ -259,7 +259,7 @@ jobs: success=1 break else - progress=$((diff * 100 / 
total_files)) + progress=$((diff * 100 / total_eligible_files)) echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi From 32aa37474547c3f3e7993cf638171ef309c1e1df Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:13:14 +0530 Subject: [PATCH 45/96] fix: use logging config in forkserver and other fixes Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 12 ++++++++---- main.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index fe4ee96..5f12d0c 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -86,10 +86,10 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): - """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + """Raised when a subprocess exits without a recoverable Python exception payload.""" def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): - msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' super().__init__(msg) @@ -199,7 +199,11 @@ def _raw_diag(msg: str) -> None: os.close(_diag_fd) -def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 +def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): + if not kwargs: + kwargs = {} + + # parent, child pconn, cconn = mp.Pipe() std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn @@ -318,7 +322,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem p.pid or 0, target_name, 0, - 'Subprocess exited successfully but returned no result payload', + f'Subprocess exited successfully but returned no result payload: {result}', ) 
diff --git a/main.py b/main.py index 4e88ee9..c261451 100755 --- a/main.py +++ b/main.py @@ -47,21 +47,22 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': import multiprocessing as mp + logging_config = get_logging_config(LOGGER_CONFIG_NAME) + setup_logging(logging_config) + app_config: TConfig = app.extra['CONFIG'] + _setup_log_levels(app_config.debug) + # do forks from a clean process that doesn't have any threads or locks mp.set_start_method('forkserver') mp.set_forkserver_preload([ - 'langchain', - 'sqlalchemy', - 'numpy', 'context_chat_backend.chain.ingest.injest', 'context_chat_backend.vectordb.pgvector', + 'langchain', + 'logging', + 'numpy', + 'sqlalchemy', ]) - logging_config = get_logging_config(LOGGER_CONFIG_NAME) - setup_logging(logging_config) - app_config: TConfig = app.extra['CONFIG'] - _setup_log_levels(app_config.debug) - print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) From 33ee38ab24d9567f2a0152b7d55870a28ca2bbe1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:23:40 +0530 Subject: [PATCH 46/96] fix: remove extra diagnostics Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 20 +-- context_chat_backend/chain/ingest/injest.py | 15 +- context_chat_backend/task_fetcher.py | 60 ++----- context_chat_backend/utils.py | 146 +++++------------- context_chat_backend/vectordb/pgvector.py | 2 +- 5 files changed, 62 insertions(+), 181 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 04c611d..832c833 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,8 +7,6 @@ import tempfile from collections.abc import Callable from io import BytesIO -import logging -from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -21,8 +19,6 @@ from 
...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') - def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -137,22 +133,10 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - loader_fn = _loader_map.get(source.type) - if loader_fn: - logger.debug( - 'Decoding source %r with loader %s (mime: %s) — may be slow or block', - source.title, loader_fn.__name__, source.type, - ) - t0 = perf_counter_ns() - result = loader_fn(io_obj) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug( - 'Loader %s for %r finished in %.2f ms (%d chars)', - loader_fn.__name__, source.title, elapsed_ms, len(result), - ) + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() - logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7ede94a..8e32108 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -43,8 +43,6 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: - logger.debug('Downloading file id %d for user %s', file_id, user_id) - t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -54,8 +52,6 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ -131,11 +127,7 
@@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) - t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -227,10 +219,7 @@ def _sources_to_indocuments( # transform the source to have text data try: - logger.debug( - 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', - source.reference, source.type, source.title, - ) + logger.debug('Decoding source %s (type: %s)', source.reference, source.type) t0 = perf_counter_ns() content = decode_source(source) elapsed_ms = (perf_counter_ns() - t0) / 1e6 @@ -353,7 +342,7 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) logger.debug( - 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + 'Fetching file contents for %d source(s) from Nextcloud', len(to_embed_sources), ) t0 = perf_counter_ns() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index edeabc1..c75cec0 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -31,7 +31,6 @@ ActionsQueueItems, ActionType, AppRole, - EmbeddingException, FilesQueueItems, IndexingError, LoaderException, @@ -89,29 +88,6 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _embed_one(db_id: int, item: 
SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: - """Run embed_sources for a single item in its own subprocess. Returns (db_id, error_or_None).""" - try: - result = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app_config, {db_id: item}), - ) - return db_id, result.get(db_id) - except SubprocessKilledError as e: - LOGGER.error( - 'embed_sources subprocess killed for individual source %s — marking as non-retryable' - ' to prevent infinite OOM retry loop', - item.reference, exc_info=e, - ) - return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') - LOGGER.error( - 'embed_sources raised a %s error for individual source %s, marking as retryable', - err_name, item.reference, exc_info=e, - ) - return db_id, IndexingError(error=str(e), retryable=True) - def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -122,43 +98,39 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> ) errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} LOGGER.info( - 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', - len(source_items), - len(result) - len(errors), - len(errors), - extra={'errors': errors} if errors else {}, + 'embed_sources finished for %d source(s): %d succeeded, %d errored', + len(source_items), len(result) - len(errors), len(errors), + extra={'errors': errors}, ) return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', - len(source_items), source_refs, exc_info=e, + 'embed_sources subprocess was killed for %d 
source(s) with exitcode %s: %s', + len(source_items), e.exitcode, source_refs, exc_info=e, ) if len(source_items) == 1: - # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. - LOGGER.error( - 'Single-item subprocess killed for %s — marking as non-retryable', - source_refs, + return dict.fromkeys( + source_items, + IndexingError(error=f'Subprocess killed with exitcode {e.exitcode}: {e}', retryable=False), ) - return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - for db_id in source_items} - # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. + # Fall back to one-by-one to isolate the problematic file. LOGGER.warning( - 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + 'Falling back to individual processing for %d sources', len(source_items), ) - return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) - + fallback: dict[int, IndexingError | None] = {} + for db_id, item in source_items.items(): + fallback.update(_load_sources({db_id: item})) + return fallback except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error: {e}', + error=f'{e.__class__.__name__}: {e}', retryable=True, ) LOGGER.error( 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - err_name, source_refs, exc_info=e, + e.__class__.__name__, source_refs, exc_info=e, ) return dict.fromkeys(source_items, err) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 5f12d0c..4552e32 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import atexit import faulthandler import io import logging @@ -23,6 +22,7 @@ T = 
TypeVar('T') _logger = logging.getLogger('ccb.utils') +_MAX_STD_CAPTURE_CHARS = 64 * 1024 def not_none(value: T | None) -> TypeGuard[T]: @@ -77,7 +77,7 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" - def __init__(self, pid: int, target_name: str, exitcode: int): + def __init__(self, pid: int | None, target_name: str, exitcode: int): super().__init__( f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' f'(raw exit code: {exitcode})' @@ -88,7 +88,7 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): """Raised when a subprocess exits without a recoverable Python exception payload.""" - def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): + def __init__(self, pid: int | None, target_name: str, exitcode: int, details: str = ''): msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' @@ -96,47 +96,29 @@ def __init__(self, pid: int, target_name: str, exitcode: int, details: str = '') self.exitcode = exitcode -_MAX_STD_CAPTURE_CHARS = 64 * 1024 - - -def _truncate_capture(text: str) -> tuple[str, bool]: +def _truncate_capture(text: str) -> str: if len(text) <= _MAX_STD_CAPTURE_CHARS: - return text, False + return text head = _MAX_STD_CAPTURE_CHARS // 2 tail = _MAX_STD_CAPTURE_CHARS - head omitted = len(text) - _MAX_STD_CAPTURE_CHARS - truncated = ( + return ( f'[truncated {omitted} chars]\n' f'{text[:head]}\n' '[...snip...]\n' f'{text[-tail:]}' ) - return truncated, True def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): - # --- diagnostic probes: write directly to the real stderr FD so they survive - # Python's stdout/stderr redirection below and even os._exit() won't hide them - # from the parent process's stderr stream. 
- _diag_fd = os.dup(2) # dup before we capture sys.stderr - - def _raw_diag(msg: str) -> None: - with suppress(Exception): - os.write(_diag_fd, (msg + '\n').encode()) - - # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + # Preserve real stderr FD for faulthandler before we redirect sys.stderr. + _faulthandler_fd = os.dup(2) with suppress(Exception): - faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) - - # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) - # was called with Python's cleanup phase entirely skipped. - _fun_name = getattr(fun, '__name__', str(fun)) - atexit.register( - _raw_diag, - f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' - ': Python atexit reached (normal Python exit)', - ) + faulthandler.enable( + file=os.fdopen(_faulthandler_fd, 'w', closefd=False), + all_threads=True, + ) stdout_capture = io.StringIO() stderr_capture = io.StringIO() @@ -148,55 +130,31 @@ def _raw_diag(msg: str) -> None: try: if fun is None: resconn.send({ 'value': None, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - result_value = fun(*args, **kwargs) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') - resconn.send({ 'value': result_value, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - _raw_diag( - f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' - f': caught {type(e).__name__}: {e}' - ) payload = { 'value': None, 'error': e, 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), } try: resconn.send(payload) except Exception as send_err: - # Fallback for 
unpicklable exceptions. - with suppress(Exception): - resconn.send({ - 'value': None, - 'error': None, - 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), - 'send_error': str(send_err), - }) + stderr_capture.write(f'Original error: {e}, pipe send error: {send_err}') finally: sys.stdout = orig_stdout sys.stderr = orig_stderr - stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) - stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + stdout_text = _truncate_capture(stdout_capture.getvalue()) + stderr_text = _truncate_capture(stderr_capture.getvalue()) with suppress(Exception): stdconn.send({ 'stdout': stdout_text, 'stderr': stderr_text, - 'stdout_truncated': stdout_truncated, - 'stderr_truncated': stderr_truncated, }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') with suppress(Exception): - os.close(_diag_fd) + os.close(_faulthandler_fd) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): @@ -217,22 +175,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da daemon=daemon, ) target_name = getattr(target, '__name__', str(target)) - _logger.debug('Starting subprocess for %s', target_name) start = perf_counter_ns() p.start() - _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + _logger.debug('Subprocess PID %d started for %s', p.pid, target_name) result = None - stdobj = { - 'stdout': '', - 'stderr': '', - 'stdout_truncated': False, - 'stderr_truncated': False, - } + stdobj = { 'stdout': '', 'stderr': '' } got_result = False got_std = False # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ # Pipe's buffer size is 64 KiB while p.is_alive() and (not got_result or not got_std): if not got_result and pconn.poll(0.1): with suppress(EOFError, OSError, BrokenPipeError): @@ -254,72 +207,55 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da with suppress(EOFError, OSError, BrokenPipeError): if std_pconn.poll(): stdobj = std_pconn.recv() - got_std = True - if stdobj['stdout'] or stdobj['stderr']: - extra = { - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - } - if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): - extra['stdio_truncated'] = { - 'stdout': bool(stdobj.get('stdout_truncated')), - 'stderr': bool(stdobj.get('stderr_truncated')), - } - _logger.info('std info for %s', target_name, extra=extra) + # no need to update got_std here + if stdobj.get('stdout') or stdobj.get('stderr'): + _logger.info('std info for %s', target_name, extra={ + 'stdout': stdobj.get('stdout', ''), + 'stderr': stdobj.get('stderr', ''), + }) if not got_result: with suppress(EOFError, OSError, BrokenPipeError): if pconn.poll(): result = pconn.recv() - got_result = True + # no need to update got_result here if result is not None and result.get('error') is not None: - _logger.error('original traceback: %s', result.get('traceback', '')) + _logger.error( + 'original traceback of %s (PID %d, exitcode: %s): %s', + target_name, + p.pid, + p.exitcode, + result.get('traceback', ''), + ) raise result['error'] - if result is not None and result.get('error_type'): - details = ( - f"{result.get('error_module', '')}.{result.get('error_type', '')}: " - f"{result.get('error_message', '')}" - ) - if result.get('traceback'): - _logger.error('remote traceback: %s', result['traceback']) - raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) - - # If we received a valid result payload, return it even if the exit - # code is non-zero. 
The non-zero code typically comes from - # multiprocessing/C-extension cleanup (e.g. util._exit_function or - # a native atexit handler) that runs *after* exception_wrap has - # already sent the result over the pipe. if result is not None and 'value' in result: if p.exitcode not in (None, 0): _logger.warning( 'Subprocess PID %d for %s exited with code %s after %.2f ms' - ' but returned a valid result — accepting the result.' - ' The non-zero exit likely originates from process' - ' cleanup (multiprocessing finalizers, C-extension' - ' atexit, etc.).', + ' but returned a valid result', p.pid, target_name, p.exitcode, elapsed_ms, ) return result['value'] if p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', - p.pid, target_name, abs(p.exitcode), elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d, exitcode %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + raise SubprocessKilledError(p.pid, target_name, p.exitcode) if p.exitcode not in (None, 0): raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, p.exitcode, - 'No structured exception payload received from child process', + f'No structured exception payload received from child process: {result}', ) raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, 0, f'Subprocess exited successfully but returned no result payload: {result}', diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 33dfb03..41d7f0d 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -156,7 +156,7 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, total_chunks = len(indoc.documents) num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division logger.debug( - 'Embedding source %s: %d chunk(s) in %d 
batch(es) — blocks on embedding model', + 'Embedding source %s: %d chunk(s) in %d batch(es)', indoc.source_id, total_chunks, num_batches, ) for i in range(0, total_chunks, batch_size): From d9ebdac85772930b556f02ea501d3c73160d567b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 17:54:44 +0530 Subject: [PATCH 47/96] fix: use zip on the subset of filtered sources Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 8e32108..190eebd 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -89,6 +89,7 @@ async def __fetch_files_content( error_items = {} semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + task_sources = {} file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) @@ -126,13 +127,18 @@ async def __fetch_files_content( continue # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + task_sources[db_id] = file results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(sources.items(), results, strict=True): - if isinstance(file, SourceItem): - continue - - if isinstance(result, IndexingException): + for (db_id, file), result in zip(task_sources.items(), results, strict=True): + if isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, IndexingException): logger.error( f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' f': 
{result}', @@ -142,13 +148,6 @@ async def __fetch_files_content( error=str(result), retryable=result.retryable, ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) elif isinstance(result, BaseException): logger.error( f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' From ea77480df7060a21cb556d7dfe13f8d5da21337f Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:41:30 +0530 Subject: [PATCH 48/96] fix(em): use tcp socket connection check Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 29 ++++++++++++++++++++++++---- context_chat_backend/task_fetcher.py | 17 +++++++--------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 43ced6c..ba1edc9 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -3,8 +3,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging +import socket from time import sleep from typing import Literal, TypedDict +from urllib.parse import urlparse import niquests from langchain_core.embeddings import Embeddings @@ -19,6 +21,7 @@ ) logger = logging.getLogger('ccb.nextwork_em') +TCP_CONNECT_TIMEOUT = 2.0 # seconds # Copied from llama_cpp/llama_types.py @@ -44,12 +47,30 @@ class NetworkEmbeddings(Embeddings): def __init__(self, app_config: TConfig): self.app_config = app_config - def check_connection(self) -> bool: + def _get_host_and_port(self) -> tuple[str, int]: + parsed = urlparse(self.app_config.embedding.base_url) + host = parsed.hostname + + if not host: + raise ValueError("Invalid URL: Missing hostname") + + if parsed.port: + port = parsed.port + else: + port = 443 if parsed.scheme == "https" else 80 + + return host, port + + def check_connection(self, check_origin: str) -> bool: try: - self.embed_query('hello') + host, port = 
self._get_host_and_port() + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(TCP_CONNECT_TIMEOUT) + sock.connect((host, port)) + sock.close() return True - except EmbeddingException as e: - logger.warning('Embedding server connection failed', exc_info=e) + except (ValueError, TimeoutError, ConnectionRefusedError, socket.gaierror) as e: + logger.warning(f'[{check_origin}] Embedding server is not reachable, retrying after some time: {e}') return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c75cec0..c931e7d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -83,6 +83,7 @@ class ThreadType(Enum): def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) @@ -141,7 +142,7 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: - if not __check_em_server(app_config): + if not network_em.check_connection(ThreadType.FILES_INDEXING.value): sleep(POLLING_COOLDOWN) continue @@ -456,6 +457,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.info('Starting task fetcher loop') try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) llm_loader = LLMModelLoader(app_config) except LoaderException as e: @@ -466,14 +468,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() while True: - if not __check_em_server(app_config): - sleep(POLLING_COOLDOWN) - continue - if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping 
due to stop event being set') return + if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): + sleep(POLLING_COOLDOWN) + continue + try: # Fetch pending task try: @@ -877,8 +879,3 @@ def process_search_task( task_input.get('scopeList'), ) ) - - -def __check_em_server(app_config: TConfig) -> bool: - embedding_model = NetworkEmbeddings(app_config=app_config) - return embedding_model.check_connection() From 1ce237a36addb872e3affc790faeae5583e80b28 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:42:59 +0530 Subject: [PATCH 49/96] fix(ci): remove github CI restrictions Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c931e7d..004104f 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -55,16 +55,9 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? -if os.getenv('GITHUB_ACTIONS'): - FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 -if os.getenv('GITHUB_ACTIONS'): - MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? -if os.getenv('GITHUB_ACTIONS'): - # Keep CI memory usage predictable and avoid OOM-killed workers. - PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 From d82e01b6555e4a362ba58fda1414cba83dc00023 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:54:20 +0530 Subject: [PATCH 50/96] fix: remove unused code and some de-duplication Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 286 +++++++-------------------- 1 file changed, 75 insertions(+), 211 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 004104f..1e45646 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -15,16 +15,14 @@ import niquests from langchain.llms.base import LLM -from langchain.schema import Document from nc_py_api import NextcloudApp, NextcloudException from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import do_doc_search, get_context_chunks, get_context_docs +from .chain.context import do_doc_search from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query -from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader from .network_em import NetworkEmbeddings from .types import ( @@ -39,7 +37,6 @@ TConfig, ) from .utils import SubprocessKilledError, exec_in_proc, get_app_role -from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -498,11 +495,16 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: if task['type'] == 'context_chat:context_chat': result: LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) # Return result to Nextcloud - success = return_normal_result_to_nextcloud(task['id'], userId, result) + success = 
return_result_to_nextcloud(task['id'], userId, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) elif task['type'] == 'context_chat:context_chat_search': search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, search_result) + success = return_result_to_nextcloud(task['id'], userId, { + 'sources': enrich_sources(search_result, userId), + }) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -541,200 +543,6 @@ def wait_for_tasks(interval = None): TRIGGER.clear() - -def start_bg_threads(app_config: TConfig, app_enabled: Event): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() - - -def wait_for_bg_threads(): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if 
(ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) - - -def query_vector_database( - user_id: str, - query: str, - vectordb: BaseVectorDB, - ctx_limit: int, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, -) -> list[Document]: - """ - Query the vector database to retrieve relevant documents. - - Args: - user_id: User ID for scoping the search - query: The search query text - vectordb: Vector database instance - ctx_limit: Maximum number of documents to return - scope_type: Optional scope type (PROVIDER or SOURCE) - scope_list: Optional list of scope identifiers - - Returns: - List of relevant Document objects - - Raises: - ContextException: If scope type is provided without scope list - """ - context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - LOGGER.debug('Retrieved context documents', extra={ - 'user_id': user_id, - 'num_docs': len(context_docs), - 'ctx_limit': ctx_limit, - }) - return context_docs - - -def prepare_context_chunks(context_docs: list[Document]) -> list[str]: - """ - Extract and format text chunks from documents for LLM context. 
- - Args: - context_docs: List of Document objects from vector DB - - Returns: - List of formatted text chunks including titles and content - """ - return get_context_chunks(context_docs) - - -def generate_llm_response( - llm: LLM, - app_config: TConfig, - user_id: str, - query: str, - template: str, - context_chunks: list[str], - end_separator: str = '', -) -> str: - """ - Generate LLM response using the pruned query and context. - - Args: - llm: Language model instance - app_config: Application configuration - user_id: User ID for the request - query: The original query text - template: Template for formatting the prompt - context_chunks: Context chunks to include in the prompt - end_separator: Optional separator to stop generation - - Returns: - Generated LLM output text - - Raises: - ValueError: If context length is too small to fit the query - """ - pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) - - stop = [end_separator] if end_separator else None - output = llm.invoke( - pruned_query_text, - stop=stop, - userid=user_id, - ).strip() - - LOGGER.debug('Generated LLM response', extra={ - 'user_id': user_id, - 'output_length': len(output), - }) - return output - - -def extract_unique_sources(context_docs: list[Document]) -> list[str]: - """ - Extract unique source IDs from context documents. - - Args: - context_docs: List of Document objects - - Returns: - List of unique source IDs - """ - unique_sources: list[str] = list({ - source for d in context_docs if (source := d.metadata.get('source')) - }) - return unique_sources - -def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. 
- - Args: - task_id: Unique task identifier - result: The LLMOutput result to return - - Returns: - True if successful, False otherwise - """ - LOGGER.debug('Returning result to Nextcloud', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - - nc = NextcloudApp() - - try: - nc.providers.task_processing.report_result(task_id, { - 'output': result['output'], - 'sources': enrich_sources(result['sources'], userId), - }) - except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting task result {e}", exc_info=e) - return False - - return True - - def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) @@ -742,34 +550,32 @@ def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: return [s.model_dump_json() for s in sources] -def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: +def return_result_to_nextcloud(task_id: int, userId: str, result: dict[str, Any]) -> bool: """ - Return search result back to Nextcloud. + Return query result back to Nextcloud. 
Args: - task_id: Unique task identifier - result: The list of search results to return + result: dict[str, Any] Returns: True if successful, False otherwise """ - LOGGER.debug('Returning search result to Nextcloud', extra={ + LOGGER.debug('Returning result to Nextcloud', extra={ 'task_id': task_id, - 'num_sources': len(result), + 'result': result, }) nc = NextcloudApp() try: - nc.providers.task_processing.report_result(task_id, { - 'sources': enrich_sources(result, userId), - }) + nc.providers.task_processing.report_result(task_id, result) except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) return False return True + def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: """ Return error result back to Nextcloud. @@ -827,6 +633,7 @@ def process_normal_task( if task_input.get('scopeType') == 'none': task_input['scopeType'] = None + # todo: document no template support return exec_in_proc(target=process_context_query, args=( user_id, @@ -872,3 +679,60 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def start_bg_threads(app_config: TConfig, app_enabled: Event): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == 
AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() + + +def wait_for_bg_threads(): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + THREADS.pop(ThreadType.REQUEST_PROCESSING) From 286db22e8cb664f600ddfa3b759ce8e83963ff2b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 19:32:28 +0530 Subject: [PATCH 51/96] fix(mp): run repairs and config file check only in MainProcess Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 8 ++++++-- main.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3a8e15a..9c3812e 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,6 +16,7 @@ # ruff: noqa: E402 import logging +import multiprocessing as mp import os import tempfile import threading @@ -39,8 +40,11 @@ # setup -repair_run() -ensure_config_file() +# only run once +if mp.current_process().name == 'MainProcess': + repair_run() + ensure_config_file() + logger = logging.getLogger('ccb.controller') app_config = get_config(os.environ['CC_CONFIG_PATH']) 
__download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') diff --git a/main.py b/main.py index c261451..076b7db 100755 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ # import logging +import multiprocessing as mp from os import cpu_count, getenv import psutil @@ -45,8 +46,6 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': - import multiprocessing as mp - logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 726eb64f5624eb9a2262aa6c6b17641e04b33973 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:07 +0530 Subject: [PATCH 52/96] fix: attach source_ids as keys in json logs Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 1e45646..be74b31 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -81,7 +81,9 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] - LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) + LOGGER.info('Starting embed_sources subprocess for %d source(s)', len(source_items), extra={ + 'source_ids': source_refs, + }) try: result = exec_in_proc( target=embed_sources, @@ -96,8 +98,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed for %d source(s) with exitcode %s: %s', - len(source_items), e.exitcode, source_refs, exc_info=e, + 'embed_sources subprocess was killed for %d source(s) 
with exitcode %s', + len(source_items), e.exitcode, exc_info=e, extra={ + 'source_ids': source_refs, + }, ) if len(source_items) == 1: return dict.fromkeys( @@ -120,8 +124,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> retryable=True, ) LOGGER.error( - 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - e.__class__.__name__, source_refs, exc_info=e, + 'embed_sources subprocess raised a %s error for %d sources, marking all as retryable', + e.__class__.__name__, len(source_refs), exc_info=e, extra={ + 'source_ids': source_refs, + } ) return dict.fromkeys(source_items, err) From 073f9d0e4a2f7fd52c1ef0df3410ea390c70c683 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:26 +0530 Subject: [PATCH 53/96] fix(ci): upload db dump artifacts Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8ec8eab..9c66483 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -89,7 +89,7 @@ jobs: POSTGRES_USER: root POSTGRES_PASSWORD: rootpassword POSTGRES_DB: nextcloud - options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 + options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 --name postgres --hostname postgres steps: - name: Checkout server @@ -214,6 +214,13 @@ jobs: php cron.php sleep 10 done & + sleep 30 + # list all the bg jobs + ./occ background-job:list + + - name: Initial dump of DB with context_chat_queue populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | @@ -315,6 +322,10 @@ jobs: echo "Memory usage during scan is stable. 
No memory leak detected." fi + - name: Final dump of DB with vectordb populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud + - name: Show server logs if: always() run: | @@ -350,6 +361,14 @@ jobs: run: | tail -v -n +1 context_chat_backend/persistent_storage/logs/em_server.log* || echo "No logs in logs directory" + - name: Upload database dumps + uses: actions/upload-artifact@v4 + with: + name: database-dumps + path: | + /tmp/0_pgdump_nextcloud + /tmp/1_pgdump_nextcloud + summary: permissions: contents: none From 13ea740d94841069b1c72398440dab9a2a30cd31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 18:01:47 +0530 Subject: [PATCH 54/96] fix: retry PGVector object creation if table already exists Signed-off-by: Anupam Kumar --- context_chat_backend/vectordb/pgvector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 41d7f0d..d7b718d 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -120,7 +120,15 @@ def __init__(self, embedding: Embeddings | None = None, **kwargs): kwargs['connection'] = os.environ['CCB_DB_URL'] # setup langchain db + our access list table - self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + try: + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + except sa.exc.IntegrityError as ie: # pyright: ignore[reportAttributeAccessIssue] + if not isinstance(ie.orig, psycopg.errors.UniqueViolation): + raise + + # tried to create the tables but it was already created in another process + # init the client again to detect it already exists, and continue from there + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) def get_instance(self) -> VectorStore: return self.client From dcb04e7209558ea9185f902637474e301d70f1b9 Mon Sep 17 00:00:00 2001 From: Anupam 
Kumar Date: Tue, 7 Apr 2026 20:11:24 +0530 Subject: [PATCH 55/96] fix: unique db dump artifact id Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9c66483..384e352 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -364,7 +364,7 @@ jobs: - name: Upload database dumps uses: actions/upload-artifact@v4 with: - name: database-dumps + name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} path: | /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud From dc1d57b15161ff13ffa56208bc4a21bb4e13b10b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 20:12:51 +0530 Subject: [PATCH 56/96] fix(ci): log stats before exit Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 384e352..d30073a 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -282,9 +282,6 @@ jobs: echo "::endgroup::" - ./occ context_chat:stats - ./occ context_chat:stats --json - if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 @@ -369,6 +366,11 @@ jobs: /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud + - name: Final stats log + run: | + ./occ context_chat:stats + ./occ context_chat:stats --json + summary: permissions: contents: none From eae1cd4e7c4958fcb7046a9638a6a1c5f6c7df91 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 9 Apr 2026 16:44:24 +0530 Subject: [PATCH 57/96] fix: mark unembeddable files as such Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 20 +++++++++++++++++--- context_chat_backend/task_fetcher.py | 4 ++++ context_chat_backend/types.py | 10 +++++++++- 
context_chat_backend/vectordb/pgvector.py | 16 +++++++++++++++- 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index ba1edc9..8b85169 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -12,6 +12,7 @@ from langchain_core.embeddings import Embeddings from .types import ( + DocErrorEmbeddingException, EmbeddingException, FatalEmbeddingException, RetryableEmbeddingException, @@ -105,14 +106,27 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] if response.status_code is None: raise EmbeddingException('Error: no response from embedding service') if response.status_code // 100 == 4: - raise FatalEmbeddingException(response.text) + raise FatalEmbeddingException( + response.text or f'Error: embedding request returned non-2xx status code {response.status_code}', + ) if response.status_code // 100 != 2: - raise EmbeddingException(response.text) - # todo: rework exception handling and their downstream interpretation + raise EmbeddingException( + response.text or f'Error: embedding request returned non-2xx status code {response.status_code}', + response, + ) except FatalEmbeddingException as e: logger.error('Fatal error while getting embeddings: %s', str(e), exc_info=e) raise e except EmbeddingException as e: + try: + if e.response: + err_msg = e.response.json().get('error', {}).get('message', '') + if err_msg == 'llama_decode returned -1': + # the document could not be processed + raise DocErrorEmbeddingException(f'Failed to embed the document: {err_msg}') from e + except niquests.exceptions.JSONDecodeError: + ... 
+ if try_ > 0: logger.debug('Retrying embedding request in 5 secs', extra={'try': try_}) sleep(5) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index be74b31..09be98a 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -29,6 +29,7 @@ ActionsQueueItems, ActionType, AppRole, + EmbeddingException, FilesQueueItems, IndexingError, LoaderException, @@ -520,6 +521,9 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: else: LOGGER.error(f'Failed to return result for task {task["id"]}') + except EmbeddingException as e: + LOGGER.warning(f'Embedding server error for task {task["id"]}: {e}') + return_error_to_nextcloud(task['id'], e) except ContextException as e: LOGGER.warning(f'Context error for task {task["id"]}: {e}') return_error_to_nextcloud(task['id'], e) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 59d2568..410dc3f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -8,6 +8,7 @@ from io import BytesIO from typing import Annotated, Literal, Self +import niquests from pydantic import AfterValidator, BaseModel, Discriminator, computed_field, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES @@ -123,7 +124,9 @@ class LoaderException(Exception): class EmbeddingException(Exception): - ... + def __init__(self, msg: str, response: niquests.Response | None = None): + super().__init__(msg) + self.response = response class RetryableEmbeddingException(EmbeddingException): """ @@ -140,6 +143,11 @@ class FatalEmbeddingException(EmbeddingException): Either malformed request, authentication error, or other non-retryable error. """ +class DocErrorEmbeddingException(EmbeddingException): + """ + Exception that indicates a fatal error for the document, this document should not be retried. 
+ """ + class AppRole(str, Enum): NORMAL = 'normal' diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index d7b718d..9d88024 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -20,6 +20,7 @@ from ..chain.types import InDocument, ScopeType from ..types import ( + DocErrorEmbeddingException, EmbeddingException, FatalEmbeddingException, IndexingError, @@ -215,13 +216,24 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, retryable=True, ) continue + except DocErrorEmbeddingException as e: + logger.warning( + 'Error adding documents to vectordb, server failed to index it, it will not be retried', + exc_info=e, + extra={ 'source_id': indoc.source_id }, + ) + results[php_db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue except FatalEmbeddingException as e: raise EmbeddingException( f'Fatal error while embedding documents for source {indoc.source_id}: {e}' ) from e except (RetryableEmbeddingException, EmbeddingException) as e: # temporary error, continue with the next document - logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ + logger.warning('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, }) results[php_db_id] = IndexingError( @@ -615,6 +627,8 @@ def doc_search( # get embeddings return self._similarity_search(session, query, chunk_ids, k) + except EmbeddingException: + raise except Exception as e: raise DbException('Error: performing doc search in vectordb') from e From 7b10b27afe5a3e6bfebab3c16895f4fe64808a4d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 9 Apr 2026 16:51:34 +0530 Subject: [PATCH 58/96] chore: migrate default values in the type definition Signed-off-by: Anupam Kumar --- context_chat_backend/config_parser.py | 22 ++++++++-------------- context_chat_backend/types.py | 17 
+++++++++-------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/context_chat_backend/config_parser.py b/context_chat_backend/config_parser.py index dafef75..0a62019 100644 --- a/context_chat_backend/config_parser.py +++ b/context_chat_backend/config_parser.py @@ -103,17 +103,11 @@ def get_config(file_path: str) -> TConfig: except Exception as e: raise AssertionError('Error: could not create embedding config from config file') from e - return TConfig( - debug=config.get('debug', False), - uvicorn_log_level=config.get('uvicorn_log_level', 'info'), - disable_aaa=config.get('disable_aaa', False), - verify_ssl=config.get('verify_ssl', config.get('httpx_verify_ssl', True)), - use_colors=config.get('use_colors', True), - uvicorn_workers=config.get('uvicorn_workers', 1), - embedding_chunk_size=config.get('embedding_chunk_size', 1000), - doc_parser_worker_limit=config.get('doc_parser_worker_limit', 10), - - vectordb=vectordb, - embedding=embedding_config, - llm=llm, - ) + config['verify_ssl'] = config.get('verify_ssl', config.get('httpx_verify_ssl', True)) + config.pop('httpx_verify_ssl', None) + + config['llm'] = llm + config['vectordb'] = vectordb + config['embedding'] = embedding_config + + return TConfig(**config) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 410dc3f..345eb6e 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -105,14 +105,15 @@ class TEmbeddingConfig(BaseModel): class TConfig(BaseModel): - debug: bool - uvicorn_log_level: str - disable_aaa: bool - verify_ssl: bool - use_colors: bool - uvicorn_workers: int - embedding_chunk_size: int - doc_parser_worker_limit: int + debug: bool = False + uvicorn_log_level: str = 'info' + disable_aaa: bool = False + verify_ssl: bool = True + use_colors: bool = True + uvicorn_workers: int = 1 + embedding_chunk_size: int = 2000 + # todo: unused now + doc_parser_worker_limit: int = 10 vectordb: tuple[str, dict] embedding: TEmbeddingConfig 
From 8b4d26046e87317392e756963529f3d16758bd34 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 9 Apr 2026 18:33:32 +0530 Subject: [PATCH 59/96] chore(config): add config entries for tunables Signed-off-by: Anupam Kumar --- config.cpu.yaml | 5 +- config.gpu.yaml | 5 +- context_chat_backend/chain/ingest/injest.py | 15 ++--- context_chat_backend/task_fetcher.py | 66 +++++++++++---------- context_chat_backend/types.py | 6 +- 5 files changed, 55 insertions(+), 42 deletions(-) diff --git a/config.cpu.yaml b/config.cpu.yaml index 1512ea0..304cb7d 100644 --- a/config.cpu.yaml +++ b/config.cpu.yaml @@ -7,7 +7,10 @@ verify_ssl: true use_colors: true uvicorn_workers: 1 embedding_chunk_size: 2000 -doc_parser_worker_limit: 10 +doc_indexing_batch_size: 32 # theoretical max RAM usage: 32 * 100 MiB +actions_batch_size: 512 +file_parsing_cpu_count: -1 # divides the batch into these many chunks, -1 = auto +concurrent_file_fetches: 10 # maximum number of files to fetch concurrently to not overload the NC server vectordb: diff --git a/config.gpu.yaml b/config.gpu.yaml index fc3acaf..16dcb01 100644 --- a/config.gpu.yaml +++ b/config.gpu.yaml @@ -7,7 +7,10 @@ verify_ssl: true use_colors: true uvicorn_workers: 1 embedding_chunk_size: 2000 -doc_parser_worker_limit: 10 +doc_indexing_batch_size: 32 # theoretical max RAM usage: 32 * 100 MiB +actions_batch_size: 512 +file_parsing_cpu_count: -1 # divides the batch into these many chunks, -1 = auto +concurrent_file_fetches: 10 # maximum number of files to fetch concurrently to not overload the NC server vectordb: diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 190eebd..ad2777e 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -23,9 +23,7 @@ logger = logging.getLogger('ccb.injest') -# max concurrent fetches to avoid overloading the NC server or hitting rate limits -CONCURRENT_FILE_FETCHES = 10 # todo: config? 
-MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, all loaded in RAM at once, todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, all loaded in RAM at once async def __fetch_file_content( @@ -83,16 +81,17 @@ async def __fetch_file_content( async def __fetch_files_content( - sources: Mapping[int, SourceItem | ReceivedFileItem] + sources: Mapping[int, SourceItem | ReceivedFileItem], + concurrent_file_fetches: int, ) -> tuple[Mapping[int, SourceItem], Mapping[int, IndexingError]]: source_items = {} error_items = {} - semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] task_sources = {} + semaphore = asyncio.Semaphore(concurrent_file_fetches) file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) - logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) + logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, concurrent_file_fetches) for db_id, file in sources.items(): if isinstance(file, SourceItem): @@ -345,7 +344,9 @@ def _process_sources( len(to_embed_sources), ) t0 = perf_counter_ns() - populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + populated_to_embed_sources, errored_sources = asyncio.run( + __fetch_files_content(to_embed_sources, config.concurrent_file_fetches) + ) elapsed_ms = (perf_counter_ns() - t0) / 1e6 logger.debug( 'File content fetch complete in %.2f ms: %d fetched, %d errored', diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 09be98a..38f0df8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -52,17 +52,14 @@ THREADS = {} THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') -FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? 
MIN_FILES_PER_CPU = 4 -# divides the batch into these many chunks -PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? -LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') -ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 -TRIGGER = Event() -CHECK_INTERVAL = 5 -CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 -CHECK_INTERVAL_ON_ERROR = 15 + +# task processing or request processing +TP_TRIGGER = Event() +TP_CHECK_INTERVAL = 5 +TP_CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 +TP_CHECK_INTERVAL_ON_ERROR = 15 CONTEXT_LIMIT=20 @@ -133,6 +130,13 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return dict.fromkeys(source_items, err) + # divides the batch into these many chunks + file_parsing_cpu_count = ( + app_config.file_parsing_cpu_count, # when set to a positive value + max(1, (os.cpu_count() or 2) - 1), # when set to auto (-1) + )[app_config.file_parsing_cpu_count == -1] + LOGGER.info(f'Using {file_parsing_cpu_count} parallel file parsing workers') + while True: if THREAD_STOP_EVENT.is_set(): LOGGER.info('Files indexing thread is stopping due to stop event being set') @@ -147,7 +151,7 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/documents', - params={ 'n': FILES_INDEXING_BATCH_SIZE } + params={ 'n': app_config.doc_indexing_batch_size } ) try: @@ -164,14 +168,14 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> providers_result = {} # chunk file parsing for better file operation parallelism - file_chunk_size = max(MIN_FILES_PER_CPU, math.ceil(len(q_items.files) / PARALLEL_FILE_PARSING_COUNT)) + file_chunk_size = max(MIN_FILES_PER_CPU, math.ceil(len(q_items.files) / file_parsing_cpu_count)) file_chunks = [ dict(list(q_items.files.items())[i:i+file_chunk_size]) for i in range(0, len(q_items.files), file_chunk_size) ] provider_chunk_size = max( 
MIN_FILES_PER_CPU, - math.ceil(len(q_items.content_providers) / PARALLEL_FILE_PARSING_COUNT), + math.ceil(len(q_items.content_providers) / file_parsing_cpu_count), ) provider_chunks = [ dict(list(q_items.content_providers.items())[i:i+provider_chunk_size]) @@ -179,12 +183,12 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> ] with ThreadPoolExecutor( - max_workers=PARALLEL_FILE_PARSING_COUNT, + max_workers=file_parsing_cpu_count, thread_name_prefix='IndexingPool', ) as executor: LOGGER.info( - 'Dispatching %d file chunk(s) and %d provider chunk(s) to %d IndexingPool worker(s)', - len(file_chunks), len(provider_chunks), PARALLEL_FILE_PARSING_COUNT, + 'Dispatching %d file chunk(s) and %d provider chunk(s)', + len(file_chunks), len(provider_chunks), ) file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] @@ -286,7 +290,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/actions', - params={ 'n': ACTIONS_BATCH_SIZE } + params={ 'n': app_config.actions_batch_size } ) try: @@ -451,7 +455,7 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - LOGGER.info('Starting task fetcher loop') + LOGGER.info('Starting request processing thread') try: network_em = NetworkEmbeddings(app_config) @@ -466,7 +470,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: while True: if THREAD_STOP_EVENT.is_set(): - LOGGER.info('Updates processing thread is stopping due to stop event being set') + LOGGER.info('Request processing thread is stopping due to stop event being set') return if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): @@ -485,7 +489,7 @@ def request_processing_thread(app_config: 
TConfig, app_enabled: Event) -> None: continue except (NextcloudException, RequestException, JSONDecodeError) as e: LOGGER.error(f"Network error fetching the next task {e}", exc_info=e) - wait_for_tasks(CHECK_INTERVAL_ON_ERROR) + wait_for_tasks(TP_CHECK_INTERVAL_ON_ERROR) continue # Process task @@ -536,21 +540,21 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: except Exception as e: LOGGER.exception('Error in task fetcher loop', exc_info=e) - wait_for_tasks(CHECK_INTERVAL_ON_ERROR) + wait_for_tasks(TP_CHECK_INTERVAL_ON_ERROR) -def trigger_handler(providerId: str): - global TRIGGER - print('TRIGGER called') - TRIGGER.set() +def trigger_handler(provider_id: str): + global TP_TRIGGER + LOGGER.debug('Task processing trigger received', extra={'provider_id': provider_id}) + TP_TRIGGER.set() def wait_for_tasks(interval = None): - global TRIGGER - global CHECK_INTERVAL - global CHECK_INTERVAL_WITH_TRIGGER - actual_interval = CHECK_INTERVAL if interval is None else interval - if TRIGGER.wait(timeout=actual_interval): - CHECK_INTERVAL = CHECK_INTERVAL_WITH_TRIGGER - TRIGGER.clear() + global TP_TRIGGER + global TP_CHECK_INTERVAL + global TP_CHECK_INTERVAL_WITH_TRIGGER + actual_interval = TP_CHECK_INTERVAL if interval is None else interval + if TP_TRIGGER.wait(timeout=actual_interval): + TP_CHECK_INTERVAL = TP_CHECK_INTERVAL_WITH_TRIGGER + TP_TRIGGER.clear() def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 345eb6e..2694998 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -112,8 +112,10 @@ class TConfig(BaseModel): use_colors: bool = True uvicorn_workers: int = 1 embedding_chunk_size: int = 2000 - # todo: unused now - doc_parser_worker_limit: int = 10 + doc_indexing_batch_size: int = 32 + actions_batch_size: int = 512 + file_parsing_cpu_count: int = -1 + concurrent_file_fetches: int = 10 vectordb: 
tuple[str, dict] embedding: TEmbeddingConfig From e4be682f4261a5428517cf9a8f20ee2432f745b2 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 9 Apr 2026 18:35:11 +0530 Subject: [PATCH 60/96] fix: ignore SIGTERM and SIGINT for subprocesses Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 4552e32..c793978 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -7,6 +7,7 @@ import logging import multiprocessing as mp import os +import signal import sys import traceback from collections.abc import Callable @@ -112,6 +113,12 @@ def _truncate_capture(text: str) -> str: def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + # ignore SIGINT and SIGTERM in child processes these signals don't immediately stop these processes + # the handling is done in the fastapi lifetime to do a graceful shutdown + # SIGKILL is not ignored + signal.signal(signal.SIGINT, signal.SIG_IGN) + signal.signal(signal.SIGTERM, signal.SIG_IGN) + # Preserve real stderr FD for faulthandler before we redirect sys.stderr. _faulthandler_fd = os.dup(2) with suppress(Exception): @@ -128,10 +135,11 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co sys.stderr = stderr_capture try: - if fun is None: - resconn.send({ 'value': None, 'error': None }) - else: - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + value = None if fun is None else fun(*args, **kwargs) + try: + resconn.send({ 'value': value, 'error': None }) + except (BrokenPipeError, OSError, EOFError): + ... 
# parent closed the pipe during shutdown, exit cleanly except BaseException as e: tb = traceback.format_exc() payload = { From d7c9e4f4837329848060eb91b8c89a3caacc0a76 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 9 Apr 2026 19:07:31 +0530 Subject: [PATCH 61/96] fix: cleanup request processing, get template from config Signed-off-by: Anupam Kumar --- config.cpu.yaml | 5 ++-- config.gpu.yaml | 5 ++-- context_chat_backend/chain/one_shot.py | 41 ++++++++------------------ context_chat_backend/task_fetcher.py | 2 +- 4 files changed, 20 insertions(+), 33 deletions(-) diff --git a/config.cpu.yaml b/config.cpu.yaml index 304cb7d..6ceac91 100644 --- a/config.cpu.yaml +++ b/config.cpu.yaml @@ -46,6 +46,9 @@ embedding: llm: nc_texttotext: + # template: + # n_ctx: + # max_tokens: llama: # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.llamacpp.LlamaCpp.html @@ -55,14 +58,12 @@ llm: max_tokens: 4096 template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same langauge. \nQuestion: {question} Let's think this step-by-step. 
\n<|im_end|>\n<|im_start|> assistant\n" no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n" - end_separator: "<|im_end|>" ctransformer: # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.ctransformers.CTransformers.html model: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same langauge. \nQuestion: {question} Let's think this step-by-step. 
\n<|im_end|>\n<|im_start|> assistant\n" no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n" - end_separator: "<|im_end|>" config: context_length: 8192 max_new_tokens: 4096 diff --git a/config.gpu.yaml b/config.gpu.yaml index 16dcb01..a12fd1b 100644 --- a/config.gpu.yaml +++ b/config.gpu.yaml @@ -47,6 +47,9 @@ embedding: llm: nc_texttotext: + # template: + # n_ctx: + # max_tokens: llama: # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.llamacpp.LlamaCpp.html @@ -56,7 +59,6 @@ llm: max_tokens: 4096 template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same langauge. \nQuestion: {question} Let's think this step-by-step. 
\n<|im_end|>\n<|im_start|> assistant\n" no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n" - end_separator: "<|im_end|>" n_gpu_layers: -1 model_kwargs: device: cuda @@ -66,7 +68,6 @@ llm: model: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same langauge. \nQuestion: {question} Let's think this step-by-step. 
\n<|im_end|>\n<|im_start|> assistant\n" no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n" - end_separator: "<|im_end|>" config: context_length: 8192 max_new_tokens: 4096 diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index c79f272..c387621 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -12,38 +12,25 @@ from .query_proc import get_pruned_query from .types import ContextException, LLMOutput, ScopeType, SearchResult -_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. +_LLM_TEMPLATE = '''You're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. +Use the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents. + +START OF CONTEXT: {context} -{question} -''' # noqa: E501 +END OF CONTEXT! -logger = logging.getLogger('ccb.chain') +If you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. +Don't mention the context in your answer but rather just answer the question directly. +Detect the language of the question and make sure to use the same language that was used in the question to answer the question. +Don't mention which language was used, but just answer the question directly in the same langauge. 
-# todo: remove this maybe -def process_query( - user_id: str, - llm: LLM, - app_config: TConfig, - query: str, - no_ctx_template: str | None = None, - end_separator: str = '', -): - """ - Raises - ------ - ValueError - If the context length is too small to fit the query - """ - stop = [end_separator] if end_separator else None - output = llm.invoke( - (query, get_pruned_query(llm, app_config, query, no_ctx_template, []))[no_ctx_template is not None], # pyright: ignore[reportArgumentType] - stop=stop, - userid=user_id, - ).strip() +Question: {question} - return LLMOutput(output=output, sources=[]) +Let's think this step-by-step. +''' # noqa: E501 +logger = logging.getLogger('ccb.chain') def process_context_query( user_id: str, @@ -55,7 +42,6 @@ def process_context_query( scope_type: ScopeType | None = None, scope_list: list[str] | None = None, template: str | None = None, - end_separator: str = '', ): """ Raises @@ -76,7 +62,6 @@ def process_context_query( output = llm.invoke( get_pruned_query(llm, app_config, query, template or _LLM_TEMPLATE, context_chunks), - stop=[end_separator], userid=user_id, ).strip() unique_sources = [SearchResult( diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 38f0df8..d8fee7c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -647,7 +647,6 @@ def process_normal_task( if task_input.get('scopeType') == 'none': task_input['scopeType'] = None - # todo: document no template support return exec_in_proc(target=process_context_query, args=( user_id, @@ -658,6 +657,7 @@ def process_normal_task( CONTEXT_LIMIT, task_input.get('scopeType'), task_input.get('scopeList'), + app_config.llm[1].get('template'), ) ) From da680e3ddfd09ed20d0e4fa283995e636cc0180c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 10 Apr 2026 12:08:40 +0530 Subject: [PATCH 62/96] fix: explicit check for non-None response Signed-off-by: Anupam Kumar --- 
context_chat_backend/network_em.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 8b85169..5ba8faf 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -119,7 +119,7 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] raise e except EmbeddingException as e: try: - if e.response: + if e.response is not None: err_msg = e.response.json().get('error', {}).get('message', '') if err_msg == 'llama_decode returned -1': # the document coult not be processed From ecf07c4f3cf0d693074ee9ab236c1b6c584bf022 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 15 Apr 2026 12:05:02 +0530 Subject: [PATCH 63/96] fix: add default value of limit in search task type Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 9c3812e..13fbb7c 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -67,16 +67,19 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: provider = TaskProcessingProvider( - id="context_chat-context_chat_search", - name="Context Chat", - task_type="context_chat:context_chat_search", + id='context_chat-context_chat_search', + name='Context Chat', + task_type='context_chat:context_chat_search', expected_runtime=30, + input_shape_defaults={ + 'limit': 10, + }, ) nc.providers.task_processing.register(provider) provider = TaskProcessingProvider( - id="context_chat-context_chat", - name="Context Chat", - task_type="context_chat:context_chat", + id='context_chat-context_chat', + name='Context Chat', + task_type='context_chat:context_chat', expected_runtime=30, ) nc.providers.task_processing.register(provider) From 531e58105951f0d43f5ab432665e08850b4f0fb6 
Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 15 Apr 2026 10:24:05 +0200 Subject: [PATCH 64/96] fix(bg_threads): Poll app enabled state every 30s in all threads needed because enabled handler --- context_chat_backend/controller.py | 14 ++++---- context_chat_backend/task_fetcher.py | 51 ++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 13fbb7c..007e945 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -35,7 +35,7 @@ from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc -from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads +from .task_fetcher import THREAD_STOP_EVENT, start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider # setup @@ -84,7 +84,11 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: ) nc.providers.task_processing.register(provider) app_enabled.set() - start_bg_threads(app_config, app_enabled) + if THREAD_STOP_EVENT.is_set(): + # If the threads were previously stopped, we start them again + # otherwise the lifecycle handler has already started them + start_bg_threads(app_config) + THREAD_STOP_EVENT.clear() else: app_enabled.clear() wait_for_bg_threads() @@ -99,11 +103,9 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) + start_bg_threads(app_config) nc = NextcloudApp() - if nc.enabled_state: - app_enabled.set() - start_bg_threads(app_config, app_enabled) - logger.info(f'App enable state at startup: {app_enabled.is_set()}') + logger.info(f'App enable state at startup: {nc.enabled_state}') yield 
vectordb_loader.offload() wait_for_bg_threads() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index d8fee7c..b40ea2a 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -10,7 +10,7 @@ from contextlib import suppress from enum import Enum from threading import Event, Thread -from time import sleep +from time import sleep, time from typing import Any import niquests @@ -69,7 +69,7 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: +def files_indexing_thread(app_config: TConfig) -> None: try: network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) @@ -137,17 +137,28 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> )[app_config.file_parsing_cpu_count == -1] LOGGER.info(f'Using {file_parsing_cpu_count} parallel file parsing workers') + nc = NextcloudApp() + last_enabled_check = time() + enabled_state = nc.enabled_state while True: if THREAD_STOP_EVENT.is_set(): LOGGER.info('Files indexing thread is stopping due to stop event being set') return + if time() - last_enabled_check > 30: # check enabled state every 30 seconds + enabled_state = nc.enabled_state + last_enabled_check = time() + + if not enabled_state: + LOGGER.info('App is disabled, files indexing thread will sleep until next enabled state check') + sleep(POLLING_COOLDOWN) + continue + try: if not network_em.check_connection(ThreadType.FILES_INDEXING.value): sleep(POLLING_COOLDOWN) continue - nc = NextcloudApp() q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/documents', @@ -273,20 +284,30 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> -def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: +def updates_processing_thread(app_config: TConfig) -> None: try: vectordb_loader = 
VectorDBLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return + nc = NextcloudApp() + enabled_state = nc.enabled_state + last_enabled_check = time() while True: if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping due to stop event being set') return + if time() - last_enabled_check > 30: # check enabled state every 30 seconds + enabled_state = nc.enabled_state + last_enabled_check = time() + + if not enabled_state: + sleep(POLLING_COOLDOWN) + continue + try: - nc = NextcloudApp() q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/actions', @@ -454,7 +475,7 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: return ScopeList.model_validate(data).source_ids -def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: +def request_processing_thread(app_config: TConfig) -> None: LOGGER.info('Starting request processing thread') try: @@ -466,6 +487,8 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: return nc = NextcloudApp() + enabled_state = nc.enabled_state + last_enabled_check = time() llm: LLM = llm_loader.load() while True: @@ -477,6 +500,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: sleep(POLLING_COOLDOWN) continue + if time() - last_enabled_check > 30: # check enabled state every 30 seconds + enabled_state = nc.enabled_state + last_enabled_check = time() + + if not enabled_state: + sleep(POLLING_COOLDOWN) + continue + try: # Fetch pending task try: @@ -695,7 +726,7 @@ def process_search_task( ) -def start_bg_threads(app_config: TConfig, app_enabled: Event): +def start_bg_threads(app_config: TConfig): if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: if ( ThreadType.FILES_INDEXING in THREADS @@ -707,12 +738,12 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): 
THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, - args=(app_config, app_enabled), + args=(app_config,), name='FilesIndexingThread', ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, - args=(app_config, app_enabled), + args=(app_config,), name='UpdatesProcessingThread', ) THREADS[ThreadType.FILES_INDEXING].start() @@ -726,7 +757,7 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, - args=(app_config, app_enabled), + args=(app_config,), name='RequestProcessingThread', ) THREADS[ThreadType.REQUEST_PROCESSING].start() From 7337d1710dae3b422f5a5ab3d981f7ea0b602856 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 15 Apr 2026 11:41:28 +0200 Subject: [PATCH 65/96] fix(app_enabled): centralize app_enabled check to reduce requests to nextcloud server Signed-off-by: Marcel Klehr --- context_chat_backend/controller.py | 26 +++++++++++++----- context_chat_backend/task_fetcher.py | 40 ++++++++-------------------- context_chat_backend/utils.py | 4 +-- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 007e945..a719f11 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import time + from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off @@ -61,7 +63,20 @@ 'revision': '607a30d783dfa663caf39e06633721c8d4cfcd7e', } } if __download_models_from_hf else {} + + app_enabled = threading.Event() +last_enabled_check: int|None = None +def get_enabled_state() -> bool: + global last_enabled_check + if last_enabled_check is None or time.time() - last_enabled_check > 30: + nc = 
NextcloudApp() + if nc.enabled_state: + app_enabled.set() + else: + app_enabled.clear() + last_enabled_check = time.time() + return app_enabled.is_set() def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: @@ -87,7 +102,7 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: if THREAD_STOP_EVENT.is_set(): # If the threads were previously stopped, we start them again # otherwise the lifecycle handler has already started them - start_bg_threads(app_config) + start_bg_threads(app_config, get_enabled_state) THREAD_STOP_EVENT.clear() else: app_enabled.clear() @@ -103,9 +118,8 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) - start_bg_threads(app_config) - nc = NextcloudApp() - logger.info(f'App enable state at startup: {nc.enabled_state}') + start_bg_threads(app_config, get_enabled_state) + logger.info(f'App enable state at startup: {get_enabled_state()}') yield vectordb_loader.offload() wait_for_bg_threads() @@ -192,7 +206,7 @@ def decorator(func: Callable): @wraps(func) def wrapper(*args, **kwargs): disable_aaa = app.extra['CONFIG'].disable_aaa - if not disable_aaa and not app_enabled.is_set(): + if not disable_aaa and not get_enabled_state(): return JSONResponse('Context Chat is disabled, enable it from AppAPI to use it.', 503) return func(*args, **kwargs) @@ -213,7 +227,7 @@ def _(request: Request): @app.get('/enabled') def _(): - return JSONResponse(content={'enabled': app_enabled.is_set()}, status_code=200) + return JSONResponse(content={'enabled': get_enabled_state()}, status_code=200) @app.post('/countIndexedDocuments') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index b40ea2a..6cce556 100644 --- a/context_chat_backend/task_fetcher.py +++ 
b/context_chat_backend/task_fetcher.py @@ -10,7 +10,7 @@ from contextlib import suppress from enum import Enum from threading import Event, Thread -from time import sleep, time +from time import sleep from typing import Any import niquests @@ -69,7 +69,7 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -def files_indexing_thread(app_config: TConfig) -> None: +def files_indexing_thread(app_config: TConfig, get_enabled_state) -> None: try: network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) @@ -138,18 +138,12 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> LOGGER.info(f'Using {file_parsing_cpu_count} parallel file parsing workers') nc = NextcloudApp() - last_enabled_check = time() - enabled_state = nc.enabled_state while True: if THREAD_STOP_EVENT.is_set(): LOGGER.info('Files indexing thread is stopping due to stop event being set') return - if time() - last_enabled_check > 30: # check enabled state every 30 seconds - enabled_state = nc.enabled_state - last_enabled_check = time() - - if not enabled_state: + if not get_enabled_state(): LOGGER.info('App is disabled, files indexing thread will sleep until next enabled state check') sleep(POLLING_COOLDOWN) continue @@ -284,7 +278,7 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> -def updates_processing_thread(app_config: TConfig) -> None: +def updates_processing_thread(app_config: TConfig, get_enabled_state) -> None: try: vectordb_loader = VectorDBLoader(app_config) except LoaderException as e: @@ -292,18 +286,12 @@ def updates_processing_thread(app_config: TConfig) -> None: return nc = NextcloudApp() - enabled_state = nc.enabled_state - last_enabled_check = time() while True: if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping due to stop event being set') return - if time() - last_enabled_check > 30: # check enabled state every 30 seconds - enabled_state = 
nc.enabled_state - last_enabled_check = time() - - if not enabled_state: + if not get_enabled_state(): sleep(POLLING_COOLDOWN) continue @@ -475,7 +463,7 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: return ScopeList.model_validate(data).source_ids -def request_processing_thread(app_config: TConfig) -> None: +def request_processing_thread(app_config: TConfig, get_enabled_state) -> None: LOGGER.info('Starting request processing thread') try: @@ -487,8 +475,6 @@ def request_processing_thread(app_config: TConfig) -> None: return nc = NextcloudApp() - enabled_state = nc.enabled_state - last_enabled_check = time() llm: LLM = llm_loader.load() while True: @@ -500,11 +486,7 @@ def request_processing_thread(app_config: TConfig) -> None: sleep(POLLING_COOLDOWN) continue - if time() - last_enabled_check > 30: # check enabled state every 30 seconds - enabled_state = nc.enabled_state - last_enabled_check = time() - - if not enabled_state: + if not get_enabled_state(): sleep(POLLING_COOLDOWN) continue @@ -726,7 +708,7 @@ def process_search_task( ) -def start_bg_threads(app_config: TConfig): +def start_bg_threads(app_config: TConfig, get_enabled_state): if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: if ( ThreadType.FILES_INDEXING in THREADS @@ -738,12 +720,12 @@ def start_bg_threads(app_config: TConfig): THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, - args=(app_config,), + args=(app_config,get_enabled_state), name='FilesIndexingThread', ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, - args=(app_config,), + args=(app_config,get_enabled_state), name='UpdatesProcessingThread', ) THREADS[ThreadType.FILES_INDEXING].start() @@ -757,7 +739,7 @@ def start_bg_threads(app_config: TConfig): THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, - args=(app_config,), + 
args=(app_config,get_enabled_state), name='RequestProcessingThread', ) THREADS[ThreadType.REQUEST_PROCESSING].start() diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index c793978..2e82353 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -9,6 +9,7 @@ import os import signal import sys +import time import traceback from collections.abc import Callable from contextlib import suppress @@ -18,7 +19,6 @@ from typing import Any, TypeGuard, TypeVar from fastapi.responses import JSONResponse as FastAPIJSONResponse - from .types import AppRole, TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig T = TypeVar('T') @@ -313,4 +313,4 @@ def get_app_role() -> AppRole: if role not in ['indexing', 'rp']: _logger.warning(f'Invalid app role: {role}, defaulting to all roles') return AppRole.NORMAL - return AppRole(role) + return AppRole(role) \ No newline at end of file From 7c1cf456ca53aa15e69617b4e3f319769edbf3b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 10:53:56 +0000 Subject: [PATCH 66/96] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- context_chat_backend/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 2e82353..ca7a486 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -9,7 +9,6 @@ import os import signal import sys -import time import traceback from collections.abc import Callable from contextlib import suppress @@ -313,4 +312,4 @@ def get_app_role() -> AppRole: if role not in ['indexing', 'rp']: _logger.warning(f'Invalid app role: {role}, defaulting to all roles') return AppRole.NORMAL - return AppRole(role) \ No newline at end of file + return AppRole(role) From 5e9eb76e2b9c63a31ecabf5a1bb96e107433bd2f Mon Sep 17 00:00:00 2001 From: 
Anupam Kumar Date: Mon, 20 Apr 2026 15:30:04 +0530 Subject: [PATCH 67/96] pyright and ruff fixes Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 2 +- context_chat_backend/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index a719f11..bcf48e0 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -66,7 +66,7 @@ app_enabled = threading.Event() -last_enabled_check: int|None = None +last_enabled_check: float | None = None def get_enabled_state() -> bool: global last_enabled_check if last_enabled_check is None or time.time() - last_enabled_check > 30: diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index ca7a486..c793978 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -18,6 +18,7 @@ from typing import Any, TypeGuard, TypeVar from fastapi.responses import JSONResponse as FastAPIJSONResponse + from .types import AppRole, TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig T = TypeVar('T') From 04d7fe1c56366e1dbaad83a2095e868a97ee6492 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 20 Apr 2026 21:53:52 +0530 Subject: [PATCH 68/96] fix(k8s): do not register task proc trigger endpoint Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index bcf48e0..f65f19b 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -8,7 +8,7 @@ # isort: off from .chain.types import ContextException -from .types import LoaderException, EmbeddingException +from .types import AppRole, LoaderException, EmbeddingException from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars @@ -36,7 
+36,7 @@ from .dyn_loader import VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc +from .utils import JSONResponse, exec_in_proc, get_app_role from .task_fetcher import THREAD_STOP_EVENT, start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider @@ -117,7 +117,13 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): - set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) + app_role = get_app_role() + if app_role == AppRole.NORMAL: + set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) + else: + # k8s' rp role pulls tasks + set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch) + start_bg_threads(app_config, get_enabled_state) logger.info(f'App enable state at startup: {get_enabled_state()}') yield From 2fbf9fc47de33015e0e3a21a2b3d0f07869d366b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 20 Apr 2026 21:54:40 +0530 Subject: [PATCH 69/96] fix(k8s): do not start internal pgsql in k8s env Signed-off-by: Anupam Kumar --- dockerfile_scripts/pgsql/setup.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dockerfile_scripts/pgsql/setup.sh b/dockerfile_scripts/pgsql/setup.sh index cee4295..7578ed8 100755 --- a/dockerfile_scripts/pgsql/setup.sh +++ b/dockerfile_scripts/pgsql/setup.sh @@ -18,7 +18,7 @@ fi # Check if EXTERNAL_DB is set if [ -n "${EXTERNAL_DB}" ]; then if [[ "$EXTERNAL_DB" != "postgresql+psycopg://"* ]]; then - echo "EXTERNAL_DB must be a PostgreSQL URL and start with 'postgresql+psycopg://'" + printf "%s\n" "EXTERNAL_DB must be a PostgreSQL URL and start with 'postgresql+psycopg://'" >&2 exit 1 fi @@ -31,6 +31,11 @@ if [ -n "${EXTERNAL_DB}" ]; then exit 0 fi +if [[ -n "${APP_ROLE}" && 
"$APP_ROLE" != "normal" && "$APP_ROLE" != "" ]]; then + printf "%s\n" "Refusing to start the internal postgresql server in Kubernetes environment, use an external database through the EXTERNAL_DB env var." >&2 + exit 1 +fi + # Ensure the directory exists and has the correct permissions mkdir -p "$DATA_DIR" chmod +rx "${APP_PERSISTENT_STORAGE:-persistent_storage}" From cf98ab2ec33cc1f28fd008ba67196a7c86e8a0be Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 21 Apr 2026 15:38:13 +0530 Subject: [PATCH 70/96] fix(k8s): log exclusively to stderr for k8s env Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 11 +++++-- context_chat_backend/logger.py | 1 + context_chat_backend/task_fetcher.py | 14 +++++---- context_chat_backend/utils.py | 5 ++++ logger_config.k8s.yaml | 43 ++++++++++++++++++++++++++++ main.py | 21 +++++++++----- main_em.py | 21 +++++++++----- 7 files changed, 94 insertions(+), 22 deletions(-) create mode 100644 logger_config.k8s.yaml diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index f65f19b..02db402 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -36,7 +36,7 @@ from .dyn_loader import VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, get_app_role +from .utils import JSONResponse, exec_in_proc, get_app_role, is_k8s_env from .task_fetcher import THREAD_STOP_EVENT, start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider @@ -134,6 +134,7 @@ async def lifespan(app: FastAPI): app = FastAPI(debug=app_config.debug, lifespan=lifespan) # pyright: ignore[reportArgumentType] app.extra['CONFIG'] = app_config +k8s_env = is_k8s_env() # loaders @@ -244,7 +245,13 @@ def _(): @app.get('/downloadLogs') -def download_logs() -> FileResponse: +def download_logs(): + if k8s_env: + return JSONResponse( + 'Download 
of logs is not supported in Kubernetes environment. Use the standard logging infrastructure.', + status_code=400, + ) + with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: files = os.listdir(os.path.join(persistent_storage(), 'logs')) diff --git a/context_chat_backend/logger.py b/context_chat_backend/logger.py index 79e99af..25fb161 100644 --- a/context_chat_backend/logger.py +++ b/context_chat_backend/logger.py @@ -51,6 +51,7 @@ def __init__( self, *, fmt_keys: dict[str, str] | None = None, + use_colors: bool = False, ): super().__init__() self.fmt_keys = fmt_keys if fmt_keys is not None else {} diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 6cce556..dd6f296 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -282,7 +282,7 @@ def updates_processing_thread(app_config: TConfig, get_enabled_state) -> None: try: vectordb_loader = VectorDBLoader(app_config) except LoaderException as e: - LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + LOGGER.error('Error initializing vector DB loader, updates processing thread will not start:', exc_info=e) return nc = NextcloudApp() @@ -292,6 +292,7 @@ def updates_processing_thread(app_config: TConfig, get_enabled_state) -> None: return if not get_enabled_state(): + LOGGER.info('App is disabled, updates processing thread will sleep until next enabled state check') sleep(POLLING_COOLDOWN) continue @@ -471,7 +472,7 @@ def request_processing_thread(app_config: TConfig, get_enabled_state) -> None: vectordb_loader = VectorDBLoader(app_config) llm_loader = LLMModelLoader(app_config) except LoaderException as e: - LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + LOGGER.error('Error initializing vector DB loader, request processing thread will not 
start:', exc_info=e) return nc = NextcloudApp() @@ -482,15 +483,16 @@ def request_processing_thread(app_config: TConfig, get_enabled_state) -> None: LOGGER.info('Request processing thread is stopping due to stop event being set') return - if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): - sleep(POLLING_COOLDOWN) - continue - if not get_enabled_state(): + LOGGER.info('App is disabled, request processing thread will sleep until next enabled state check') sleep(POLLING_COOLDOWN) continue try: + if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): + sleep(POLLING_COOLDOWN) + continue + # Fetch pending task try: response = nc.providers.task_processing.next_task( diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index c793978..d1e6210 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -314,3 +314,8 @@ def get_app_role() -> AppRole: _logger.warning(f'Invalid app role: {role}, defaulting to all roles') return AppRole.NORMAL return AppRole(role) + + +def is_k8s_env(): + role = get_app_role() + return role == AppRole.NORMAL diff --git a/logger_config.k8s.yaml b/logger_config.k8s.yaml new file mode 100644 index 0000000..6d5c729 --- /dev/null +++ b/logger_config.k8s.yaml @@ -0,0 +1,43 @@ +# +# SPDX-FileCopyrightText: 2022 MCODING, LLC +# SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later +# + +version: 1 +disable_existing_loggers: false + +formatters: + json: + (): context_chat_backend.logger.JSONFormatter + fmt_keys: + timestamp: timestamp + level: levelname + logger: name + message: message + filename: filename + function: funcName + line: lineno + thread_name: threadName + pid: process + + +handlers: + stderr: + class: logging.StreamHandler + level: DEBUG + formatter: json + stream: ext://sys.stderr + + +loggers: + root: + level: WARNING + handlers: + - stderr + + ccb: + level: WARNING + handlers: + - stderr + 
propagate: false diff --git a/main.py b/main.py index 076b7db..8a2beda 100755 --- a/main.py +++ b/main.py @@ -15,9 +15,10 @@ from context_chat_backend.types import TConfig # isort: skip from context_chat_backend.controller import app # isort: skip from context_chat_backend.logger import get_logging_config, setup_logging # isort: skip -from context_chat_backend.utils import redact_config # isort: skip +from context_chat_backend.utils import is_k8s_env, redact_config # isort: skip LOGGER_CONFIG_NAME = 'logger_config.yaml' +LOGGER_K8S_CONFIG_NAME = 'logger_config.k8s.yaml' def _setup_log_levels(debug: bool): ''' @@ -46,7 +47,8 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': - logging_config = get_logging_config(LOGGER_CONFIG_NAME) + k8s_env = is_k8s_env() + logging_config = get_logging_config(LOGGER_K8S_CONFIG_NAME if k8s_env else LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] _setup_log_levels(app_config.debug) @@ -66,11 +68,16 @@ def _setup_log_levels(debug: bool): print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) uv_log_config = uvicorn.config.LOGGING_CONFIG # pyright: ignore[reportAttributeAccessIssue] - uv_log_config['formatters']['json'] = logging_config['formatters']['json'] - uv_log_config['handlers']['file_json'] = logging_config['handlers']['file_json'] + use_colors = False if k8s_env else (app_config.use_colors and getenv('CI', 'false') == 'false') - uv_log_config['loggers']['uvicorn']['handlers'].append('file_json') - uv_log_config['loggers']['uvicorn.access']['handlers'].append('file_json') + if k8s_env: + uv_log_config['formatters']['default'] = logging_config['formatters']['json'] + uv_log_config['formatters']['access'] = logging_config['formatters']['json'] + else: + uv_log_config['formatters']['json'] = logging_config['formatters']['json'] + uv_log_config['handlers']['file_json'] = logging_config['handlers']['file_json'] + 
uv_log_config['loggers']['uvicorn']['handlers'].append('file_json') + uv_log_config['loggers']['uvicorn.access']['handlers'].append('file_json') run_app( uvicorn_app=app, @@ -78,7 +85,7 @@ def _setup_log_levels(debug: bool): interface='asgi3', log_config=uv_log_config, log_level=app_config.uvicorn_log_level, - use_colors=bool(app_config.use_colors and getenv('CI', 'false') == 'false'), + use_colors=use_colors, # limit_concurrency=10, # backlog=20, timeout_keep_alive=120, diff --git a/main_em.py b/main_em.py index b7d5a93..17d9f9a 100755 --- a/main_em.py +++ b/main_em.py @@ -16,10 +16,11 @@ from context_chat_backend.config_parser import get_config # isort: skip from context_chat_backend.logger import get_logging_config, setup_logging # isort: skip from context_chat_backend.setup_functions import ensure_config_file, setup_env_vars # isort: skip -from context_chat_backend.utils import redact_config # isort: skip +from context_chat_backend.utils import is_k8s_env, redact_config # isort: skip LOGGER_CONFIG_NAME = 'logger_config_em.yaml' +LOGGER_K8S_CONFIG_NAME = 'logger_config.k8s.yaml' STARTUP_CHECK_SEC = 10 MAX_TRIES = 180 # 180*10 secs = 30 minutes max @@ -108,7 +109,8 @@ def _wait_main_app_enabled() -> None: # in local embedding server config print('Embedder config:\n' + redact_config(em_conf).model_dump_json(indent=2), flush=True) - logging_config = get_logging_config(LOGGER_CONFIG_NAME) + k8s_env = is_k8s_env() + logging_config = get_logging_config(LOGGER_K8S_CONFIG_NAME if k8s_env else LOGGER_CONFIG_NAME) setup_logging(logging_config) logger = logging.getLogger('emserver') if app_config.debug: @@ -158,11 +160,16 @@ def _wait_main_app_enabled() -> None: ) uv_log_config = uvicorn.config.LOGGING_CONFIG # pyright: ignore[reportAttributeAccessIssue] - uv_log_config['formatters']['json'] = logging_config['formatters']['json'] - uv_log_config['handlers']['file_json'] = logging_config['handlers']['file_json'] + use_colors = False if k8s_env else (app_config.use_colors 
and os.getenv('CI', 'false') == 'false') - uv_log_config['loggers']['uvicorn']['handlers'].append('file_json') - uv_log_config['loggers']['uvicorn.access']['handlers'].append('file_json') + if k8s_env: + uv_log_config['formatters']['default'] = logging_config['formatters']['json'] + uv_log_config['formatters']['access'] = logging_config['formatters']['json'] + else: + uv_log_config['formatters']['json'] = logging_config['formatters']['json'] + uv_log_config['handlers']['file_json'] = logging_config['handlers']['file_json'] + uv_log_config['loggers']['uvicorn']['handlers'].append('file_json') + uv_log_config['loggers']['uvicorn.access']['handlers'].append('file_json') uvicorn.run( # todo: use string import of the app @@ -173,6 +180,6 @@ def _wait_main_app_enabled() -> None: interface='asgi3', log_config=uv_log_config, log_level=app_config.uvicorn_log_level, - use_colors=bool(app_config.use_colors and os.getenv('CI', 'false') == 'false'), + use_colors=use_colors, workers=em_conf.workers, ) From 6e7f20b8cb19e0d49172764f0aa95c86e2fde8af Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 21 Apr 2026 14:22:50 +0530 Subject: [PATCH 71/96] chore: separate out updates processing in an app role Signed-off-by: Anupam Kumar --- appinfo/info.xml | 6 +++++ context_chat_backend/task_fetcher.py | 34 +++++++++++++++++----------- context_chat_backend/types.py | 1 + context_chat_backend/utils.py | 2 +- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/appinfo/info.xml b/appinfo/info.xml index 30194ba..879524d 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -89,6 +89,12 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve APP_ROLE=rp true + + up + Metadata Updates Processing Mode + APP_ROLE=up + false + indexing Indexing Mode diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index dd6f296..81307d1 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py 
@@ -711,34 +711,37 @@ def process_search_task( def start_bg_threads(app_config: TConfig, get_enabled_state): + THREAD_STOP_EVENT.clear() + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') + if ThreadType.FILES_INDEXING in THREADS: + LOGGER.info('Indexing background threads are already up, skipping start') return - THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, args=(app_config,get_enabled_state), name='FilesIndexingThread', ) + THREADS[ThreadType.FILES_INDEXING].start() + + if APP_ROLE == AppRole.UP or APP_ROLE == AppRole.NORMAL: + if ThreadType.UPDATES_PROCESSING in THREADS: + LOGGER.info('Updates processing background threads are already up, skipping start') + return + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, args=(app_config,get_enabled_state), name='UpdatesProcessingThread', ) - THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') + LOGGER.info('Request processing background threads are already up, skipping start') return - THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, args=(app_config,get_enabled_state), @@ -748,20 +751,25 @@ def start_bg_threads(app_config: TConfig, get_enabled_state): def wait_for_bg_threads(): + THREAD_STOP_EVENT.set() + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + if ThreadType.FILES_INDEXING not in THREADS: return - THREAD_STOP_EVENT.set() THREADS[ThreadType.FILES_INDEXING].join() - 
THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) + + if APP_ROLE == AppRole.UP or APP_ROLE == AppRole.NORMAL: + if ThreadType.UPDATES_PROCESSING not in THREADS: + return + + THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.UPDATES_PROCESSING) if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: if (ThreadType.REQUEST_PROCESSING not in THREADS): return - THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 2694998..12574a9 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -156,6 +156,7 @@ class AppRole(str, Enum): NORMAL = 'normal' INDEXING = 'indexing' RP = 'rp' + UP = 'up' class CommonSourceItem(BaseModel): diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index d1e6210..f27a38d 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -310,7 +310,7 @@ def get_app_role() -> AppRole: role = os.getenv('APP_ROLE', '').lower() if role == '': return AppRole.NORMAL - if role not in ['indexing', 'rp']: + if role not in ['indexing', 'rp', 'up']: _logger.warning(f'Invalid app role: {role}, defaulting to all roles') return AppRole.NORMAL return AppRole(role) From 309de32d2f588880bc0a512b0f034c4336228563 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 21 Apr 2026 18:23:59 +0530 Subject: [PATCH 72/96] fix(k8s): app role fixes Signed-off-by: Anupam Kumar --- Dockerfile | 3 +-- context_chat_backend/controller.py | 7 ++++--- context_chat_backend/utils.py | 2 +- main_em.py | 11 ++++++++--- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3430a5e..79c0d47 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,8 +47,7 @@ COPY context_chat_backend context_chat_backend COPY main.py . COPY main_em.py . COPY config.?pu.yaml . -COPY logger_config.yaml . 
-COPY logger_config_em.yaml . +COPY logger_config*.yaml . COPY hwdetect.sh . COPY harp_connect.sh . COPY supervisord.conf /etc/supervisor/supervisord.conf diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 02db402..4c2729b 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -43,7 +43,9 @@ # setup # only run once -if mp.current_process().name == 'MainProcess': +APP_ROLE = get_app_role() +if mp.current_process().name == 'MainProcess' and APP_ROLE in (AppRole.NORMAL, AppRole.RP): + # normal docker containers and RP role in k8s repair_run() ensure_config_file() @@ -117,8 +119,7 @@ def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): - app_role = get_app_role() - if app_role == AppRole.NORMAL: + if APP_ROLE == AppRole.NORMAL: set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) else: # k8s' rp role pulls tasks diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index f27a38d..507b53b 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -318,4 +318,4 @@ def get_app_role() -> AppRole: def is_k8s_env(): role = get_app_role() - return role == AppRole.NORMAL + return role != AppRole.NORMAL diff --git a/main_em.py b/main_em.py index 17d9f9a..4cadbd4 100755 --- a/main_em.py +++ b/main_em.py @@ -12,11 +12,11 @@ import niquests import uvicorn -from context_chat_backend.types import DEFAULT_EM_MODEL_ALIAS # isort: skip +from context_chat_backend.types import DEFAULT_EM_MODEL_ALIAS, AppRole # isort: skip from context_chat_backend.config_parser import get_config # isort: skip from context_chat_backend.logger import get_logging_config, setup_logging # isort: skip from context_chat_backend.setup_functions import ensure_config_file, setup_env_vars # isort: skip -from context_chat_backend.utils import is_k8s_env, redact_config 
# isort: skip +from context_chat_backend.utils import get_app_role, is_k8s_env, redact_config # isort: skip LOGGER_CONFIG_NAME = 'logger_config_em.yaml' @@ -89,9 +89,14 @@ def _wait_main_app_enabled() -> None: if __name__ == '__main__': + app_role = get_app_role() + if app_role == AppRole.UP: + print('Internal embedding server is not required for the Updates Processing role, stopping this process.') + exit(0) + # intial buffer print( - f"Waiting for {STARTUP_CHECK_SEC} seconds before starting embedding server to allow main app to start", + f'Waiting for {STARTUP_CHECK_SEC} seconds before starting embedding server to allow main app to start', flush=True, ) sleep(STARTUP_CHECK_SEC) From 7b5020e1a408b3f80f5f49d21141b6bd3bf0c864 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 21 Apr 2026 18:25:23 +0530 Subject: [PATCH 73/96] fix: scoped context search fixes Signed-off-by: Anupam Kumar --- context_chat_backend/chain/one_shot.py | 2 ++ context_chat_backend/vectordb/pgvector.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index c387621..3b4224c 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -52,6 +52,8 @@ def process_context_query( db = vectordb_loader.load() context_docs = get_context_docs(user_id, query, db, ctx_limit, scope_type, scope_list) if len(context_docs) == 0: + if scope_type is not None: + raise ContextException('No documents retrieved, please choose a wider scope of documents to search from') raise ContextException('No documents retrieved, please index a few documents first') context_chunks = get_context_chunks(context_docs) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 9d88024..e833356 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -610,10 +610,9 @@ def doc_search( try: with 
self.session_maker() as session: doc_filters = [AccessListStore.uid == user_id] - match scope_type: - case ScopeType.PROVIDER: + if scope_type == ScopeType.PROVIDER.value: doc_filters.append(DocumentsStore.provider.in_(scope_list)) # pyright: ignore[reportArgumentType] - case ScopeType.SOURCE: + elif scope_type == ScopeType.SOURCE.value: doc_filters.append(DocumentsStore.source_id.in_(scope_list)) # pyright: ignore[reportArgumentType] # get chunks associated with the user @@ -625,6 +624,9 @@ def doc_search( result = session.execute(stmt).fetchall() chunk_ids = [str(c) for res in result for c in res.chunks] + if len(chunk_ids) == 0: + return [] + # get embeddings return self._similarity_search(session, query, chunk_ids, k) except EmbeddingException: From bdd842fb41b0535b3f433aa1d82745e8bcfadd76 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 22 Apr 2026 14:13:29 +0530 Subject: [PATCH 74/96] chore: better naming of app roles Signed-off-by: Anupam Kumar --- appinfo/info.xml | 4 ++-- context_chat_backend/controller.py | 2 +- context_chat_backend/task_fetcher.py | 8 ++++---- context_chat_backend/types.py | 4 ++-- context_chat_backend/utils.py | 5 +++-- main_em.py | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/appinfo/info.xml b/appinfo/info.xml index 879524d..c65a8e8 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -86,13 +86,13 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve rp Request Processing Mode - APP_ROLE=rp + APP_ROLE=requestproc true up Metadata Updates Processing Mode - APP_ROLE=up + APP_ROLE=updatesproc false diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 4c2729b..278f892 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -44,7 +44,7 @@ # only run once APP_ROLE = get_app_role() -if mp.current_process().name == 'MainProcess' and APP_ROLE in (AppRole.NORMAL, AppRole.RP): +if mp.current_process().name == 
'MainProcess' and APP_ROLE in (AppRole.NORMAL, AppRole.REQUEST_PROC): # normal docker containers and RP role in k8s repair_run() ensure_config_file() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 81307d1..a41ae06 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -725,7 +725,7 @@ def start_bg_threads(app_config: TConfig, get_enabled_state): ) THREADS[ThreadType.FILES_INDEXING].start() - if APP_ROLE == AppRole.UP or APP_ROLE == AppRole.NORMAL: + if APP_ROLE == AppRole.UPDATES_PROC or APP_ROLE == AppRole.NORMAL: if ThreadType.UPDATES_PROCESSING in THREADS: LOGGER.info('Updates processing background threads are already up, skipping start') return @@ -737,7 +737,7 @@ def start_bg_threads(app_config: TConfig, get_enabled_state): ) THREADS[ThreadType.UPDATES_PROCESSING].start() - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if APP_ROLE == AppRole.REQUEST_PROC or APP_ROLE == AppRole.NORMAL: if ThreadType.REQUEST_PROCESSING in THREADS: LOGGER.info('Request processing background threads are already up, skipping start') return @@ -760,14 +760,14 @@ def wait_for_bg_threads(): THREADS[ThreadType.FILES_INDEXING].join() THREADS.pop(ThreadType.FILES_INDEXING) - if APP_ROLE == AppRole.UP or APP_ROLE == AppRole.NORMAL: + if APP_ROLE == AppRole.UPDATES_PROC or APP_ROLE == AppRole.NORMAL: if ThreadType.UPDATES_PROCESSING not in THREADS: return THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.UPDATES_PROCESSING) - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if APP_ROLE == AppRole.REQUEST_PROC or APP_ROLE == AppRole.NORMAL: if (ThreadType.REQUEST_PROCESSING not in THREADS): return diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 12574a9..700d7dd 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -155,8 +155,8 @@ class DocErrorEmbeddingException(EmbeddingException): class AppRole(str, 
Enum): NORMAL = 'normal' INDEXING = 'indexing' - RP = 'rp' - UP = 'up' + REQUEST_PROC = 'requestproc' + UPDATES_PROC = 'updatesproc' class CommonSourceItem(BaseModel): diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 507b53b..d572714 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -310,10 +310,11 @@ def get_app_role() -> AppRole: role = os.getenv('APP_ROLE', '').lower() if role == '': return AppRole.NORMAL - if role not in ['indexing', 'rp', 'up']: + try: + return AppRole(role) + except ValueError: _logger.warning(f'Invalid app role: {role}, defaulting to all roles') return AppRole.NORMAL - return AppRole(role) def is_k8s_env(): diff --git a/main_em.py b/main_em.py index 4cadbd4..addcfd6 100755 --- a/main_em.py +++ b/main_em.py @@ -90,7 +90,7 @@ def _wait_main_app_enabled() -> None: if __name__ == '__main__': app_role = get_app_role() - if app_role == AppRole.UP: + if app_role == AppRole.UPDATES_PROC: print('Internal embedding server is not required for the Updates Processing role, stopping this process.') exit(0) From f691743ec5d1156be4cd3e3d6516e1ac4b4580c8 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 22 Apr 2026 11:26:19 +0200 Subject: [PATCH 75/96] fix(app_enabled): Add lock for app enabled check Signed-off-by: Marcel Klehr --- context_chat_backend/controller.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 278f892..3dadf18 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -69,16 +69,18 @@ app_enabled = threading.Event() last_enabled_check: float | None = None +enabled_check_lock: threading.Lock = threading.Lock() def get_enabled_state() -> bool: global last_enabled_check - if last_enabled_check is None or time.time() - last_enabled_check > 30: - nc = NextcloudApp() - if nc.enabled_state: - app_enabled.set() - else: - 
app_enabled.clear() - last_enabled_check = time.time() - return app_enabled.is_set() + with enabled_check_lock: + if last_enabled_check is None or time.time() - last_enabled_check > 30: + nc = NextcloudApp() + if nc.enabled_state: + app_enabled.set() + else: + app_enabled.clear() + last_enabled_check = time.time() + return app_enabled.is_set() def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: From 17aa8105c49b54e7d4a30024073167134d1b3e7a Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 13:39:02 +0530 Subject: [PATCH 76/96] feat: build llama cpp python and add cpu/cuda/vulkan builds Signed-off-by: Anupam Kumar --- Dockerfile | 270 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 251 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 79c0d47..63eec9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,24 +1,191 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -FROM docker.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04 +ARG CPU_IMAGE=ubuntu:22.04 +ARG CUDA_DEVEL_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04 +ARG CUDA_RUNTIME_IMAGE=nvidia/cuda:12.4.1-runtime-ubuntu22.04 +ARG LLAMA_CPP_PYTHON_VERSION=0.3.20 + +# ============================================================ +# CPU / ARM builder +# Builds llama_cpp_python for any x86_64 (AVX+, Sandy Bridge 2011+) +# and for arm64 (NEON always available). +# ubuntu:22.04 is a multi-arch image so this stage covers both. +# +# GGML_NATIVE=OFF: no -march=native; the host build machine's SIMD +# capabilities are not baked in. AVX/AVX2/FMA/F16C default to ON in +# llama.cpp cmake and are used when the CPU supports them at runtime +# (the ggml_cpu_has_*() guards). On arm64 those x86 flags are never +# emitted by cmake, so NEON/SVE detection remains intact. 
+# ============================================================ +FROM ubuntu:22.04 AS llama-builder-cpu +ARG LLAMA_CPP_PYTHON_VERSION + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /build +ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh +RUN ./dockerfile_scripts/install_py11.sh +# install_py11.sh leaves apt lists in place – install build tools in one layer +RUN apt-get install -y --no-install-recommends \ + python3.11-dev \ + cmake build-essential ninja-build git \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +RUN python3.11 -m pip install --no-cache-dir --upgrade pip setuptools wheel + +ENV CMAKE_ARGS="-DGGML_NATIVE=OFF" + +RUN python3.11 -m pip wheel \ + --no-cache-dir \ + --no-binary llama-cpp-python \ + --wheel-dir=/wheels \ + "llama-cpp-python==${LLAMA_CPP_PYTHON_VERSION}" + +# ============================================================ +# CUDA (NVIDIA) builder +# Builds llama_cpp_python with CUDA support. +# sm_90 is the maximum compute capability supported by CUDA 12.4 +# (Hopper / H100). Blackwell sm_100 requires CUDA 12.8+. +# ============================================================ +FROM ${CUDA_DEVEL_IMAGE} AS llama-builder-cuda +ARG LLAMA_CPP_PYTHON_VERSION + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /build +ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh +RUN ./dockerfile_scripts/install_py11.sh +# gcc-12 is required: Ubuntu 22.04 ships gcc-11 by default which CUDA 12.4 +# treats as "unsupported"; we pin gcc-12 to match the official CI workflow. 
+RUN apt-get install -y --no-install-recommends \ + python3.11-dev \ + cmake build-essential ninja-build git \ + gcc-12 g++-12 \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=/usr/bin/gcc-12 +ENV CXX=/usr/bin/g++-12 +ENV CUDAHOSTCXX=/usr/bin/g++-12 + +RUN python3.11 -m pip install --no-cache-dir --upgrade pip setuptools wheel + +# Architecture list aligned with the official llama-cpp-python CUDA CI workflow: +# https://github.com/abetlen/llama-cpp-python/blob/main/.github/workflows/build-wheels-cuda.yaml +ENV CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_CUDA_FORCE_MMQ=ON -DGGML_NATIVE=OFF \ + -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual \ + -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler \ + -DCMAKE_CUDA_HOST_COMPILER=/usr/bin/g++-12" + +RUN python3.11 -m pip wheel \ + --no-cache-dir \ + --no-binary llama-cpp-python \ + --wheel-dir=/wheels \ + "llama-cpp-python==${LLAMA_CPP_PYTHON_VERSION}" + +# ============================================================ +# Vulkan (AMD / Intel / any Vulkan-capable GPU) builder +# Builds llama_cpp_python with Vulkan compute backend. +# Works on RDNA1/2/3, GCN, Intel Arc, and more. 
+# ============================================================ +FROM ubuntu:22.04 AS llama-builder-vulkan +ARG LLAMA_CPP_PYTHON_VERSION + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /build +ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh +RUN ./dockerfile_scripts/install_py11.sh +# Vulkan headers + glslang (shader compiler) are build-time only +RUN apt-get install -y --no-install-recommends \ + python3.11-dev \ + cmake build-essential ninja-build git \ + libgomp1 \ + libvulkan-dev glslang-tools \ + && rm -rf /var/lib/apt/lists/* + +RUN python3.11 -m pip install --no-cache-dir --upgrade pip setuptools wheel + +ENV CMAKE_ARGS="-DGGML_VULKAN=ON -DGGML_NATIVE=OFF" + +RUN python3.11 -m pip wheel \ + --no-cache-dir \ + --no-binary llama-cpp-python \ + --wheel-dir=/wheels \ + "llama-cpp-python==${LLAMA_CPP_PYTHON_VERSION}" + +# ============================================================ +# CPU / ARM runtime +# ============================================================ +FROM ubuntu:22.04 AS runtime-cpu + +ARG CCB_DB_NAME=ccb +ARG CCB_DB_USER=ccbuser +ARG CCB_DB_PASS=ccbpass + +ENV CCB_DB_NAME=${CCB_DB_NAME} +ENV CCB_DB_USER=${CCB_DB_USER} +ENV CCB_DB_PASS=${CCB_DB_PASS} +ENV DEBIAN_FRONTEND=noninteractive +ENV AA_DOCKER_ENV=1 + +WORKDIR /app + +ADD dockerfile_scripts/install_deps.sh dockerfile_scripts/install_deps.sh +RUN ./dockerfile_scripts/install_deps.sh +ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh +RUN ./dockerfile_scripts/install_py11.sh +ADD dockerfile_scripts/pgsql dockerfile_scripts/pgsql +RUN ./dockerfile_scripts/pgsql/install.sh +ADD dockerfile_scripts/install_frpc.sh dockerfile_scripts/install_frpc.sh +RUN ./dockerfile_scripts/install_frpc.sh +RUN apt-get autoclean +ADD dockerfile_scripts/entrypoint.sh dockerfile_scripts/entrypoint.sh + +ENV DEBIAN_FRONTEND=dialog + +# Install llama_cpp_python from the CPU builder wheel +COPY --from=llama-builder-cpu /wheels /wheels +RUN python3 -m pip install 
--no-cache-dir --upgrade pip setuptools wheel \ + && python3 -m pip install --no-cache-dir --no-index --find-links=/wheels llama-cpp-python \ + && python3 -m pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu \ + && rm -rf /wheels \ + && pip cache purge + +COPY requirements.txt . +RUN sed -i '/^llama_cpp_python/d' requirements.txt \ + && python3 -m pip install --no-cache-dir -r requirements.txt \ + && python3 -m pip cache purge + +COPY context_chat_backend context_chat_backend +COPY main.py . +COPY main_em.py . +COPY config.?pu.yaml . +COPY logger_config*.yaml . +COPY hwdetect.sh . +COPY harp_connect.sh . +COPY supervisord.conf /etc/supervisor/supervisord.conf + +ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/supervisord.conf"] + +# ============================================================ +# CUDA (NVIDIA GPU) runtime +# ============================================================ +FROM ${CUDA_RUNTIME_IMAGE} AS runtime-cuda ARG CCB_DB_NAME=ccb ARG CCB_DB_USER=ccbuser ARG CCB_DB_PASS=ccbpass -ENV CCB_DB_NAME ${CCB_DB_NAME} -ENV CCB_DB_USER ${CCB_DB_USER} -ENV CCB_DB_PASS ${CCB_DB_PASS} -ENV DEBIAN_FRONTEND noninteractive -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute -ENV AA_DOCKER_ENV 1 +ENV CCB_DB_NAME=${CCB_DB_NAME} +ENV CCB_DB_USER=${CCB_DB_USER} +ENV CCB_DB_PASS=${CCB_DB_PASS} +ENV DEBIAN_FRONTEND=noninteractive +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute +ENV AA_DOCKER_ENV=1 -# Set working directory WORKDIR /app -# Install dependencies ADD dockerfile_scripts/install_deps.sh dockerfile_scripts/install_deps.sh RUN ./dockerfile_scripts/install_deps.sh ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh @@ -30,19 +197,82 @@ RUN ./dockerfile_scripts/install_frpc.sh RUN apt-get autoclean ADD dockerfile_scripts/entrypoint.sh dockerfile_scripts/entrypoint.sh -# Restore interactivity -ENV DEBIAN_FRONTEND dialog +ENV DEBIAN_FRONTEND=dialog + +# 
Install llama_cpp_python from the CUDA builder wheel +COPY --from=llama-builder-cuda /wheels /wheels +RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel \ + && python3 -m pip install --no-cache-dir --no-index --find-links=/wheels llama-cpp-python \ + && rm -rf /wheels \ + && pip cache purge -# Copy requirements files COPY requirements.txt . +RUN sed -i '/^llama_cpp_python/d' requirements.txt \ + && python3 -m pip install --no-cache-dir -r requirements.txt \ + && python3 -m pip cache purge -# Install requirements -RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel -RUN python3 -m pip install --no-cache-dir https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.13-cu122/llama_cpp_python-0.3.13-cp311-cp311-linux_x86_64.whl -RUN sed -i '/llama_cpp_python/d' requirements.txt -RUN python3 -m pip install --no-cache-dir -r requirements.txt && python3 -m pip cache purge +COPY context_chat_backend context_chat_backend +COPY main.py . +COPY main_em.py . +COPY config.?pu.yaml . +COPY logger_config*.yaml . +COPY hwdetect.sh . +COPY harp_connect.sh . +COPY supervisord.conf /etc/supervisor/supervisord.conf + +ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/supervisord.conf"] + +# ============================================================ +# Vulkan (AMD / Intel / any Vulkan-capable GPU) runtime +# Run with: --device /dev/dri (and optionally --device /dev/kfd for AMD) +# The RADV Mesa driver (mesa-vulkan-drivers) is included and covers +# GCN, RDNA1/2/3 and newer AMD GPUs out of the box. 
+# ============================================================ +FROM ubuntu:22.04 AS runtime-vulkan + +ARG CCB_DB_NAME=ccb +ARG CCB_DB_USER=ccbuser +ARG CCB_DB_PASS=ccbpass + +ENV CCB_DB_NAME=${CCB_DB_NAME} +ENV CCB_DB_USER=${CCB_DB_USER} +ENV CCB_DB_PASS=${CCB_DB_PASS} +ENV DEBIAN_FRONTEND=noninteractive +ENV AA_DOCKER_ENV=1 + +WORKDIR /app + +ADD dockerfile_scripts/install_deps.sh dockerfile_scripts/install_deps.sh +RUN ./dockerfile_scripts/install_deps.sh +ADD dockerfile_scripts/install_py11.sh dockerfile_scripts/install_py11.sh +RUN ./dockerfile_scripts/install_py11.sh +ADD dockerfile_scripts/pgsql dockerfile_scripts/pgsql +RUN ./dockerfile_scripts/pgsql/install.sh +ADD dockerfile_scripts/install_frpc.sh dockerfile_scripts/install_frpc.sh +RUN ./dockerfile_scripts/install_frpc.sh +RUN apt-get autoclean +ADD dockerfile_scripts/entrypoint.sh dockerfile_scripts/entrypoint.sh + +# Install Vulkan runtime + AMD RADV open-source driver +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libvulkan1 mesa-vulkan-drivers \ + && rm -rf /var/lib/apt/lists/* + +ENV DEBIAN_FRONTEND=dialog + +# Install llama_cpp_python from the Vulkan builder wheel +COPY --from=llama-builder-vulkan /wheels /wheels +RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel \ + && python3 -m pip install --no-cache-dir --no-index --find-links=/wheels llama-cpp-python \ + && rm -rf /wheels \ + && pip cache purge + +COPY requirements.txt . +RUN sed -i '/^llama_cpp_python/d' requirements.txt \ + && python3 -m pip install --no-cache-dir -r requirements.txt \ + && python3 -m pip cache purge -# Copy application files COPY context_chat_backend context_chat_backend COPY main.py . COPY main_em.py . @@ -53,3 +283,5 @@ COPY harp_connect.sh . 
COPY supervisord.conf /etc/supervisor/supervisord.conf ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/supervisord.conf"] + +FROM runtime-cpu AS final From 10092cb40bbd54cd40db77c6d37f27c99241dda2 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 14:20:23 +0530 Subject: [PATCH 77/96] feat(ci): add kubernetes integration test Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 421 +++++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 .github/workflows/integration-test-k8s.yml diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml new file mode 100644 index 0000000..12e3a11 --- /dev/null +++ b/.github/workflows/integration-test-k8s.yml @@ -0,0 +1,421 @@ +# SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later + +name: Integration test k8s + +on: + pull_request: + push: + branches: + - master + - stable* + +permissions: + contents: read + +concurrency: + group: integration-test-k8s-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + + +jobs: + changes: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + + outputs: + src: ${{ steps.changes.outputs.src}} + + steps: + - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: changes + continue-on-error: true + with: + filters: | + src: + - 'main.py' + - 'main_em.py' + - 'config.cpu.yaml' + - 'config.gpu.yaml' + - 'context_chat_backend/**' + - 'appinfo/**' + - 'example.env' + - 'hwdetect.sh' + - 'persistent_storage/**' + - 'project.toml' + - 'requirements.txt' + - 'logger_config.k8s.yaml' + - 'supervisord.conf' + - '.github/workflows/integration-test-k8s.yml' + + integration: + runs-on: ubuntu-24.04 + + needs: changes + if: needs.changes.outputs.src != 'false' + + strategy: + # do not stop on another job's failure + fail-fast: false + matrix: + php-versions: [ '8.2' ] + databases: 
[ 'pgsql' ] + server-versions: [ 'stable32', 'stable33', 'master' ] + + name: Integration test k8s on ${{ matrix.server-versions }} php@${{ matrix.php-versions }} + + env: + MYSQL_PORT: 4444 + PGSQL_PORT: 4445 + # use the same db for ccb and nextcloud + CCB_DB_URL: postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud + HP_SHARED_KEY: test_shared_key_12345 + + services: + mysql: + image: mariadb:10.5 + ports: + - 4444:3306/tcp + env: + MYSQL_ROOT_PASSWORD: rootpassword + options: --health-cmd="mysqladmin ping" --health-interval 5s --health-timeout 2s --health-retries 5 + postgres: + image: pgvector/pgvector:pg17 + ports: + - 4445:5432/tcp + env: + POSTGRES_USER: root + POSTGRES_PASSWORD: rootpassword + POSTGRES_DB: nextcloud + options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 --name postgres --hostname postgres + + steps: + - name: Checkout server + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + repository: nextcloud/server + ref: ${{ matrix.server-versions }} + submodules: 'recursive' + persist-credentials: false + + - name: Set up php ${{ matrix.php-versions }} + uses: shivammathur/setup-php@9e72090525849c5e82e596468b86eb55e9cc5401 # v2 + with: + php-version: ${{ matrix.php-versions }} + tools: phpunit + extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_mysql, pdo_sqlite, pgsql, pdo_pgsql, gd, zip + + - name: Checkout context_chat php app + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + repository: nextcloud/context_chat + path: apps/context_chat + persist-credentials: false + + - name: Checkout backend + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + path: context_chat_backend/ + persist-credentials: false + + - name: Get app version + id: appinfo + uses: skjnldsv/xpath-action@7e6a7c379d0e9abc8acaef43df403ab4fc4f770c # master + with: + filename: context_chat_backend/appinfo/info.xml + expression: 
"/info/version/text()" + + - name: Set up Nextcloud MYSQL + if: ${{ matrix.databases != 'pgsql'}} + run: | + sleep 25 + mkdir data + ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$MYSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password + composer run serve & + + - name: Set up Nextcloud PGSQL + if: ${{ matrix.databases == 'pgsql'}} + run: | + sleep 25 + mkdir data + ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password + composer run serve & + + - name: Enable context_chat, app_api and testing + run: ./occ app:enable -vvv -f context_chat app_api testing + + - name: Checkout documentation + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + repository: nextcloud/documentation + path: data/admin/files/documentation + persist-credentials: false + + - name: Prepare docs + run: | + cd data/admin/files + mv documentation/admin_manual . + cp -R documentation/developer_manual . + cd developer_manual + find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.md"' {} \; + cd .. + cp -R documentation/developer_manual ./developer_manual2 + cd developer_manual2 + find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.txt"' {} \; + cd .. 
+ rm -rf documentation + + - name: Run files scan + run: | + ./occ files:scan --all + + - name: Install k3s + run: | + curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable traefik --disable servicelb" sh - + sudo chmod 644 /etc/rancher/k3s/k3s.yaml + echo "KUBECONFIG=/etc/rancher/k3s/k3s.yaml" >> $GITHUB_ENV + + - name: Wait for k3s and create namespace + run: | + kubectl wait --for=condition=Ready node --all --timeout=120s + kubectl create namespace nextcloud-exapps + NODE_IP=$(kubectl get node -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}') + echo "NODE_IP=${NODE_IP}" >> $GITHUB_ENV + echo "k3s node IP: $NODE_IP" + + - name: Configure Nextcloud for k3s networking + run: | + ./occ config:system:set overwrite.cli.url --value "http://${{ env.NODE_IP }}" --type=string + ./occ config:system:set trusted_domains 1 --value "${{ env.NODE_IP }}" + + - name: Create K8s service account for HaRP + run: | + kubectl -n nextcloud-exapps create serviceaccount harp-sa + kubectl create clusterrolebinding harp-admin \ + --clusterrole=cluster-admin \ + --serviceaccount=nextcloud-exapps:harp-sa + K3S_TOKEN=$(kubectl -n nextcloud-exapps create token harp-sa --duration=2h) + echo "K3S_TOKEN=${K3S_TOKEN}" >> $GITHUB_ENV + + - name: Set up QEMU + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 + with: + cache-image: false + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3 + with: + cache-binary: false + + - name: Build the context_chat_backend cpu image + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6 + with: + context: context_chat_backend + push: false + platforms: linux/amd64 + tags: ccb-cpu:latest + target: runtime-cpu + load: true + + - name: Pre-load CCB ExApp image into k3s + run: docker save ccb-cpu:latest | sudo k3s ctr images import - + + - name: Start HaRP with K8s backend + run: | + docker run --net host --name 
appapi-harp \ + -e HP_SHARED_KEY="${{ env.HP_SHARED_KEY }}" \ + -e NC_INSTANCE_URL="http://${{ env.NODE_IP }}" \ + -e HP_LOG_LEVEL="debug" \ + -e HP_K8S_ENABLED="true" \ + -e HP_K8S_API_SERVER="https://127.0.0.1:6443" \ + -e HP_K8S_BEARER_TOKEN="${{ env.K3S_TOKEN }}" \ + -e HP_K8S_NAMESPACE="nextcloud-exapps" \ + -e HP_K8S_VERIFY_SSL="false" \ + --restart unless-stopped \ + -d ghcr.io/nextcloud/nextcloud-appapi-harp:latest + + - name: Start nginx proxy + run: | + docker run --net host --name nextcloud --rm \ + -v $(pwd)/apps/app_api/tests/simple-nginx-NOT-FOR-PRODUCTION.conf:/etc/nginx/conf.d/default.conf:ro \ + -d nginx + + - name: Wait for HaRP K8s readiness + run: | + for i in $(seq 1 30); do + if curl -sf http://${{ env.NODE_IP }}:8780/exapps/app_api/info \ + -H "harp-shared-key: ${{ env.HP_SHARED_KEY }}" 2>/dev/null | grep -q '"kubernetes"'; then + echo "HaRP is ready with K8s backend" + exit 0 + fi + echo "Waiting for HaRP... ($i/30)" + sleep 2 + done + echo "HaRP K8s readiness check failed" + docker logs appapi-harp + exit 1 + + - name: Register K8s daemon + run: | + ./occ app_api:daemon:register \ + k8s_test "K8s Test" "kubernetes-install" "http" "${{ env.NODE_IP }}:8780" "http://${{ env.NODE_IP }}" \ + --harp --harp_shared_key "${{ env.HP_SHARED_KEY }}" \ + --k8s --k8s_expose_type=nodeport --set-default + ./occ app_api:daemon:list + + - name: Register backend + run: | + sed -i 's;.*;ccb-cpu;' appinfo/info.xml + sed -i 's;.*;latest;' appinfo/info.xml + timeout 120 ./occ app_api:app:register context_chat_backend k8s_test --info-xml context_chat_backend/appinfo/info.xml + + - name: Run cron jobs + run: | + # every 10 seconds indefinitely + while true; do + php cron.php + sleep 10 + done & + sleep 30 + # list all the bg jobs + ./occ background-job:list + + - name: Initial dump of DB with context_chat_queue populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud + + - name: Periodically check context_chat stats for 15 minutes to 
allow the backend to index the files + run: | + success=0 + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" + for i in {1..90}; do + echo "Checking stats, attempt $i..." + + stats_err=$(mktemp) + stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") + stats_exit=$? + echo "Stats output:" + echo "$stats" + if [ -s "$stats_err" ]; then + echo "Stderr:" + cat "$stats_err" + fi + echo "---" + rm -f "$stats_err" + + # Check for critical errors in output + if [ $stats_exit -ne 0 ] || echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected (exit=$stats_exit), retrying..." + sleep 10 + continue + fi + + # Extract total eligible files + total_eligible_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") + + # Extract indexed documents count (files__default) + indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") + + echo "Total eligible files: $total_eligible_files" + echo "Indexed documents (files__default): $indexed_count" + + diff=$((total_eligible_files - indexed_count)) + threshold=$((total_eligible_files * 3 / 100)) + + # Check if difference is within tolerance + if [ $diff -le $threshold ]; then + echo "Indexing within 3% tolerance (diff=$diff, threshold=$threshold)" + success=1 + break + else + progress=$((diff * 100 / total_eligible_files)) + echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" + fi + + sleep 10 + done + + echo "::endgroup::" + + if [ $success -ne 1 ]; then + echo "Max attempts reached" + exit 1 + fi + + - name: Run the prompts + run: | + ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker1_logs 2>&1 & + ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker2_logs 2>&1 & + + OUT1=$(./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?") + echo "$OUT1" + echo 
'--------------------------------------------------' + OUT2=$(./occ context_chat:prompt admin "Welche Faktoren beeinflussen das Ethical AI Rating?") + echo "$OUT2" + + echo "$OUT1" | grep -q "If all of these points are met, we give a Green label." || exit 1 + echo "$OUT2" | grep -q "If all of these points are met, we give a Green label." || exit 1 + + - name: Final dump of DB with vectordb populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud + + - name: Show server logs + if: always() + run: | + cat data/nextcloud.log + + - name: Show context_chat specific logs + if: always() + run: | + cat data/context_chat.log + + - name: Show task processing worker logs + if: always() + run: | + tail -v -n +1 worker?_logs || echo "No worker logs" + + - name: Show main app indexing logs + if: always() + run: | + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-indexing + + - name: Show main app updates processing logs + if: always() + run: | + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-updatesproc + + - name: Show main app request processing logs + if: always() + run: | + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-requestproc + + - name: Upload database dumps + uses: actions/upload-artifact@v4 + with: + name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} + path: | + /tmp/0_pgdump_nextcloud + /tmp/1_pgdump_nextcloud + + - name: Final stats log + run: | + ./occ context_chat:stats + ./occ context_chat:stats --json + + summary: + permissions: + contents: none + runs-on: ubuntu-latest-low + needs: [changes, integration] + + if: always() + + # This is the summary, we just avoid to rename it so that branch protection rules still match + name: integration-test-k8s + + steps: + - name: Summary status + run: if ${{ needs.changes.outputs.src != 'false' && needs.integration.result != 'success' }}; then exit 1; fi From 68199023559bc62819f617f84054efdc3e09285f Mon 
Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 15:01:31 +0530 Subject: [PATCH 78/96] fix: checkout app_api Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 7 +++++++ .github/workflows/integration-test.yml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 12e3a11..15fc132 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -120,6 +120,13 @@ jobs: path: context_chat_backend/ persist-credentials: false + - name: Checkout app_api + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + repository: nextcloud/app_api + path: apps/app_api + persist-credentials: false + - name: Get app version id: appinfo uses: skjnldsv/xpath-action@7e6a7c379d0e9abc8acaef43df403ab4fc4f770c # master diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index d30073a..69d5945 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -122,6 +122,13 @@ jobs: path: context_chat_backend/ persist-credentials: false + - name: Checkout app_api + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + repository: nextcloud/app_api + path: apps/app_api + persist-credentials: false + - name: Get app version id: appinfo uses: skjnldsv/xpath-action@7e6a7c379d0e9abc8acaef43df403ab4fc4f770c # master From 2376535191016713e109c620ce08e341602871f2 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 15:21:34 +0530 Subject: [PATCH 79/96] fix: cache docker build image Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 6 ++---- .github/workflows/integration-test.yml | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 15fc132..2e751f5 
100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -208,13 +208,9 @@ jobs: - name: Set up QEMU uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 - with: - cache-image: false - name: Set up Docker Buildx uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3 - with: - cache-binary: false - name: Build the context_chat_backend cpu image uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6 @@ -225,6 +221,8 @@ jobs: tags: ccb-cpu:latest target: runtime-cpu load: true + cache-from: type=registry,ref=nextcloud/context_chat_backend:cpu-build-cache,compression=zstd,mode=max + cache-to: type=registry,ref=nextcloud/context_chat_backend:cpu-build-cache - name: Pre-load CCB ExApp image into k3s run: docker save ccb-cpu:latest | sudo k3s ctr images import - diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 69d5945..7c8a4cf 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -113,8 +113,6 @@ jobs: repository: nextcloud/context_chat path: apps/context_chat persist-credentials: false - # todo: remove later - ref: feat/reverse-content-flow - name: Checkout backend uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 From cf6ba4c7da78e2587b37af00658da787fd7f128e Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 15:53:19 +0530 Subject: [PATCH 80/96] fix: correct info.xml path + register command fixes Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 2e751f5..ea0b051 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -69,8 +69,6 @@ jobs: env: MYSQL_PORT: 4444 PGSQL_PORT: 
4445 - # use the same db for ccb and nextcloud - CCB_DB_URL: postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud HP_SHARED_KEY: test_shared_key_12345 services: @@ -81,6 +79,7 @@ jobs: env: MYSQL_ROOT_PASSWORD: rootpassword options: --health-cmd="mysqladmin ping" --health-interval 5s --health-timeout 2s --health-retries 5 + # use the same db for ccb and nextcloud postgres: image: pgvector/pgvector:pg17 ports: @@ -272,9 +271,12 @@ jobs: - name: Register backend run: | - sed -i 's;.*;ccb-cpu;' appinfo/info.xml - sed -i 's;.*;latest;' appinfo/info.xml - timeout 120 ./occ app_api:app:register context_chat_backend k8s_test --info-xml context_chat_backend/appinfo/info.xml + sed -i 's;.*;ccb-cpu;' context_chat_backend/appinfo/info.xml + sed -i 's;.*;latest;' context_chat_backend/appinfo/info.xml + timeout 120 ./occ app_api:app:register context_chat_backend k8s_test \ + --info-xml context_chat_backend/appinfo/info.xml \ + --env EXTERNAL_DB="postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud" \ + --wait-finish - name: Run cron jobs run: | From 7413e5e1195812bd58974f1f7c8ba414675291bf Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 16:05:29 +0530 Subject: [PATCH 81/96] fix: use gha as cache backend for docker images Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index ea0b051..b944d55 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -211,6 +211,13 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - 
name: Build the context_chat_backend cpu image uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6 with: @@ -220,8 +227,8 @@ jobs: tags: ccb-cpu:latest target: runtime-cpu load: true - cache-from: type=registry,ref=nextcloud/context_chat_backend:cpu-build-cache,compression=zstd,mode=max - cache-to: type=registry,ref=nextcloud/context_chat_backend:cpu-build-cache + cache-from: type=gha + cache-to: type=gha,mode=max - name: Pre-load CCB ExApp image into k3s run: docker save ccb-cpu:latest | sudo k3s ctr images import - From 4f86758735d16f51e4da612ac8c06203ac8bab60 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 16:30:41 +0530 Subject: [PATCH 82/96] fix: replace role names in info.xml Signed-off-by: Anupam Kumar --- appinfo/info.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appinfo/info.xml b/appinfo/info.xml index c65a8e8..e589638 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -84,13 +84,13 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve - rp + requestproc Request Processing Mode APP_ROLE=requestproc true - up + updatesproc Metadata Updates Processing Mode APP_ROLE=updatesproc false From 9ca5dd7d5e69add0dab4334f9e89484feb8a52c3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 16:47:29 +0530 Subject: [PATCH 83/96] fix: use local tag so image is not pulled from remote Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index b944d55..a018784 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -224,14 +224,15 @@ jobs: context: context_chat_backend push: false platforms: linux/amd64 - tags: ccb-cpu:latest + # use local tag so image is not pulled from remote + tags: ccb-cpu:local target: runtime-cpu 
load: true cache-from: type=gha cache-to: type=gha,mode=max - name: Pre-load CCB ExApp image into k3s - run: docker save ccb-cpu:latest | sudo k3s ctr images import - + run: docker save ccb-cpu:local | sudo k3s ctr images import - - name: Start HaRP with K8s backend run: | @@ -279,7 +280,7 @@ jobs: - name: Register backend run: | sed -i 's;.*;ccb-cpu;' context_chat_backend/appinfo/info.xml - sed -i 's;.*;latest;' context_chat_backend/appinfo/info.xml + sed -i 's;.*;local;' context_chat_backend/appinfo/info.xml timeout 120 ./occ app_api:app:register context_chat_backend k8s_test \ --info-xml context_chat_backend/appinfo/info.xml \ --env EXTERNAL_DB="postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud" \ From 41c85fef99dec2cb742d96949150f99fba923e5b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 17:04:52 +0530 Subject: [PATCH 84/96] chore: show HaRP container's logs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index a018784..ab9e439 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -392,6 +392,11 @@ jobs: run: | tail -v -n +1 worker?_logs || echo "No worker logs" + - name: Show HaRP logs + if: always() + run: | + docker logs appapi-harp + - name: Show main app indexing logs if: always() run: | From cfbc2a17b7c0889376a9ebc8d1bc967905a20de4 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 23 Apr 2026 17:41:16 +0530 Subject: [PATCH 85/96] fix: add ghcr.io to the docker image name Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index ab9e439..d2939ed 100644 --- a/.github/workflows/integration-test-k8s.yml +++ 
b/.github/workflows/integration-test-k8s.yml @@ -225,14 +225,14 @@ jobs: push: false platforms: linux/amd64 # use local tag so image is not pulled from remote - tags: ccb-cpu:local + tags: ghcr.io/ccb-cpu:local target: runtime-cpu load: true cache-from: type=gha cache-to: type=gha,mode=max - name: Pre-load CCB ExApp image into k3s - run: docker save ccb-cpu:local | sudo k3s ctr images import - + run: docker save ghcr.io/ccb-cpu:local | sudo k3s ctr images import - - name: Start HaRP with K8s backend run: | From 25ba6880d58ebac384e0f9a5d1eb2742a31354fb Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Thu, 23 Apr 2026 15:24:19 +0200 Subject: [PATCH 86/96] tests(k8s): make php listen on all interfaces Signed-off-by: Marcel Klehr --- .github/workflows/integration-test-k8s.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index d2939ed..953892b 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -139,7 +139,6 @@ jobs: sleep 25 mkdir data ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$MYSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password - composer run serve & - name: Set up Nextcloud PGSQL if: ${{ matrix.databases == 'pgsql'}} @@ -147,7 +146,6 @@ jobs: sleep 25 mkdir data ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password - composer run serve & - name: Enable context_chat, app_api and testing run: ./occ app:enable -vvv -f context_chat app_api testing @@ -254,6 +252,9 @@ jobs: -v $(pwd)/apps/app_api/tests/simple-nginx-NOT-FOR-PRODUCTION.conf:/etc/nginx/conf.d/default.conf:ro \ 
-d nginx + - name: Start nextcloud + run: PHP_CLI_SERVER_WORKERS=2 php -S 0.0.0.0:8080 & + - name: Wait for HaRP K8s readiness run: | for i in $(seq 1 30); do From 3df8fdcfe07f2b2cdafea0007371aa1ab2fb59f3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 05:54:00 +0530 Subject: [PATCH 87/96] fix(ci): use NODE_IP to reach the vector db Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 953892b..3e9adb7 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -284,7 +284,7 @@ jobs: sed -i 's;.*;local;' context_chat_backend/appinfo/info.xml timeout 120 ./occ app_api:app:register context_chat_backend k8s_test \ --info-xml context_chat_backend/appinfo/info.xml \ - --env EXTERNAL_DB="postgresql+psycopg://root:rootpassword@localhost:4445/nextcloud" \ + --env EXTERNAL_DB="postgresql+psycopg://root:rootpassword@${{ env.NODE_IP }}:4445/nextcloud" \ --wait-finish - name: Run cron jobs From fd12d841fb0630b6e04f92652487e03beaf5016e Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 07:25:55 +0530 Subject: [PATCH 88/96] fix(ci): increase timeout for context chat stats and handle exit status better Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 4 ++-- .github/workflows/integration-test.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 3e9adb7..cb1aa53 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -310,8 +310,8 @@ jobs: echo "Checking stats, attempt $i..." stats_err=$(mktemp) - stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") - stats_exit=$? 
+ stats_exit=0 + stats=$(timeout 30 ./occ context_chat:stats --json 2>"$stats_err") || stats_exit=$? echo "Stats output:" echo "$stats" if [ -s "$stats_err" ]; then diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 7c8a4cf..ba536b8 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -235,8 +235,8 @@ jobs: echo "Checking stats, attempt $i..." stats_err=$(mktemp) - stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") - stats_exit=$? + stats_exit=0 + stats=$(timeout 30 ./occ context_chat:stats --json 2>"$stats_err") || stats_exit=$? echo "Stats output:" echo "$stats" if [ -s "$stats_err" ]; then From 98d2765df442b8d10ff433e4c21a8bc974fae8c8 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 10:10:54 +0530 Subject: [PATCH 89/96] fix(ci): checkout the correct branch of app_api Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 1 + .github/workflows/integration-test.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index cb1aa53..9511066 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -123,6 +123,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 with: repository: nextcloud/app_api + ref: ${{ matrix.server-versions }} path: apps/app_api persist-credentials: false diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index ba536b8..6ebdd36 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -224,6 +224,7 @@ jobs: ./occ background-job:list - name: Initial dump of DB with context_chat_queue populated + if: always() run: | docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud @@ -365,6 +366,7 @@ jobs: - name: Upload database dumps uses: 
actions/upload-artifact@v4 + if: always() with: name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} path: | @@ -372,6 +374,7 @@ jobs: /tmp/1_pgdump_nextcloud - name: Final stats log + if: always() run: | ./occ context_chat:stats ./occ context_chat:stats --json From 50547915dc8a9a113bb427c958c4b14adf13d7a3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 10:33:32 +0530 Subject: [PATCH 90/96] fix(ci): show all k8s pods' logs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 9511066..fe43f45 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -178,7 +178,7 @@ jobs: - name: Install k3s run: | - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable traefik --disable servicelb" sh - + curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable traefik --disable servicelb --kubelet-arg=container-log-max-size=0" sh - sudo chmod 644 /etc/rancher/k3s/k3s.yaml echo "KUBECONFIG=/etc/rancher/k3s/k3s.yaml" >> $GITHUB_ENV @@ -300,6 +300,7 @@ jobs: ./occ background-job:list - name: Initial dump of DB with context_chat_queue populated + if: always() run: | docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud @@ -402,20 +403,21 @@ jobs: - name: Show main app indexing logs if: always() run: | - kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-indexing + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-indexing --prefix --tail=-1 --ignore-errors - name: Show main app updates processing logs if: always() run: | - kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-updatesproc + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-updatesproc --prefix --tail=-1 --ignore-errors - name: Show main app request 
processing logs if: always() run: | - kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-requestproc + kubectl logs -n nextcloud-exapps -l app=nc-app-context-chat-backend-requestproc --prefix --tail=-1 --ignore-errors - name: Upload database dumps uses: actions/upload-artifact@v4 + if: always() with: name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} path: | @@ -423,6 +425,7 @@ jobs: /tmp/1_pgdump_nextcloud - name: Final stats log + if: always() run: | ./occ context_chat:stats ./occ context_chat:stats --json From f74908b6a00eb4787034807463e57fe5dbef3506 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 10:49:29 +0530 Subject: [PATCH 91/96] fix(ci): app_api branch translation + only run k8s for master Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 4 ++-- .github/workflows/integration-test.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index fe43f45..db3f760 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -62,7 +62,7 @@ jobs: matrix: php-versions: [ '8.2' ] databases: [ 'pgsql' ] - server-versions: [ 'stable32', 'stable33', 'master' ] + server-versions: [ 'master' ] name: Integration test k8s on ${{ matrix.server-versions }} php@${{ matrix.php-versions }} @@ -123,7 +123,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 with: repository: nextcloud/app_api - ref: ${{ matrix.server-versions }} + ref: ${{ matrix.server-versions == 'master' && 'main' || matrix.server-versions }} path: apps/app_api persist-credentials: false diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 6ebdd36..d211d16 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -124,6 +124,7 @@ jobs: uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 with: repository: nextcloud/app_api + ref: ${{ matrix.server-versions == 'master' && 'main' || matrix.server-versions }} path: apps/app_api persist-credentials: false From d3f957558ab0a451cbc557b8a668f17222a7ab2d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 13:16:11 +0530 Subject: [PATCH 92/96] fix(ci): separate prompt responses in groups Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 9 ++++++--- .github/workflows/integration-test.yml | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index db3f760..0438c81 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -367,14 +367,17 @@ jobs: ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker1_logs 2>&1 & ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker2_logs 2>&1 & + echo ::group::English prompt OUT1=$(./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?") echo "$OUT1" - echo '--------------------------------------------------' + echo "$OUT1" | grep -q "If all of these points are met, we give a Green label." || exit 1 + echo ::endgroup:: + + echo ::group::German prompt OUT2=$(./occ context_chat:prompt admin "Welche Faktoren beeinflussen das Ethical AI Rating?") echo "$OUT2" - - echo "$OUT1" | grep -q "If all of these points are met, we give a Green label." || exit 1 echo "$OUT2" | grep -q "If all of these points are met, we give a Green label." 
|| exit 1 + echo ::endgroup:: - name: Final dump of DB with vectordb populated run: | diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index d211d16..6649735 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -299,14 +299,17 @@ jobs: ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker1_logs 2>&1 & ./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' > worker2_logs 2>&1 & + echo ::group::English prompt OUT1=$(./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?") echo "$OUT1" - echo '--------------------------------------------------' + echo "$OUT1" | grep -q "If all of these points are met, we give a Green label." || exit 1 + echo ::endgroup:: + + echo ::group::German prompt OUT2=$(./occ context_chat:prompt admin "Welche Faktoren beeinflussen das Ethical AI Rating?") echo "$OUT2" - - echo "$OUT1" | grep -q "If all of these points are met, we give a Green label." || exit 1 echo "$OUT2" | grep -q "If all of these points are met, we give a Green label." 
|| exit 1 + echo ::endgroup:: - name: Check python memory usage run: | From c9edba6f6396363d1f026a2efe5d6a86abcc14ad Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 28 Apr 2026 13:16:29 +0530 Subject: [PATCH 93/96] fix(ci): always dump db Signed-off-by: Anupam Kumar --- .github/workflows/integration-test-k8s.yml | 1 + .github/workflows/integration-test.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 0438c81..5555f7c 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -380,6 +380,7 @@ jobs: echo ::endgroup:: - name: Final dump of DB with vectordb populated + if: always() run: | docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 6649735..4a6123c 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -330,6 +330,7 @@ jobs: fi - name: Final dump of DB with vectordb populated + if: always() run: | docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud From 7da999a82e62062c0f3c1d348e30a8569f7b5dc0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 29 Apr 2026 18:49:19 +0530 Subject: [PATCH 94/96] fix(context): break the loop after the first chunk does not fit in the context Signed-off-by: Anupam Kumar --- context_chat_backend/chain/context.py | 10 ---------- context_chat_backend/chain/one_shot.py | 6 ++---- context_chat_backend/chain/query_proc.py | 17 ++++++++++------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py index adbac2d..c575d1a 100644 --- a/context_chat_backend/chain/context.py +++ b/context_chat_backend/chain/context.py @@ -32,16 +32,6 @@ def get_context_docs( return vectordb.doc_search(user_id, query, ctx_limit, scope_type, 
scope_list) -def get_context_chunks(context_docs: list[Document]) -> list[str]: - context_chunks = [] - for doc in context_docs: - if title := doc.metadata.get('title'): - context_chunks.append(title) - context_chunks.append(doc.page_content) - - return context_chunks - - def do_doc_search( user_id: str, query: str, diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index 3b4224c..d723e9e 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -8,7 +8,7 @@ from ..dyn_loader import VectorDBLoader from ..types import TConfig -from .context import get_context_chunks, get_context_docs +from .context import get_context_docs from .query_proc import get_pruned_query from .types import ContextException, LLMOutput, ScopeType, SearchResult @@ -56,14 +56,12 @@ def process_context_query( raise ContextException('No documents retrieved, please choose a wider scope of documents to search from') raise ContextException('No documents retrieved, please index a few documents first') - context_chunks = get_context_chunks(context_docs) logger.debug('context retrieved', extra={ 'len(context_docs)': len(context_docs), - 'len(context_chunks)': len(context_chunks), }) output = llm.invoke( - get_pruned_query(llm, app_config, query, template or _LLM_TEMPLATE, context_chunks), + get_pruned_query(llm, app_config, query, template or _LLM_TEMPLATE, context_docs), userid=user_id, ).strip() unique_sources = [SearchResult( diff --git a/context_chat_backend/chain/query_proc.py b/context_chat_backend/chain/query_proc.py index b6a9982..685246b 100644 --- a/context_chat_backend/chain/query_proc.py +++ b/context_chat_backend/chain/query_proc.py @@ -7,6 +7,7 @@ from sys import maxsize as SYS_MAXSIZE from langchain.llms.base import LLM +from langchain.schema import Document from transformers import GPT2Tokenizer from ..types import TConfig @@ -22,7 +23,7 @@ def get_num_tokens(text: str, tokenizer: GPT2Tokenizer) -> int: 
return len(tokenizer.encode(text, max_length=SYS_MAXSIZE, truncation=True)) -def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_chunks: list[str]) -> str: +def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, doc_chunks: list[Document]) -> str: ''' Truncates the input to fit the model's maximum context length and returns the model's prediction @@ -69,19 +70,21 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_ accepted_chunks = [] - while text_chunks and remaining_tokens > 0: - context = text_chunks.pop(0) + for chunk in doc_chunks: + context = f'{chunk.metadata.get("title", "")}:\n\n{chunk.page_content}' context_tokens = get_num_tokens(context, tokenizer) - if context_tokens <= remaining_tokens: - accepted_chunks.append(context) - remaining_tokens -= context_tokens + if context_tokens > remaining_tokens or remaining_tokens <= 0: + break + + accepted_chunks.append(context) + remaining_tokens -= context_tokens logger.debug('pruned query stats', extra={ 'total tokens': n_ctx - remaining_tokens, 'remaining tokens': remaining_tokens, 'accepted chunks': len(accepted_chunks), - 'total chunks': len(text_chunks), + 'total chunks': len(doc_chunks), }) return template.format(context='\n\n'.join(accepted_chunks), question=query) From 4bbb08ce5679056492934f5fe1d862121bdba615 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 29 Apr 2026 18:50:53 +0530 Subject: [PATCH 95/96] chore(context): increase default context size to 16384 Signed-off-by: Anupam Kumar --- context_chat_backend/chain/query_proc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/context_chat_backend/chain/query_proc.py b/context_chat_backend/chain/query_proc.py index 685246b..1fe6827 100644 --- a/context_chat_backend/chain/query_proc.py +++ b/context_chat_backend/chain/query_proc.py @@ -40,7 +40,7 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, doc_c n_ctx = 
llm_config.get('n_ctx') \ or llm_config.get('config', {}).get('context_length') \ or llm_config.get('pipeline_kwargs', {}).get('config', {}).get('max_length') \ - or 8192 + or 16384 # fav: tokens to generate n_gen = llm_config.get('max_tokens') \ From 205dba71e4b69936c765c0b1b8b1776eadb20a57 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 29 Apr 2026 18:55:54 +0530 Subject: [PATCH 96/96] chore: increase context chunks fetched to 30 Signed-off-by: Anupam Kumar --- context_chat_backend/chain/context.py | 2 +- context_chat_backend/chain/one_shot.py | 2 +- context_chat_backend/task_fetcher.py | 2 +- context_chat_backend/vectordb/pgvector.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/chain/context.py b/context_chat_backend/chain/context.py index c575d1a..81a58f9 100644 --- a/context_chat_backend/chain/context.py +++ b/context_chat_backend/chain/context.py @@ -36,7 +36,7 @@ def do_doc_search( user_id: str, query: str, vectordb_loader: VectorDBLoader, - ctx_limit: int = 20, + ctx_limit: int = 30, scope_type: ScopeType | None = None, scope_list: list[str] | None = None, ) -> list[SearchResult]: diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index d723e9e..3bd4557 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -38,7 +38,7 @@ def process_context_query( llm: LLM, app_config: TConfig, query: str, - ctx_limit: int = 20, + ctx_limit: int = 30, scope_type: ScopeType | None = None, scope_list: list[str] | None = None, template: str | None = None, diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a41ae06..baa882d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -60,7 +60,7 @@ TP_CHECK_INTERVAL = 5 TP_CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 TP_CHECK_INTERVAL_ON_ERROR = 15 -CONTEXT_LIMIT=20 +CONTEXT_LIMIT = 30 class ThreadType(Enum): diff --git 
a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index e833356..4b820cd 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -640,7 +640,7 @@ def _similarity_search( session: orm.Session, query: str, chunk_ids: list[str], - k: int = 20, + k: int, ) -> list[Document]: embedding = self.client.embeddings.embed_query(query) collection = self.client.get_collection(session)