# Constants for OpenAI vector stores.
CHUNK_MULTIPLIER = 5

# Throttle interval for expired-file-batch cleanup: one day, in seconds.
FILE_BATCH_CLEANUP_INTERVAL_SECONDS = 24 * 60 * 60

# Maximum concurrent file processing within a batch.
MAX_CONCURRENT_FILES_PER_BATCH = 5

# Process files in chunks of this size (2x concurrency).
FILE_BATCH_CHUNK_SIZE = 10

# Version tag embedded in every KV-store key prefix derived below.
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_dbs:{VERSION}::"
# --- OpenAIVectorStoreMixin class-level attribute declarations ---

# Persistent KV store backing vector-store metadata; None until configured.
kvstore: KVStore | None
# Track last cleanup time to throttle cleanup operations.
_last_file_batch_cleanup_time: int
# Track running file batch processing tasks, keyed by batch id, so they can
# be looked up and cancelled while a batch is in progress.
_file_batch_tasks: dict[str, asyncio.Task[None]]
80
84
81
85
async def _save_openai_vector_store (self , store_id : str , store_info : dict [str , Any ]) -> None :
82
86
"""Save vector store metadata to persistent storage."""
@@ -224,12 +228,14 @@ async def _resume_incomplete_batches(self) -> None:
224
228
if batch_info ["status" ] == "in_progress" :
225
229
logger .info (f"Resuming incomplete file batch: { batch_id } " )
226
230
# Restart the background processing task
227
- asyncio .create_task (self ._process_file_batch_async (batch_id , batch_info ))
231
+ task = asyncio .create_task (self ._process_file_batch_async (batch_id , batch_info ))
232
+ self ._file_batch_tasks [batch_id ] = task
228
233
229
234
async def initialize_openai_vector_stores(self) -> None:
    """Populate the in-memory caches from persistent storage.

    Loads previously saved vector stores and file batches, prepares the
    task-tracking dict, restarts any batches left in progress by a prior
    process, and resets the cleanup throttle.
    """
    self.openai_vector_stores = await self._load_openai_vector_stores()
    self.openai_file_batches = await self._load_openai_vector_store_file_batches()
    # Must exist before resuming: resumed batches register their tasks here.
    self._file_batch_tasks = {}
    await self._resume_incomplete_batches()
    self._last_file_batch_cleanup_time = 0
@@ -935,7 +941,8 @@ async def openai_create_vector_store_file_batch(
935
941
await self ._save_openai_vector_store_file_batch (batch_id , batch_info )
936
942
937
943
# Start background processing of files
938
- asyncio .create_task (self ._process_file_batch_async (batch_id , batch_info ))
944
+ task = asyncio .create_task (self ._process_file_batch_async (batch_id , batch_info ))
945
+ self ._file_batch_tasks [batch_id ] = task
939
946
940
947
# Run cleanup if needed (throttled to once every 1 day)
941
948
current_time = int (time .time ())
@@ -946,50 +953,110 @@ async def openai_create_vector_store_file_batch(
946
953
947
954
return batch_object
948
955
949
- async def _process_file_batch_async (
956
+ async def _process_files_with_concurrency (
950
957
self ,
958
+ file_ids : list [str ],
959
+ vector_store_id : str ,
960
+ attributes : dict [str , Any ],
961
+ chunking_strategy_obj : Any ,
951
962
batch_id : str ,
952
963
batch_info : dict [str , Any ],
953
964
) -> None :
954
- """Process files in a batch asynchronously in the background."""
955
- file_ids = batch_info ["file_ids" ]
956
- attributes = batch_info ["attributes" ]
957
- chunking_strategy = batch_info ["chunking_strategy" ]
958
- vector_store_id = batch_info ["vector_store_id" ]
965
+ """Process files with controlled concurrency and chunking."""
966
+ semaphore = asyncio .Semaphore (MAX_CONCURRENT_FILES_PER_BATCH )
967
+
968
+ async def process_single_file (file_id : str ) -> tuple [str , bool ]:
969
+ """Process a single file with concurrency control."""
970
+ async with semaphore :
971
+ try :
972
+ await self .openai_attach_file_to_vector_store (
973
+ vector_store_id = vector_store_id ,
974
+ file_id = file_id ,
975
+ attributes = attributes ,
976
+ chunking_strategy = chunking_strategy_obj ,
977
+ )
978
+ return file_id , True
979
+ except Exception as e :
980
+ logger .error (f"Failed to process file { file_id } in batch { batch_id } : { e } " )
981
+ return file_id , False
982
+
983
+ # Process files in chunks to avoid creating too many tasks at once
984
+ total_files = len (file_ids )
985
+ for chunk_start in range (0 , total_files , FILE_BATCH_CHUNK_SIZE ):
986
+ chunk_end = min (chunk_start + FILE_BATCH_CHUNK_SIZE , total_files )
987
+ chunk = file_ids [chunk_start :chunk_end ]
988
+
989
+ logger .info (
990
+ f"Processing chunk { chunk_start // FILE_BATCH_CHUNK_SIZE + 1 } of { (total_files + FILE_BATCH_CHUNK_SIZE - 1 ) // FILE_BATCH_CHUNK_SIZE } ({ len (chunk )} files)"
991
+ )
959
992
960
- for file_id in file_ids :
961
- try :
962
- chunking_strategy_adapter : TypeAdapter [VectorStoreChunkingStrategy ] = TypeAdapter (
963
- VectorStoreChunkingStrategy
964
- )
965
- chunking_strategy_obj = chunking_strategy_adapter .validate_python (chunking_strategy )
966
- await self .openai_attach_file_to_vector_store (
967
- vector_store_id = vector_store_id ,
968
- file_id = file_id ,
969
- attributes = attributes ,
970
- chunking_strategy = chunking_strategy_obj ,
971
- )
993
+ async with asyncio .TaskGroup () as tg :
994
+ chunk_tasks = [tg .create_task (process_single_file (file_id )) for file_id in chunk ]
972
995
973
- # Update counts atomically
974
- batch_info ["file_counts" ]["completed" ] += 1
975
- batch_info ["file_counts" ]["in_progress" ] -= 1
996
+ chunk_results = [task .result () for task in chunk_tasks ]
976
997
977
- except Exception as e :
978
- logger .error (f"Failed to process file { file_id } in batch { batch_id } : { e } " )
979
- batch_info ["file_counts" ]["failed" ] += 1
980
- batch_info ["file_counts" ]["in_progress" ] -= 1
998
+ # Update counts after each chunk for progressive feedback
999
+ for _ , success in chunk_results :
1000
+ self ._update_file_counts (batch_info , success = success )
1001
+
1002
+ # Save progress after each chunk
1003
+ await self ._save_openai_vector_store_file_batch (batch_id , batch_info )
1004
+
1005
def _update_file_counts(self, batch_info: dict[str, Any], success: bool) -> None:
    """Move one file out of in_progress into completed or failed."""
    counts = batch_info["file_counts"]
    counts["completed" if success else "failed"] += 1
    counts["in_progress"] -= 1
def _update_batch_status(self, batch_info: dict[str, Any]) -> None:
    """Set the batch's final status from its per-file results.

    Any batch with at least one completed file (or no failures at all) is
    "completed" — partial success counts as completed; only an all-failure
    batch is marked "failed".
    """
    counts = batch_info["file_counts"]
    if counts["completed"] == 0 and counts["failed"] > 0:
        batch_info["status"] = "failed"
    else:
        batch_info["status"] = "completed"
async def _process_file_batch_async(
    self,
    batch_id: str,
    batch_info: dict[str, Any],
) -> None:
    """Process files in a batch asynchronously in the background.

    Runs as a fire-and-forget asyncio task registered in
    `_file_batch_tasks`; that registration is always removed in the
    `finally` below so finished/failed/cancelled tasks are never leaked.

    Fixes vs. previous version:
    - batch-info unpacking and chunking-strategy validation now happen
      inside the try, so a validation error can no longer bypass the
      `finally` and leave a stale entry in `_file_batch_tasks`;
    - the duplicate `_file_batch_tasks.pop` in the CancelledError branch
      is removed — the `finally` already performs that cleanup.
    """
    try:
        file_ids = batch_info["file_ids"]
        attributes = batch_info["attributes"]
        chunking_strategy = batch_info["chunking_strategy"]
        vector_store_id = batch_info["vector_store_id"]
        # Re-validate the persisted (plain-dict) chunking strategy into its typed form.
        chunking_strategy_adapter: TypeAdapter[VectorStoreChunkingStrategy] = TypeAdapter(VectorStoreChunkingStrategy)
        chunking_strategy_obj = chunking_strategy_adapter.validate_python(chunking_strategy)

        # Process all files with controlled concurrency
        await self._process_files_with_concurrency(
            file_ids=file_ids,
            vector_store_id=vector_store_id,
            attributes=attributes,
            chunking_strategy_obj=chunking_strategy_obj,
            batch_id=batch_id,
            batch_info=batch_info,
        )

        # Update and persist the final batch status
        self._update_batch_status(batch_info)
        await self._save_openai_vector_store_file_batch(batch_id, batch_info)

        logger.info(f"File batch {batch_id} processing completed with status: {batch_info['status']}")

    except asyncio.CancelledError:
        logger.info(f"File batch {batch_id} processing was cancelled")
        raise  # Re-raise to ensure proper cancellation propagation
    finally:
        # Always clean up the task reference when processing ends,
        # whether by success, error, or cancellation.
        self._file_batch_tasks.pop(batch_id, None)
def _get_and_validate_batch (self , batch_id : str , vector_store_id : str ) -> dict [str , Any ]:
995
1062
"""Get and validate batch exists and belongs to vector store."""
@@ -1114,6 +1181,15 @@ async def openai_cancel_vector_store_file_batch(
1114
1181
if batch_info ["status" ] not in ["in_progress" ]:
1115
1182
raise ValueError (f"Cannot cancel batch { batch_id } with status { batch_info ['status' ]} " )
1116
1183
1184
+ # Cancel the actual processing task if it exists
1185
+ if batch_id in self ._file_batch_tasks :
1186
+ task = self ._file_batch_tasks [batch_id ]
1187
+ if not task .done ():
1188
+ task .cancel ()
1189
+ logger .info (f"Cancelled processing task for file batch: { batch_id } " )
1190
+ # Remove from task tracking
1191
+ del self ._file_batch_tasks [batch_id ]
1192
+
1117
1193
batch_info ["status" ] = "cancelled"
1118
1194
1119
1195
await self ._save_openai_vector_store_file_batch (batch_id , batch_info )
0 commit comments