@@ -144,8 +144,26 @@ def __init__(
144
144
145
145
class ChunkingSettings (AttrDict [Any ]):
146
146
"""
147
- :arg strategy: (required) The chunking strategy: `sentence` or `word`.
148
- Defaults to `sentence` if omitted.
147
+ :arg strategy: (required) The chunking strategy: `sentence`, `word`,
148
+ `none` or `recursive`. If `strategy` is set to `recursive`,
149
+ you must also specify `max_chunk_size` and either `separators`
150
+ or `separator_group`. Learn more about different chunking
151
+ strategies in the linked documentation. Defaults to `sentence` if
152
+ omitted.
153
+ :arg separator_group: (required) This parameter is only applicable
154
+ when using the `recursive` chunking strategy. Sets a predefined
155
+ list of separators in the saved chunking settings based on the
156
+ selected text type. Values can be `markdown` or `plaintext`.
157
+ Using this parameter is an alternative to manually specifying a
158
+ custom `separators` list.
159
+ :arg separators: (required) A list of strings used as possible split
160
+ points when chunking text with the `recursive` strategy. Each
161
+ string can be a plain string or a regular expression (regex)
162
+ pattern. The system tries each separator in order to split the
163
+ text, starting from the first item in the list. After splitting,
164
+ it attempts to recombine smaller pieces into larger chunks that
165
+ stay within the `max_chunk_size` limit, to reduce the total number
166
+ of chunks generated.
149
167
:arg max_chunk_size: (required) The maximum size of a chunk in words.
150
168
This value cannot be higher than `300` or lower than `20` (for
151
169
`sentence` strategy) or `10` (for `word` strategy). Defaults to
@@ -160,6 +178,8 @@ class ChunkingSettings(AttrDict[Any]):
160
178
"""
161
179
162
180
strategy : Union [str , DefaultType ]
181
+ separator_group : Union [str , DefaultType ]
182
+ separators : Union [Sequence [str ], DefaultType ]
163
183
max_chunk_size : Union [int , DefaultType ]
164
184
overlap : Union [int , DefaultType ]
165
185
sentence_overlap : Union [int , DefaultType ]
@@ -168,13 +188,19 @@ def __init__(
168
188
self ,
169
189
* ,
170
190
strategy : Union [str , DefaultType ] = DEFAULT ,
191
+ separator_group : Union [str , DefaultType ] = DEFAULT ,
192
+ separators : Union [Sequence [str ], DefaultType ] = DEFAULT ,
171
193
max_chunk_size : Union [int , DefaultType ] = DEFAULT ,
172
194
overlap : Union [int , DefaultType ] = DEFAULT ,
173
195
sentence_overlap : Union [int , DefaultType ] = DEFAULT ,
174
196
** kwargs : Any ,
175
197
):
176
198
if strategy is not DEFAULT :
177
199
kwargs ["strategy" ] = strategy
200
+ if separator_group is not DEFAULT :
201
+ kwargs ["separator_group" ] = separator_group
202
+ if separators is not DEFAULT :
203
+ kwargs ["separators" ] = separators
178
204
if max_chunk_size is not DEFAULT :
179
205
kwargs ["max_chunk_size" ] = max_chunk_size
180
206
if overlap is not DEFAULT :
@@ -5198,9 +5224,11 @@ def buckets_as_dict(self) -> Mapping[str, "FiltersBucket"]:
5198
5224
class FiltersBucket (AttrDict [Any ]):
5199
5225
"""
5200
5226
:arg doc_count: (required)
5227
+ :arg key:
5201
5228
"""
5202
5229
5203
5230
doc_count : int
5231
+ key : str
5204
5232
5205
5233
5206
5234
class FrequentItemSetsAggregate (AttrDict [Any ]):
0 commit comments