@@ -280,24 +280,14 @@ def process_geometry(geometry):
280280 # Extract coordinates from ENVELOPE(minx,maxx,maxy,miny)
281281 minx , maxx , maxy , miny = map (float , envelope_match .groups ())
282282
283- # Validate the envelope coordinates
284- if not _is_valid_envelope (minx , maxx , maxy , miny ):
285- logger .warning (f"Invalid envelope coordinates: { geometry } - skipping" )
283+ # Normalize and validate the envelope coordinates
284+ normalized_geom , error_msg = _normalize_envelope (minx , maxx , maxy , miny )
285+
286+ if normalized_geom is None :
287+ logger .error (f"Invalid envelope { geometry } : { error_msg } - skipping" )
286288 return None
287-
288- # Create a polygon from the envelope coordinates in counterclockwise order
289- return {
290- "type" : "polygon" ,
291- "coordinates" : [
292- [
293- [minx , miny ], # bottom left
294- [maxx , miny ], # bottom right
295- [maxx , maxy ], # top right
296- [minx , maxy ], # top left
297- [minx , miny ], # close the polygon
298- ]
299- ],
300- }
289+
290+ return normalized_geom
301291
302292 # Try to parse as JSON
303293 try :
@@ -330,27 +320,74 @@ def process_geometry(geometry):
330320 return None
331321
332322
333- def _is_valid_envelope (minx , maxx , maxy , miny ):
334- """Validate envelope coordinates."""
335- # Check for valid coordinate ranges
323+ def _normalize_envelope (minx , maxx , maxy , miny ):
324+ """
325+ Normalize and validate envelope coordinates.
326+
327+ Returns:
328+ tuple: (geometry_dict, error_msg) where geometry_dict is None if invalid
329+ """
330+ # Check for valid coordinate ranges first
336331 if not (- 180 <= minx <= 180 ) or not (- 180 <= maxx <= 180 ):
337- return False
332+ return None , f"X coordinates out of range: minx= { minx } , maxx= { maxx } "
338333 if not (- 90 <= miny <= 90 ) or not (- 90 <= maxy <= 90 ):
339- return False
340-
341- # Check for valid envelope bounds (minx <= maxx, miny <= maxy)
342- if minx > maxx or miny > maxy :
343- return False
344-
345- # Check for zero-area polygons (at least one dimension must have non-zero area)
334+ return None , f"Y coordinates out of range: miny={ miny } , maxy={ maxy } "
335+
336+ # Auto-correct inverted coordinates
337+ if minx > maxx :
338+ logger .debug (f"Auto-correcting inverted X coordinates: swapping { minx } and { maxx } " )
339+ minx , maxx = maxx , minx
340+
341+ if miny > maxy :
342+ logger .debug (f"Auto-correcting inverted Y coordinates: swapping { miny } and { maxy } " )
343+ miny , maxy = maxy , miny
344+
345+ # Check if this is actually a point (zero-area envelope)
346346 if minx == maxx and miny == maxy :
347- return False
348-
349- # Check for very small areas that might cause precision issues
350- if abs (maxx - minx ) < 1e-10 or abs (maxy - miny ) < 1e-10 :
351- return False
352-
353- return True
347+ logger .debug (f"Converting zero-area envelope to POINT: ({ minx } , { miny } )" )
348+ return {
349+ "type" : "point" ,
350+ "coordinates" : [minx , miny ]
351+ }, None
352+
353+ # Check for very thin envelopes (essentially lines or near-points)
354+ epsilon = 1e-6 # ~0.11 meters at equator
355+
356+ if abs (maxx - minx ) < epsilon and abs (maxy - miny ) < epsilon :
357+ # Both dimensions tiny - treat as point
358+ center_x = (minx + maxx ) / 2
359+ center_y = (miny + maxy ) / 2
360+ logger .debug (f"Converting near-point envelope to POINT: ({ center_x } , { center_y } )" )
361+ return {
362+ "type" : "point" ,
363+ "coordinates" : [center_x , center_y ]
364+ }, None
365+
366+ elif abs (maxx - minx ) < epsilon :
367+ # Width tiny but height OK - expand width slightly
368+ center_x = (minx + maxx ) / 2
369+ minx = center_x - epsilon
370+ maxx = center_x + epsilon
371+ logger .debug (f"Expanding thin envelope width: { minx } to { maxx } " )
372+
373+ elif abs (maxy - miny ) < epsilon :
374+ # Height tiny but width OK - expand height slightly
375+ center_y = (miny + maxy ) / 2
376+ miny = center_y - epsilon
377+ maxy = center_y + epsilon
378+ logger .debug (f"Expanding thin envelope height: { miny } to { maxy } " )
379+
380+ # Create valid polygon from normalized envelope
381+ return {
382+ "type" : "polygon" ,
383+ "coordinates" : [[
384+ [minx , maxy ], # top-left
385+ [maxx , maxy ], # top-right
386+ [maxx , miny ], # bottom-right
387+ [minx , miny ], # bottom-left
388+ [minx , maxy ], # close the ring
389+ ]]
390+ }, None
354391
355392
356393def _is_valid_point (coords ):
@@ -421,19 +458,92 @@ def _are_points_collinear(points):
421458
422459
423460async def perform_bulk_indexing (bulk_data , index_name , bulk_size = 100 ):
424- """Perform bulk indexing in smaller chunks."""
425- # Split the bulk_data into smaller chunks
426- for i in range (0 , len (bulk_data ), bulk_size ):
427- chunk = bulk_data [i : i + bulk_size ]
461+ """Perform bulk indexing in smaller chunks.
462+
463+ Args:
464+ bulk_data: List of alternating action/document pairs
465+ index_name: Elasticsearch index name
466+ bulk_size: Number of OPERATIONS (not items) per chunk
467+ """
468+ # Each bulk operation consists of 2 items (action + document)
469+ # So we need to step by bulk_size * 2 to avoid splitting operations
470+ chunk_step = bulk_size * 2
471+
472+ total_operations = len (bulk_data ) // 2
473+ total_indexed = 0
474+ total_errors = 0
475+
476+ for i in range (0 , len (bulk_data ), chunk_step ):
477+ chunk = bulk_data [i : i + chunk_step ]
478+
479+ # Skip incomplete chunks (shouldn't happen, but be safe)
480+ if len (chunk ) % 2 != 0 :
481+ logger .warning (f"Skipping incomplete bulk chunk at offset { i } " )
482+ continue
483+
428484 try :
429485 # Perform the bulk operation for the current chunk
430- response = await es .bulk (operations = chunk , index = index_name , refresh = True )
486+ response = await es .bulk (operations = chunk , index = index_name , refresh = False )
487+
431488 # Check for errors in the response
432- if response .get ("errors" ):
433- print (f"Errors occurred during bulk indexing: { response ['items' ]} " )
489+ error_count = 0 # Initialize for this chunk
490+ success_count = 0 # Actually count successes
491+
492+ if response .get ("items" ):
493+ for item in response .get ("items" , []):
494+ # Each item is a dict with a single key (the operation type)
495+ for op_type , op_result in item .items ():
496+ status = op_result .get ("status" , 0 )
497+ if status >= 200 and status < 300 :
498+ success_count += 1
499+ else :
500+ error_count += 1
501+ total_errors += 1
502+ # Log first few errors in detail
503+ if error_count <= 10 :
504+ logger .error (
505+ f"Bulk indexing error for ID { op_result .get ('_id' )} : "
506+ f"Status { status } , "
507+ f"Error: { op_result .get ('error' )} "
508+ )
509+
510+ if error_count > 10 :
511+ logger .error (f"... and { error_count - 10 } more errors in this chunk" )
512+
513+ if error_count > 0 :
514+ logger .error (f"Chunk had { error_count } failed operations out of { len (chunk )// 2 } " )
515+ else :
516+ # No items in response - this is a problem
517+ logger .error (f"Bulk response has no items! Response keys: { list (response .keys ())} " )
518+ error_count = len (chunk ) // 2
519+ total_errors += error_count
520+
521+ total_indexed += success_count
522+
523+ if (i // chunk_step ) % 10 == 0 : # Log every 10 chunks
524+ logger .info (
525+ f"Progress: { total_indexed } /{ total_operations } operations "
526+ f"({ total_errors } errors)"
527+ )
528+
529+ except Exception as e :
530+ logger .error (f"Exception during bulk indexing chunk at offset { i } : { str (e )} " , exc_info = True )
531+ total_errors += len (chunk ) // 2
532+ # Continue with next chunk instead of failing completely
533+
534+ # Final refresh to make all indexed documents searchable
535+ if total_indexed > 0 :
536+ try :
537+ await es .indices .refresh (index = index_name )
434538 except Exception as e :
435- print (f"Exception during bulk indexing: { str (e )} " )
436- # Optionally, implement retry logic here
539+ logger .warning (f"Failed to refresh index: { e } " )
540+
541+ logger .info (
542+ f"Bulk indexing complete: { total_indexed } successful, "
543+ f"{ total_errors } errors out of { total_operations } total operations"
544+ )
545+
546+ return {"indexed" : total_indexed , "errors" : total_errors , "total" : total_operations }
437547
438548
439549async def reindex_resources ():
0 commit comments