Commit 5ba7c8c

Author: Ben Perlmutter (committed)

Fix dataset upload

1 parent ee455cc, commit 5ba7c8c

File tree: 3 files changed (+149 -36 lines)

packages/datasets/README.md

Lines changed: 90 additions & 0 deletions (new file)

# Datasets Package

This package provides utilities for importing, processing, and managing datasets used in the MongoDB Knowledge Service/Chatbot project. It contains both Node.js/TypeScript and Python implementations for various dataset operations.

## Overview

The datasets package is a hybrid TypeScript/Python package that handles:
- Dataset ingestion from various sources (HuggingFace, Atlas, etc.)
- Data processing and transformation pipelines
- MongoDB import/export operations
- Code example extraction and classification
- Natural language query generation
- Database metadata extraction

## Structure

### Node.js/TypeScript Components

Located in the `/src/` directory:

- **Code Example Processing**: Extract and classify code examples from documentation
- **Page Dataset**: Load and process page-based datasets
- **Tree Generation**: Generate hierarchical data structures for NL queries
- **Database Operations**: MongoDB schema generation and database analysis
- **HuggingFace Integration**: Upload datasets to HuggingFace Hub
- **Evaluation**: Braintrust integration for dataset evaluation

### Python/UV Components

Located in the `/mongodb_datasets/` directory:

- **Wikipedia Import**: Import Wikipedia datasets from HuggingFace to MongoDB
- **Atlas Search**: Configure and create Atlas Search indexes
- **Configuration Management**: Environment variable and project configuration

## Installation & Setup

### Node.js Dependencies
```bash
npm install
npm run build
```

### Python Dependencies (using uv)
```bash
# Install Python dependencies
uv sync

# Activate virtual environment
source .venv/bin/activate # or .venv\Scripts\activate on Windows
```

## Usage

### Node.js Scripts

The package provides numerous npm scripts for different dataset operations:

```bash
# Build the project
npm run ...
```

### Python Scripts

The Python components provide CLI tools for dataset import operations:

```bash
# Import Wikipedia dataset (all articles)
uv run ...
```

## Configuration

### Environment Variables

For the required environment variables, see `.env.example` in the project root. Create a `.env` file next to it with the required values.

## Development

### Testing
```bash
# Node.js tests
npm run test

# Linting
npm run lint
npm run lint:fix
```
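
Regarding the Environment Variables step above: the Wikipedia importer changed in this commit falls back to `MONGODB_ATLAS_SEARCH_CONNECTION_URI` when no connection URI is supplied on the command line, so it can be worth confirming the variable is actually set after creating the `.env` file. A minimal pre-flight sketch (illustrative only, not part of the package):

```python
import os

# The Wikipedia importer defaults to this environment variable when no
# connection URI is supplied on the command line.
uri = os.environ.get("MONGODB_ATLAS_SEARCH_CONNECTION_URI")
if not uri:
    raise SystemExit(
        "MONGODB_ATLAS_SEARCH_CONNECTION_URI is not set; "
        "add it to your .env file before running the import scripts"
    )
print("Atlas connection string found")
```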

packages/datasets/mongodb_datasets/atlas_search_dataset_index.jsonc

Lines changed: 22 additions & 26 deletions

```diff
@@ -49,26 +49,15 @@
       "analyzer": "text_analyzer"
     },
     /**
-    URL for filtering and exact matching
+    URL for filtering
     */
     "url": [
-      {
-        /**
-        For exact, case-insensitive matching on the original URL
-        */
-        "type": "string",
-        "analyzer": "lucene.keyword"
-      },
       {
         /**
         For normalized, fuzzy, flexible matching
         */
         "type": "string",
-        "analyzer": "url_normalizer_analyzer",
-        /**
-        This creates a sub-field called "url.normalized"
-        */
-        "name": "normalized"
+        "analyzer": "url_normalizer_analyzer"
       }
     ]
   }
@@ -83,28 +72,32 @@
   {
     "name": "text_analyzer",
     "tokenizer": {
+      /**
+      Standard tokenizer.
+      From the docs: It divides text into terms based on word boundaries,
+      which makes it language-neutral for most use cases.
+      It converts all terms to lower case and removes punctuation.
+      */
       "type": "standard"
     },
     "tokenFilters": [
       /**
-      Convert all text to lowercase
+      Remove accents
       */
       {
-        "type": "lowercase"
+        "type": "icuFolding"
       },
       /**
-      Remove stopwords, such as "the", "and", "is", etc.
+      Remove possessive suffixes, e.g., "John's" -> "John"
       */
       {
-        "type": "stop",
-        "stopwords": "_english_"
+        "type": "englishPossessive"
       },
       /**
       Stem words to their root form, e.g., "running" -> "run"
       */
       {
-        "type": "stemmer",
-        "language": "english"
+        "type": "kStemming"
       }
     ]
   },
@@ -121,25 +114,28 @@
       /**
       Remove http:// or https:// from the beginning
       */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "^(https|http)://",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     },
     {
       /**
       Remove www. from the beginning
      */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "^www\\.",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     },
     {
       /**
       Remove all trailing slashes
       */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "/+$",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     }
   ]
 }
```

packages/datasets/mongodb_datasets/import_wikipedia.py

Lines changed: 37 additions & 10 deletions

```diff
@@ -12,11 +12,12 @@
 import argparse
 from typing import Dict, Any, Optional
 from pymongo import MongoClient
+from pymongo.operations import SearchIndexModel
 from pymongo.collection import Collection
 from pymongo.errors import BulkWriteError
 from datasets import load_dataset
 from datasets.arrow_dataset import Dataset
-from jsonc_parser import JsoncParser
+from jsonc_parser.parser import JsoncParser
 
 from .config import load_environment
 
@@ -63,16 +64,32 @@ def setup_indexes(self) -> None:
             logger.warning(f"Index creation failed (may already exist): {error}")
 
         try:
-            with open("atlas_search_dataset_index.jsonc", "r") as file:
-                jsonc_content = file.read()
+            # Look for the index file in the same directory as this script
+            import pathlib
+            script_dir = pathlib.Path(__file__).parent
+            index_file = script_dir / "atlas_search_dataset_index.jsonc"
 
+
             # Parse JSONC to Python dictionary
-            atlas_search_dataset_index: Dict[str, Any] = JsoncParser.parse_text(jsonc_content)
+            parser = JsoncParser()
+            atlas_search_dataset_index: Dict[str, Any] = parser.parse_file(index_file)
 
-            self.collection.create_search_index(
+            # Create the search index definition without the name
+            definition = {
+                "mappings": atlas_search_dataset_index["mappings"],
+                "analyzers": atlas_search_dataset_index["analyzers"]
+            }
+
+            # Create the search index
+            result = self.collection.create_search_index(model=SearchIndexModel(
                 name=atlas_search_dataset_index["name"],
-                definition=atlas_search_dataset_index["mappings"]
-            )
+                definition=definition,
+                type="search"
+            ))
+            logger.info(f"Search index creation result: {result}")
+            logger.info(f"Created search index {atlas_search_dataset_index['name']} for {DATABASE_NAME}.{COLLECTION_NAME}")
+        except FileNotFoundError:
+            logger.info("Atlas search index file not found, skipping search index creation")
         except Exception as error:
             logger.warning(f"Search index creation failed (may already exist): {error}")
 
@@ -161,13 +178,17 @@ def insert_batch(self, batch: list) -> int:
             self.stats["errors"] += len(batch)
             return 0
 
-    def import_dataset(self, max_documents: Optional[int] = None) -> None:
+    def import_dataset(self, max_documents: Optional[int] = None, only_create_index: bool = False) -> None:
         """Import the Wikipedia dataset to MongoDB."""
-        logger.info(f"Starting Wikipedia dataset import to {DATABASE_NAME}.{COLLECTION_NAME}")
 
+        logger.info(f"Starting building DB indexes for {DATABASE_NAME}.{COLLECTION_NAME}")
         # Setup database
         self.setup_indexes()
+        if only_create_index:
+            logger.info(f"Only creating DB indexes for {DATABASE_NAME}.{COLLECTION_NAME}. Exiting...")
+            return
 
+        logger.info(f"Starting Wikipedia dataset import to {DATABASE_NAME}.{COLLECTION_NAME}")
         # Load dataset
         dataset = self.load_wikipedia_dataset()
 
@@ -257,6 +278,12 @@ def parse_args() -> argparse.Namespace:
         help="MongoDB connection URI (default: uses MONGODB_ATLAS_SEARCH_CONNECTION_URI env var)"
     )
 
+    parser.add_argument(
+        "--only-create-index",
+        action="store_true",
+        help="Only create the Atlas search index, skip importing data"
+    )
+
     return parser.parse_args()
 
 
@@ -286,7 +313,7 @@ def main() -> None:
     logger.info("Connected to MongoDB")
 
     # Import dataset with CLI arguments
-    importer.import_dataset(max_documents=args.max_documents)
+    importer.import_dataset(max_documents=args.max_documents, only_create_index=args.only_create_index)
 
     logger.info("Wikipedia dataset import completed successfully")
 
```
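
Once `setup_indexes` has created the search index defined in `atlas_search_dataset_index.jsonc`, the collection can be queried through Atlas Search's `$search` aggregation stage. The sketch below is illustrative only: the index name, database and collection names, and field paths are assumptions rather than values taken from this commit (the real index name is read from the JSONC file's `name` field).

```python
import os

from pymongo import MongoClient

# Connection string comes from the same env var the importer falls back to.
client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
collection = client["wikipedia"]["articles"]  # placeholder database/collection names

pipeline = [
    {
        "$search": {
            "index": "wikipedia_search",  # placeholder; use the name from the JSONC file
            "text": {
                "query": "distributed databases",
                "path": ["title", "text"],  # assumed fields indexed with text_analyzer
            },
        }
    },
    {"$limit": 5},
    {"$project": {"_id": 0, "title": 1, "url": 1}},
]

for doc in collection.aggregate(pipeline):
    print(doc)
```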
