
Commit 80f0e46

mongodben and Ben Perlmutter authored
(EAI-1230): Create MDB collection and indexes for Atlas Search benchmark (#865)
* dataset python script
* fix
* Fix dataset upload
* new line EOFs
* simpler imports

---------

Co-authored-by: Ben Perlmutter <[email protected]>
1 parent 671a645 commit 80f0e46

File tree

9 files changed: +1779 -2 lines changed

packages/datasets/.env.example
Lines changed: 2 additions & 1 deletion

@@ -9,4 +9,5 @@ HUGGINGFACE_DOCS_CONTENT_REPO="someuser/some-repo"
 HUGGINGFACE_DOCS_CODE_EXAMPLE_REPO="someuser/some-repo"
 MONGODB_CONNECTION_URI="..."
 MONGODB_DATABASE_NAME="docs-chatbot-dev"
-MONGODB_TEXT_TO_CODE_CONNECTION_URI="..."
+MONGODB_TEXT_TO_CODE_CONNECTION_URI="..."
+MONGODB_ATLAS_SEARCH_CONNECTION_URI="..."
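The new `MONGODB_ATLAS_SEARCH_CONNECTION_URI` variable points the Atlas Search benchmark tooling at its own cluster. A minimal sketch of loading it, assuming `python-dotenv` and `pymongo` are installed; only the environment variable names come from `.env.example`, the rest is illustrative:

```python
# Sketch: load the connection settings added above from a local .env file.
# Assumes python-dotenv and pymongo are installed; only the environment
# variable names come from .env.example, the rest is illustrative.
import os

from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()  # reads .env from the current working directory

client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
db = client[os.environ.get("MONGODB_DATABASE_NAME", "docs-chatbot-dev")]
print(db.list_collection_names())
```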

packages/datasets/.gitignore
Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 dataOut/
-__pycache__/
+__pycache__/

packages/datasets/.python-version
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+3.11

packages/datasets/README.md
Lines changed: 90 additions & 0 deletions (new file)

# Datasets Package

This package provides utilities for importing, processing, and managing datasets used in the MongoDB Knowledge Service/Chatbot project. It contains both Node.js/TypeScript and Python implementations for various dataset operations.

## Overview

The datasets package is a hybrid TypeScript/Python package that handles:

- Dataset ingestion from various sources (HuggingFace, Atlas, etc.)
- Data processing and transformation pipelines
- MongoDB import/export operations
- Code example extraction and classification
- Natural language query generation
- Database metadata extraction

## Structure

### Node.js/TypeScript Components

Located in the `/src/` directory:

- **Code Example Processing**: Extract and classify code examples from documentation
- **Page Dataset**: Load and process page-based datasets
- **Tree Generation**: Generate hierarchical data structures for NL queries
- **Database Operations**: MongoDB schema generation and database analysis
- **HuggingFace Integration**: Upload datasets to HuggingFace Hub
- **Evaluation**: Braintrust integration for dataset evaluation

### Python/UV Components

Located in the `/mongodb_datasets/` directory:

- **Wikipedia Import**: Import Wikipedia datasets from HuggingFace to MongoDB
- **Atlas Search**: Configure and create Atlas Search indexes
- **Configuration Management**: Environment variable and project configuration

## Installation & Setup

### Node.js Dependencies

```bash
npm install
npm run build
```

### Python Dependencies (using uv)

```bash
# Install Python dependencies
uv sync

# Activate virtual environment
source .venv/bin/activate  # or .venv\Scripts\activate on Windows
```

## Usage

### Node.js Scripts

The package provides numerous npm scripts for different dataset operations:

```bash
# Build the project
npm run ...
```

### Python Scripts

The Python components provide CLI tools for dataset import operations:

```bash
# Import Wikipedia dataset (all articles)
uv run ...
```

## Configuration

### Environment Variables

For the required environment variables, see `.env.example` in the project root.
Create a `.env` file next to it with those variables set.

## Development

### Testing

```bash
# Node.js tests
npm run test

# Linting
npm run lint
npm run lint:fix
```
Lines changed: 3 additions & 0 deletions (new file)

"""MongoDB Datasets - Utilities for importing datasets into MongoDB."""

__version__ = "0.1.0"
Lines changed: 143 additions & 0 deletions (new file)

/**
Atlas Search index definitions for the `articles` collection.
*/
{
  "name": "article_search",
  "mappings": {
    "dynamic": false,
    /**
    Fields to index:
    - title: Full-text and autocomplete
    - text: Full-text
    - url: Exact and normalized
    */
    "fields": {
      /**
      Title with both full-text and autocomplete capabilities
      */
      "title": [
        {
          "type": "string",
          "analyzer": "lucene.standard"
        },
        /**
        Index optimized for autocomplete/type-ahead search.
        */
        {
          "type": "autocomplete",
          "analyzer": "lucene.standard",
          /**
          Min length of n-grams indexed is 2 characters.
          */
          "minGrams": 2,
          /**
          Max length of n-grams indexed is 15 characters.
          This is a reasonable compromise between search relevance, performance, and storage cost.
          */
          "maxGrams": 15,
          /**
          Fold diacritics to their base characters, e.g., "á" -> "a".
          */
          "foldDiacritics": true
        }
      ],
      /**
      Full-text search over the `text` field, which contains the article content.
      */
      "text": {
        "type": "string",
        "analyzer": "text_analyzer"
      },
      /**
      URL for filtering
      */
      "url": [
        {
          /**
          For normalized, fuzzy, flexible matching
          */
          "type": "string",
          "analyzer": "url_normalizer_analyzer"
        }
      ]
    }
  },
  /**
  Analyzers configuration for better text processing
  */
  "analyzers": [
    /**
    Optimized for text search over full documents in the `text` field
    */
    {
      "name": "text_analyzer",
      "tokenizer": {
        /**
        Standard tokenizer.
        From the docs: it divides text into terms based on word boundaries,
        which makes it language-neutral for most use cases.
        It converts all terms to lower case and removes punctuation.
        */
        "type": "standard"
      },
      "tokenFilters": [
        /**
        Remove accents
        */
        {
          "type": "icuFolding"
        },
        /**
        Remove possessive suffixes, e.g., "John's" -> "John"
        */
        {
          "type": "englishPossessive"
        },
        /**
        Stem words to their root form, e.g., "running" -> "run"
        */
        {
          "type": "kStemming"
        }
      ]
    },
    {
      "name": "url_normalizer_analyzer",
      "tokenizer": {
        "type": "keyword"
      },
      "tokenFilters": [
        {
          "type": "lowercase"
        },
        {
          /**
          Remove http:// or https:// from the beginning
          */
          "type": "regex",
          "pattern": "^(https|http)://",
          "replacement": "",
          "matches": "first"
        },
        {
          /**
          Remove www. from the beginning
          */
          "type": "regex",
          "pattern": "^www\\.",
          "replacement": "",
          "matches": "first"
        },
        {
          /**
          Remove all trailing slashes
          */
          "type": "regex",
          "pattern": "/+$",
          "replacement": "",
          "matches": "first"
        }
      ]
    }
  ]
}
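A definition like the one above can be created programmatically and then queried through the `$search` aggregation stage. A minimal sketch using PyMongo (the `articles` collection, the `article_search` index name, and the field names come from the definition above; the file name and everything else is illustrative, not the package's actual code):

```python
# Sketch: create the search index defined above with PyMongo and run a
# type-ahead query against it. Assumes pymongo >= 4.5 and a comment-free
# JSON copy of the definition; file and variable names are illustrative.
import json
import os

from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
articles = client[os.environ["MONGODB_DATABASE_NAME"]]["articles"]

# The file's top level bundles the index name with its definition, so split
# them apart when building the SearchIndexModel.
with open("article_search_index.json") as f:  # hypothetical file name
    spec = json.load(f)

articles.create_search_index(
    SearchIndexModel(
        definition={"mappings": spec["mappings"], "analyzers": spec["analyzers"]},
        name=spec["name"],  # "article_search"
    )
)

# Autocomplete (type-ahead) query over the `title` mapping.
pipeline = [
    {
        "$search": {
            "index": "article_search",
            "autocomplete": {"query": "mongod", "path": "title"},
        }
    },
    {"$limit": 5},
    {"$project": {"_id": 0, "title": 1, "url": 1}},
]
for doc in articles.aggregate(pipeline):
    print(doc)
```

With the `url_normalizer_analyzer`, a URL such as "https://www.Example.com/docs/" and the string "example.com/docs" normalize to the same keyword token, so URL filters match regardless of protocol, "www." prefix, case, or trailing slashes. Note that Atlas builds search indexes asynchronously, so queries may return no results until the build completes.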
