Merged
Changes from 3 commits
3 changes: 2 additions & 1 deletion packages/datasets/.env.example
@@ -9,4 +9,5 @@ HUGGINGFACE_DOCS_CONTENT_REPO="someuser/some-repo"
HUGGINGFACE_DOCS_CODE_EXAMPLE_REPO="someuser/some-repo"
MONGODB_CONNECTION_URI="..."
MONGODB_DATABASE_NAME="docs-chatbot-dev"
MONGODB_TEXT_TO_CODE_CONNECTION_URI="..."
MONGODB_TEXT_TO_CODE_CONNECTION_URI="..."
MONGODB_ATLAS_SEARCH_CONNECTION_URI="..."
3 changes: 2 additions & 1 deletion packages/datasets/.gitignore
@@ -1 +1,2 @@
dataOut/
dataOut/
__pycache__/
1 change: 1 addition & 0 deletions packages/datasets/.python-version
@@ -0,0 +1 @@
3.11
90 changes: 90 additions & 0 deletions packages/datasets/README.md
@@ -0,0 +1,90 @@
# Datasets Package

This package provides utilities for importing, processing, and managing datasets used in the MongoDB Knowledge Service/Chatbot project. It contains both Node.js/TypeScript and Python implementations for various dataset operations.

## Overview

The datasets package is a hybrid TypeScript/Python package that handles:
- Dataset ingestion from various sources (HuggingFace, Atlas, etc.)
- Data processing and transformation pipelines
- MongoDB import/export operations
- Code example extraction and classification
- Natural language query generation
- Database metadata extraction

## Structure

### Node.js/TypeScript Components

Located in `/src/` directory:

- **Code Example Processing**: Extract and classify code examples from documentation
- **Page Dataset**: Load and process page-based datasets
- **Tree Generation**: Generate hierarchical data structures for NL queries
- **Database Operations**: MongoDB schema generation and database analysis
- **HuggingFace Integration**: Upload datasets to HuggingFace Hub
- **Evaluation**: Braintrust integration for dataset evaluation

### Python/UV Components

Located in `/mongodb_datasets/` directory:

- **Wikipedia Import**: Import Wikipedia datasets from HuggingFace to MongoDB
- **Atlas Search**: Configure and create Atlas Search indexes
- **Configuration Management**: Environment variable and project configuration

## Installation & Setup

### Node.js Dependencies
```bash
npm install
npm run build
```

### Python Dependencies (using uv)
```bash
# Install Python dependencies
uv sync

# Activate virtual environment
source .venv/bin/activate # or .venv\Scripts\activate on Windows
```

## Usage

### Node.js Scripts

The package provides numerous npm scripts for different dataset operations:

```bash
# Build the project
npm run ...
```

### Python Scripts

The Python components provide CLI tools for dataset import operations:

```bash
# Import Wikipedia dataset (all articles)
uv run ...
```

## Configuration

### Environment Variables

For the required environment variables, see `.env.example` in the project root.
Create a `.env` file next to it with those variables set.

## Development

### Testing
```bash
# Node.js tests
npm run test

# Linting
npm run lint
npm run lint:fix
```
3 changes: 3 additions & 0 deletions packages/datasets/mongodb_datasets/__init__.py
@@ -0,0 +1,3 @@
"""MongoDB Datasets - Utilities for importing datasets into MongoDB."""

__version__ = "0.1.0"
Collaborator:

nit - can we add newlines to the end of each file (and ideally track down whatever setting is stripping them out)

143 changes: 143 additions & 0 deletions packages/datasets/mongodb_datasets/atlas_search_dataset_index.jsonc
@@ -0,0 +1,143 @@
/**
  Atlas Search index definitions for the `articles` collection.
*/
{
  "name": "article_search",
  "mappings": {
    "dynamic": false,
    /**
      Fields to index:
      - title: Full-text and autocomplete
      - text: Full-text
      - url: Exact and normalized
    */
    "fields": {
      /**
        Title with both full-text and autocomplete capabilities
      */
      "title": [
        {
          "type": "string",
          "analyzer": "lucene.standard"
        },
        /**
          Index optimized for autocomplete/type-ahead search.
        */
        {
          "type": "autocomplete",
          "analyzer": "lucene.standard",
          /**
            Min length of n-grams indexed is 2 characters.
          */
          "minGrams": 2,
          /**
            Max length of n-grams indexed is 15 characters.
            This is a reasonable compromise between search relevance, performance, and storage cost.
          */
          "maxGrams": 15,
          /**
            Fold diacritics to their base characters, e.g., "á" -> "a".
          */
          "foldDiacritics": true
        }
      ],
      /**
        Full-text search over the `text` field, which contains the article content.
      */
      "text": {
        "type": "string",
        "analyzer": "text_analyzer"
      },
      /**
        URL for filtering
      */
      "url": [
        {
          /**
            For normalized, fuzzy, flexible matching
          */
          "type": "string",
          "analyzer": "url_normalizer_analyzer"
        }
      ]
    }
  },
  /**
    Analyzers configuration for better text processing
  */
  "analyzers": [
    /**
      Optimized for text search over full documents in the `text` field
    */
    {
      "name": "text_analyzer",
      "tokenizer": {
        /**
          Standard tokenizer.
          From the docs: It divides text into terms based on word boundaries,
          which makes it language-neutral for most use cases.
          It converts all terms to lower case and removes punctuation.
        */
        "type": "standard"
      },
      "tokenFilters": [
        /**
          Remove accents
        */
        {
          "type": "icuFolding"
        },
        /**
          Remove possessive suffixes, e.g., "John's" -> "John"
        */
        {
          "type": "englishPossessive"
        },
        /**
          Stem words to their root form, e.g., "running" -> "run"
        */
        {
          "type": "kStemming"
        }
      ]
    },
    {
      "name": "url_normalizer_analyzer",
      "tokenizer": {
        "type": "keyword"
      },
      "tokenFilters": [
        {
          "type": "lowercase"
        },
        {
          /**
            Remove http:// or https:// from the beginning
          */
          "type": "regex",
          "pattern": "^(https|http)://",
          "replacement": "",
          "matches": "first"
        },
        {
          /**
            Remove www. from the beginning
          */
          "type": "regex",
          "pattern": "^www\\.",
          "replacement": "",
          "matches": "first"
        },
        {
          /**
            Remove all trailing slashes
          */
          "type": "regex",
          "pattern": "/+$",
          "replacement": "",
          "matches": "first"
        }
      ]
    }
  ]
}
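For reference, a minimal sketch of how an index definition like this could be applied with PyMongo's `create_search_index`. The `articles` collection name comes from the comment at the top of the file and the connection/database variable names from `.env.example`; the comment-stripping regex and the file path are assumptions about how the `.jsonc` file would be loaded, not code from this PR.

```python
"""Sketch: create the article_search index from atlas_search_dataset_index.jsonc."""
import json
import os
import re

from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

# Strip the /** ... */ block comments so the .jsonc parses as plain JSON.
raw = open("mongodb_datasets/atlas_search_dataset_index.jsonc", encoding="utf-8").read()
spec = json.loads(re.sub(r"/\*\*.*?\*/", "", raw, flags=re.DOTALL))

# Connection details come from the variables shown in .env.example above.
client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
articles = client[os.environ["MONGODB_DATABASE_NAME"]]["articles"]

# "name" is passed separately; "mappings" and "analyzers" form the index definition.
name = spec.pop("name")
articles.create_search_index(SearchIndexModel(definition=spec, name=name))
```

Whether the package drives this through a script or the Atlas UI isn't shown in this diff.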
17 changes: 17 additions & 0 deletions packages/datasets/mongodb_datasets/config.py
@@ -0,0 +1,17 @@
"""Configuration utilities for mongodb_datasets package."""

from pathlib import Path
from dotenv import load_dotenv

# Find the project root .env file by traversing up from this file
# Structure: packages/datasets/mongodb_datasets/config.py -> ../../../.env
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent.parent
ENV_PATH = PROJECT_ROOT / ".env"
Collaborator @nlarew, Aug 11, 2025:

I assume this works but it's a bit confusing to me:

  • The example (i.e. three levels of .. from config.py) would resolve to packages/.env, which doesn't exist.
  • The actual code calls parent five times, which doesn't match the example and seems like it would resolve to the directory above the root chatbot repo.


def load_environment() -> None:
    """Load environment variables from the project .env file."""
    if ENV_PATH.exists():
        load_dotenv(ENV_PATH)
    else:
        # Fallback to loading from current directory
        load_dotenv()
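For context, a hedged sketch of how a script in this package might call the helper before connecting to MongoDB; the variable names come from `.env.example`, but the entry point itself is illustrative rather than code from this diff.

```python
"""Sketch: load the project .env before reading connection settings (illustrative)."""
import os

from pymongo import MongoClient

from mongodb_datasets.config import load_environment

load_environment()  # populate os.environ from the project .env, if present

# Variable names come from .env.example; an import script would read them the same way.
client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
db = client[os.environ["MONGODB_DATABASE_NAME"]]
print(db.list_collection_names())
```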