Commit 5ba7c8c

Author: Ben Perlmutter (committed)

Fix dataset upload

1 parent ee455cc, commit 5ba7c8c

File tree: 3 files changed (+149 -36 lines)

packages/datasets/README.md

Lines changed: 90 additions & 0 deletions (new file)

# Datasets Package

This package provides utilities for importing, processing, and managing datasets used in the MongoDB Knowledge Service/Chatbot project. It contains both Node.js/TypeScript and Python implementations for various dataset operations.

## Overview

The datasets package is a hybrid TypeScript/Python package that handles:
- Dataset ingestion from various sources (HuggingFace, Atlas, etc.)
- Data processing and transformation pipelines
- MongoDB import/export operations
- Code example extraction and classification
- Natural language query generation
- Database metadata extraction

## Structure

### Node.js/TypeScript Components

Located in the `/src/` directory:

- **Code Example Processing**: Extract and classify code examples from documentation
- **Page Dataset**: Load and process page-based datasets
- **Tree Generation**: Generate hierarchical data structures for NL queries
- **Database Operations**: MongoDB schema generation and database analysis
- **HuggingFace Integration**: Upload datasets to HuggingFace Hub
- **Evaluation**: Braintrust integration for dataset evaluation

### Python/UV Components

Located in the `/mongodb_datasets/` directory:

- **Wikipedia Import**: Import Wikipedia datasets from HuggingFace to MongoDB
- **Atlas Search**: Configure and create Atlas Search indexes
- **Configuration Management**: Environment variable and project configuration

## Installation & Setup

### Node.js Dependencies
```bash
npm install
npm run build
```

### Python Dependencies (using uv)
```bash
# Install Python dependencies
uv sync

# Activate virtual environment
source .venv/bin/activate # or .venv\Scripts\activate on Windows
```

## Usage

### Node.js Scripts

The package provides numerous npm scripts for different dataset operations:

```bash
# Build the project
npm run ...
```

### Python Scripts

The Python components provide CLI tools for dataset import operations:

```bash
# Import Wikipedia dataset (all articles)
uv run ...
```

## Configuration

### Environment Variables

For the required environment variables, see `.env.example` in the project root. Create a `.env` file next to it with the required values.

## Development

### Testing
```bash
# Node.js tests
npm run test

# Linting
npm run lint
npm run lint:fix
```
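
Regarding the Environment Variables step above: the Wikipedia importer changed in this commit falls back to `MONGODB_ATLAS_SEARCH_CONNECTION_URI` when no connection URI is supplied on the command line, so it can be worth confirming the variable is actually set after creating the `.env` file. A minimal pre-flight sketch (illustrative only, not part of the package):

```python
import os

# The Wikipedia importer defaults to this environment variable when no
# connection URI is supplied on the command line.
uri = os.environ.get("MONGODB_ATLAS_SEARCH_CONNECTION_URI")
if not uri:
    raise SystemExit(
        "MONGODB_ATLAS_SEARCH_CONNECTION_URI is not set; "
        "add it to your .env file before running the import scripts"
    )
print("Atlas connection string found")
```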

packages/datasets/mongodb_datasets/atlas_search_dataset_index.jsonc

Lines changed: 22 additions & 26 deletions

```diff
@@ -49,26 +49,15 @@
       "analyzer": "text_analyzer"
     },
     /**
-    URL for filtering and exact matching
+    URL for filtering
     */
     "url": [
-      {
-        /**
-        For exact, case-insensitive matching on the original URL
-        */
-        "type": "string",
-        "analyzer": "lucene.keyword"
-      },
       {
         /**
         For normalized, fuzzy, flexible matching
         */
         "type": "string",
-        "analyzer": "url_normalizer_analyzer",
-        /**
-        This creates a sub-field called "url.normalized"
-        */
-        "name": "normalized"
+        "analyzer": "url_normalizer_analyzer"
       }
     ]
   }
@@ -83,28 +72,32 @@
   {
     "name": "text_analyzer",
     "tokenizer": {
+      /**
+      Standard tokenizer.
+      From the docs: It divides text into terms based on word boundaries,
+      which makes it language-neutral for most use cases.
+      It converts all terms to lower case and removes punctuation.
+      */
       "type": "standard"
     },
     "tokenFilters": [
       /**
-      Convert all text to lowercase
+      Remove accents
       */
       {
-        "type": "lowercase"
+        "type": "icuFolding"
       },
       /**
-      Remove stopwords, such as "the", "and", "is", etc.
+      Remove possessive suffixes, e.g., "John's" -> "John"
       */
       {
-        "type": "stop",
-        "stopwords": "_english_"
+        "type": "englishPossessive"
       },
       /**
       Stem words to their root form, e.g., "running" -> "run"
       */
       {
-        "type": "stemmer",
-        "language": "english"
+        "type": "kStemming"
       }
     ]
   },
@@ -121,25 +114,28 @@
       /**
       Remove http:// or https:// from the beginning
       */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "^(https|http)://",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     },
     {
       /**
       Remove www. from the beginning
      */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "^www\\.",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     },
     {
       /**
       Remove all trailing slashes
       */
-      "type": "pattern",
+      "type": "regex",
       "pattern": "/+$",
-      "replacement": ""
+      "replacement": "",
+      "matches": "first"
     }
   ]
 }
```

packages/datasets/mongodb_datasets/import_wikipedia.py

Lines changed: 37 additions & 10 deletions

```diff
@@ -12,11 +12,12 @@
 import argparse
 from typing import Dict, Any, Optional
 from pymongo import MongoClient
+from pymongo.operations import SearchIndexModel
 from pymongo.collection import Collection
 from pymongo.errors import BulkWriteError
 from datasets import load_dataset
 from datasets.arrow_dataset import Dataset
-from jsonc_parser import JsoncParser
+from jsonc_parser.parser import JsoncParser
 
 from .config import load_environment
 
@@ -63,16 +64,32 @@ def setup_indexes(self) -> None:
             logger.warning(f"Index creation failed (may already exist): {error}")
 
         try:
-            with open("atlas_search_dataset_index.jsonc", "r") as file:
-                jsonc_content = file.read()
+            # Look for the index file in the same directory as this script
+            import pathlib
+            script_dir = pathlib.Path(__file__).parent
+            index_file = script_dir / "atlas_search_dataset_index.jsonc"
 
+
             # Parse JSONC to Python dictionary
-            atlas_search_dataset_index: Dict[str, Any] = JsoncParser.parse_text(jsonc_content)
+            parser = JsoncParser()
+            atlas_search_dataset_index: Dict[str, Any] = parser.parse_file(index_file)
 
-            self.collection.create_search_index(
+            # Create the search index definition without the name
+            definition = {
+                "mappings": atlas_search_dataset_index["mappings"],
+                "analyzers": atlas_search_dataset_index["analyzers"]
+            }
+
+            # Create the search index
+            result = self.collection.create_search_index(model=SearchIndexModel(
                 name=atlas_search_dataset_index["name"],
-                definition=atlas_search_dataset_index["mappings"]
-            )
+                definition=definition,
+                type="search"
+            ))
+            logger.info(f"Search index creation result: {result}")
+            logger.info(f"Created search index {atlas_search_dataset_index['name']} for {DATABASE_NAME}.{COLLECTION_NAME}")
+        except FileNotFoundError:
+            logger.info("Atlas search index file not found, skipping search index creation")
         except Exception as error:
             logger.warning(f"Search index creation failed (may already exist): {error}")
 
@@ -161,13 +178,17 @@ def insert_batch(self, batch: list) -> int:
             self.stats["errors"] += len(batch)
             return 0
 
-    def import_dataset(self, max_documents: Optional[int] = None) -> None:
+    def import_dataset(self, max_documents: Optional[int] = None, only_create_index: bool = False) -> None:
         """Import the Wikipedia dataset to MongoDB."""
-        logger.info(f"Starting Wikipedia dataset import to {DATABASE_NAME}.{COLLECTION_NAME}")
 
+        logger.info(f"Starting building DB indexes for {DATABASE_NAME}.{COLLECTION_NAME}")
         # Setup database
         self.setup_indexes()
+        if only_create_index:
+            logger.info(f"Only creating DB indexes for {DATABASE_NAME}.{COLLECTION_NAME}. Exiting...")
+            return
 
+        logger.info(f"Starting Wikipedia dataset import to {DATABASE_NAME}.{COLLECTION_NAME}")
         # Load dataset
         dataset = self.load_wikipedia_dataset()
 
@@ -257,6 +278,12 @@ def parse_args() -> argparse.Namespace:
         help="MongoDB connection URI (default: uses MONGODB_ATLAS_SEARCH_CONNECTION_URI env var)"
     )
 
+    parser.add_argument(
+        "--only-create-index",
+        action="store_true",
+        help="Only create the Atlas search index, skip importing data"
+    )
+
     return parser.parse_args()
 
 
@@ -286,7 +313,7 @@ def main() -> None:
     logger.info("Connected to MongoDB")
 
     # Import dataset with CLI arguments
-    importer.import_dataset(max_documents=args.max_documents)
+    importer.import_dataset(max_documents=args.max_documents, only_create_index=args.only_create_index)
 
     logger.info("Wikipedia dataset import completed successfully")
 
```
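
Once `setup_indexes` has created the search index defined in `atlas_search_dataset_index.jsonc`, the collection can be queried through Atlas Search's `$search` aggregation stage. The sketch below is illustrative only: the index name, database and collection names, and field paths are assumptions rather than values taken from this commit (the real index name is read from the JSONC file's `name` field).

```python
import os

from pymongo import MongoClient

# Connection string comes from the same env var the importer falls back to.
client = MongoClient(os.environ["MONGODB_ATLAS_SEARCH_CONNECTION_URI"])
collection = client["wikipedia"]["articles"]  # placeholder database/collection names

pipeline = [
    {
        "$search": {
            "index": "wikipedia_search",  # placeholder; use the name from the JSONC file
            "text": {
                "query": "distributed databases",
                "path": ["title", "text"],  # assumed fields indexed with text_analyzer
            },
        }
    },
    {"$limit": 5},
    {"$project": {"_id": 0, "title": 1, "url": 1}},
]

for doc in collection.aggregate(pipeline):
    print(doc)
```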
