Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 35 additions & 34 deletions nemo/NeMo-Data-Designer/intro-tutorials/1-the-basics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,19 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from nemo_microservices import NeMoMicroservices\n",
"from nemo_microservices.beta.data_designer import (\n",
"from nemo_microservices.data_designer.essentials import (\n",
" CategorySamplerParams,\n",
" DataDesignerConfigBuilder,\n",
" DataDesignerClient,\n",
")\n",
"from nemo_microservices.beta.data_designer.config import columns as C\n",
"from nemo_microservices.beta.data_designer.config import params as P"
" LLMTextColumnConfig,\n",
" NeMoDataDesignerClient,\n",
" PersonSamplerParams,\n",
" SamplerColumnConfig,\n",
" SamplerType,\n",
" SubcategorySamplerParams,\n",
" UniformSamplerParams,\n",
")"
]
},
{
Expand All @@ -51,7 +54,7 @@
"- If you have an instance of data designer running locally, you can connect to it as follows\n",
"\n",
" ```python\n",
" data_designer_client = DataDesignerClient(client=NeMoMicroservices(base_url=\"http://localhost:8080\"))\n",
" data_designer_client = NeMoDataDesignerClient(base_url=\"http://localhost:8080\")\n",
" ```\n"
]
},
Expand All @@ -76,11 +79,9 @@
"metadata": {},
"outputs": [],
"source": [
"data_designer_client = DataDesignerClient(\n",
" client=NeMoMicroservices(\n",
" base_url=\"https://ai.api.nvidia.com/v1/nemo/dd\",\n",
" default_headers={\"Authorization\": f\"Bearer {api_key}\"} # auto-generated API KEY\n",
" )\n",
"data_designer_client = NeMoDataDesignerClient(\n",
" base_url=\"https://ai.api.nvidia.com/v1/nemo/dd\",\n",
" default_headers={\"Authorization\": f\"Bearer {api_key}\"} # auto-generated API KEY\n",
")"
]
},
Expand Down Expand Up @@ -149,10 +150,10 @@
"outputs": [],
"source": [
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"product_category\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\n",
" \"Electronics\",\n",
" \"Clothing\",\n",
Expand All @@ -165,10 +166,10 @@
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"product_subcategory\",\n",
" type=P.SamplerType.SUBCATEGORY,\n",
" params=P.SubcategorySamplerParams(\n",
" sampler_type=SamplerType.SUBCATEGORY,\n",
" params=SubcategorySamplerParams(\n",
" category=\"product_category\",\n",
" values={\n",
" \"Electronics\": [\n",
Expand Down Expand Up @@ -212,10 +213,10 @@
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"target_age_range\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\"18-25\", \"25-35\", \"35-50\", \"50-65\", \"65+\"]\n",
" ),\n",
" )\n",
Expand All @@ -240,27 +241,27 @@
"source": [
"# This column will sample synthetic person data based on statistics from the US Census.\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"customer\",\n",
" type=P.SamplerType.PERSON,\n",
" params=P.PersonSamplerParams(age_range=[18, 70]),\n",
" sampler_type=SamplerType.PERSON,\n",
" params=PersonSamplerParams(age_range=[18, 70]),\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"number_of_stars\",\n",
" type=P.SamplerType.UNIFORM,\n",
" params=P.UniformSamplerParams(low=1, high=5),\n",
" sampler_type=SamplerType.UNIFORM,\n",
" params=UniformSamplerParams(low=1, high=5),\n",
" convert_to=\"int\",\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"review_style\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\"rambling\", \"brief\", \"detailed\", \"structured with bullet points\"],\n",
" weights=[1, 2, 2, 1],\n",
" ),\n",
Expand Down Expand Up @@ -292,7 +293,7 @@
"outputs": [],
"source": [
"config_builder.add_column(\n",
" C.LLMTextColumn(\n",
" LLMTextColumnConfig(\n",
" name=\"product_name\",\n",
" prompt=(\n",
" \"Come up with a creative product name for a product in the '{{ product_category }}' category, focusing \"\n",
Expand All @@ -310,7 +311,7 @@
")\n",
"\n",
"config_builder.add_column(\n",
" C.LLMTextColumn(\n",
" LLMTextColumnConfig(\n",
" name=\"customer_review\",\n",
" prompt=(\n",
" \"You are a customer named {{ customer.first_name }} from {{ customer.city }}, {{ customer.state }}. \"\n",
Expand Down Expand Up @@ -342,7 +343,7 @@
"metadata": {},
"outputs": [],
"source": [
"preview = data_designer_client.preview(config_builder, verbose_logging=True)"
"preview = data_designer_client.preview(config_builder)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@
"source": [
"from getpass import getpass\n",
"\n",
"from nemo_microservices import NeMoMicroservices\n",
"from nemo_microservices.beta.data_designer import (\n",
"from nemo_microservices.data_designer.essentials import (\n",
" CategorySamplerParams,\n",
" DataDesignerConfigBuilder,\n",
" DataDesignerClient,\n",
")\n",
"from nemo_microservices.beta.data_designer.config import columns as C\n",
"from nemo_microservices.beta.data_designer.config import params as P"
" ExpressionColumnConfig,\n",
" LLMStructuredColumnConfig,\n",
" NeMoDataDesignerClient,\n",
" PersonSamplerParams,\n",
" SamplerColumnConfig,\n",
" SamplerType,\n",
" SubcategorySamplerParams,\n",
")"
]
},
{
Expand All @@ -54,7 +58,7 @@
"- If you have an instance of data designer running locally, you can connect to it as follows\n",
"\n",
" ```python\n",
" data_designer_client = DataDesignerClient(client=NeMoMicroservices(base_url=\"http://localhost:8080\"))\n",
" data_designer_client = NeMoDataDesignerClient(base_url=\"http://localhost:8080\")\n",
" ```\n"
]
},
Expand All @@ -79,11 +83,9 @@
"metadata": {},
"outputs": [],
"source": [
"data_designer_client = DataDesignerClient(\n",
" client=NeMoMicroservices(\n",
" base_url=\"https://ai.api.nvidia.com/v1/nemo/dd\",\n",
" default_headers={\"Authorization\": f\"Bearer {api_key}\"} # auto-generated API KEY\n",
" )\n",
"data_designer_client = NeMoDataDesignerClient(\n",
" base_url=\"https://ai.api.nvidia.com/v1/nemo/dd\",\n",
" default_headers={\"Authorization\": f\"Bearer {api_key}\"} # auto-generated API KEY\n",
")"
]
},
Expand Down Expand Up @@ -189,18 +191,20 @@
"metadata": {},
"outputs": [],
"source": [
"# Since we often just want a few attributes from Person objects, we can use\n",
"# Data Designer's `with_person_samplers` method to create multiple person samplers\n",
"# at once and drop the person object columns from the final dataset.\n",
"config_builder.with_person_samplers(\n",
" {\"customer\": P.PersonSamplerParams(age_range=[18, 65])}\n",
"# This column will sample synthetic person data based on statistics from the US Census.\n",
"config_builder.add_column(\n",
" SamplerColumnConfig(\n",
" name=\"customer\",\n",
" sampler_type=SamplerType.PERSON,\n",
" params=PersonSamplerParams(age_range=[18, 70]),\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"product_category\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\n",
" \"Electronics\",\n",
" \"Clothing\",\n",
Expand All @@ -213,10 +217,10 @@
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"product_subcategory\",\n",
" type=P.SamplerType.SUBCATEGORY,\n",
" params=P.SubcategorySamplerParams(\n",
" sampler_type=SamplerType.SUBCATEGORY,\n",
" params=SubcategorySamplerParams(\n",
" category=\"product_category\",\n",
" values={\n",
" \"Electronics\": [\n",
Expand Down Expand Up @@ -260,10 +264,10 @@
")\n",
"\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"target_age_range\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\"18-25\", \"25-35\", \"35-50\", \"50-65\", \"65+\"]\n",
" ),\n",
" )\n",
Expand All @@ -273,34 +277,34 @@
"# we also show how we can use conditional params to set the values for the sampler if a given condition is met\n",
"# in this example, we set the review style to rambling if the target age range is 18-25\n",
"config_builder.add_column(\n",
" C.SamplerColumn(\n",
" SamplerColumnConfig(\n",
" name=\"review_style\",\n",
" type=P.SamplerType.CATEGORY,\n",
" params=P.CategorySamplerParams(\n",
" sampler_type=SamplerType.CATEGORY,\n",
" params=CategorySamplerParams(\n",
" values=[\"rambling\", \"brief\", \"detailed\", \"structured with bullet points\"],\n",
" weights=[1, 2, 2, 1],\n",
" conditional_params={\n",
" \"target_age_range == '18-25'\": P.CategorySamplerParams(values=[\"rambling\"]),\n",
" }\n",
" weights=[1, 2, 2, 1]\n",
" ),\n",
" conditional_params={\n",
" \"target_age_range == '18-25'\": CategorySamplerParams(values=[\"rambling\"]),\n",
" }\n",
" )\n",
")\n",
"\n",
"# We can create new columns using Jinja expressions that reference\n",
"# existing columns, including attributes of nested objects.\n",
"config_builder.add_column(\n",
" C.ExpressionColumn(\n",
" ExpressionColumnConfig(\n",
" name=\"customer_name\", expr=\"{{ customer.first_name }} {{ customer.last_name }}\"\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" C.ExpressionColumn(name=\"customer_age\", expr=\"{{ customer.age }}\")\n",
" ExpressionColumnConfig(name=\"customer_age\", expr=\"{{ customer.age }}\")\n",
")\n",
"\n",
"# Add an `LLMStructuredColumn` column to generate structured outputs.\n",
"# Add a column to generate structured outputs.\n",
"config_builder.add_column(\n",
" C.LLMStructuredColumn(\n",
" LLMStructuredColumnConfig(\n",
" name=\"product\",\n",
" prompt=(\n",
" \"Create a product in the '{{ product_category }}' category, focusing on products \"\n",
Expand All @@ -315,7 +319,7 @@
"# Another powerful feature we can use is the ability to use conditional statements in our prompt using Jinja expressions\n",
"# in this example, we add additional conditions to the prompt based on the target age range\n",
"config_builder.add_column(\n",
" C.LLMStructuredColumn(\n",
" LLMStructuredColumnConfig(\n",
" name=\"customer_review\",\n",
" prompt=(\n",
" \"Your task is to write a review for the following product:\\n\\n\"\n",
Expand All @@ -333,10 +337,7 @@
" output_format=ProductReview,\n",
" model_alias=model_alias,\n",
" )\n",
")\n",
"\n",
"# Let's add an evaluation report to our dataset.\n",
"config_builder.with_evaluation_report().validate()"
")"
]
},
{
Expand All @@ -347,9 +348,7 @@
"\n",
"- Iteration is key to generating high-quality synthetic data.\n",
"\n",
"- Use the `preview` method to generate 10 records for inspection.\n",
"\n",
"- Setting `verbose_logging=True` prints logs within each task of the generation process.\n"
"- Use the `preview` method to generate 10 records for inspection.\n"
]
},
{
Expand All @@ -358,7 +357,7 @@
"metadata": {},
"outputs": [],
"source": [
"preview = data_designer_client.preview(config_builder, verbose_logging=True)"
"preview = data_designer_client.preview(config_builder)"
]
},
{
Expand All @@ -381,6 +380,16 @@
"preview.dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# See analysis report on the preview dataset\n",
"preview.analysis.to_report()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
Loading