NVIDIA-Merlin
diff --git a/examples/scaling-criteo/01-Download-Convert.ipynb b/examples/scaling-criteo/01-Download-Convert.ipynb
Lines changed: 10 additions & 29 deletions
diff --git a/examples/scaling-criteo/02-ETL-with-NVTabular.ipynb b/examples/scaling-criteo/02-ETL-with-NVTabular.ipynb
Lines changed: 33 additions & 103 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -33,7 +33,7 @@
     "\n",
     "# Scaling Criteo: Download and Convert\n",
     "\n",
-    "This notebook is created using the latest stable [merlin-hugectr](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-hugectr/tags), [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags), or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container. \n",
+    "This notebook was created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n",
     "\n",
     "## Criteo 1TB Click Logs dataset\n",
     "\n",
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -177,7 +177,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -196,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -227,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -277,7 +277,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -297,28 +297,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "python3",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "merlin": {
-   "containers": [
-    "nvcr.io/nvidia/merlin/merlin-hugectr:latest",
-    "nvcr.io/nvidia/merlin/merlin-tensorflow:latest",
-    "nvcr.io/nvidia/merlin/merlin-pytorch:latest"
-   ]
   }
  },
  "nbformat": 4,
 
@@ -2,12 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
@@ -37,7 +33,7 @@
     "\n",
     "# Scaling Criteo: ETL with NVTabular\n",
     "\n",
-    "This notebook is created using the latest stable [merlin-hugectr](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-hugectr/tags), [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags), or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n",
+    "This notebook was created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n",
     "\n",
     "## Overview\n",
     "\n",
@@ -80,12 +76,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Standard Libraries\n",
@@ -122,19 +114,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# define some information about where to get our data\n",
     "BASE_DIR = os.environ.get(\"BASE_DIR\", \"/raid/data/criteo\")\n",
     "INPUT_DATA_DIR = os.environ.get(\"INPUT_DATA_DIR\", BASE_DIR + \"/converted/criteo\")\n",
     "OUTPUT_DATA_DIR = os.environ.get(\"OUTPUT_DATA_DIR\", BASE_DIR + \"/test_dask/output\")\n",
-    "USE_HUGECTR = bool(os.environ.get(\"USE_HUGECTR\", \"\"))\n",
     "stats_path = os.path.join(OUTPUT_DATA_DIR, \"test_dask/stats\")\n",
     "dask_workdir = os.path.join(OUTPUT_DATA_DIR, \"test_dask/workdir\")\n",
     "\n",
@@ -163,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -179,12 +166,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -216,12 +199,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
@@ -465,7 +444,7 @@
        "<Client: 'tcp://127.0.0.1:44059' processes=2 threads=2, memory=100.00 GiB>"
       ]
      },
-     "execution_count": 9,
+     "execution_count": null,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -537,12 +516,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# define our dataset schema\n",
@@ -568,24 +543,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We need to enforce the required HugeCTR data types, so we set them in a dictionary and give as an argument when creating our dataset. The dictionary defines the output datatypes of our datasets."
+    "Optionally, we can define the output data types of our datasets. The dictionary in the next cell maps each column name to its output data type."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "dict_dtypes = {}\n",
     "\n",
-    "# The environment variable USE_HUGECTR defines, if we want to use the output for HugeCTR or another framework\n",
     "for col in CATEGORICAL_COLUMNS:\n",
-    "    dict_dtypes[col] = np.int64 if USE_HUGECTR else np.int32\n",
+    "    dict_dtypes[col] = np.int32\n",
     "\n",
     "for col in CONTINUOUS_COLUMNS:\n",
     "    dict_dtypes[col] = np.float32\n",
@@ -603,12 +573,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "train_dataset = nvt.Dataset(train_paths, engine=\"parquet\", part_size=part_size)\n",
@@ -624,12 +590,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "output_train_dir = os.path.join(OUTPUT_DATA_DIR, \"train/\")\n",
@@ -647,7 +609,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -661,10 +623,10 @@
     {
      "data": {
       "text/plain": [
-       "<nvtabular.workflow.workflow.Workflow at 0x7fdacec4fdc0>"
+       "<nvtabular.workflow.workflow.Workflow>"
       ]
      },
-     "execution_count": 15,
+     "execution_count": null,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -676,12 +638,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -708,12 +666,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -745,7 +699,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -754,35 +708,11 @@
   }
  ],
  "metadata": {
-  "file_extension": ".py",
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "python3",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "merlin": {
-   "containers": [
-    "nvcr.io/nvidia/merlin/merlin-hugectr:latest",
-    "nvcr.io/nvidia/merlin/merlin-tensorflow:latest",
-    "nvcr.io/nvidia/merlin/merlin-pytorch:latest"
-   ]
-  },
-  "mimetype": "text/x-python",
-  "npconvert_exporter": "python",
-  "pygments_lexer": "ipython3",
-  "version": 3
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 4