|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
6 | | - "metadata": { |
7 | | - "jupyter": { |
8 | | - "outputs_hidden": false |
9 | | - } |
10 | | - }, |
| 5 | + "execution_count": null, |
| 6 | + "metadata": {}, |
11 | 7 | "outputs": [], |
12 | 8 | "source": [ |
13 | 9 | "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", |
|
37 | 33 | "\n", |
38 | 34 | "# Scaling Criteo: ETL with NVTabular\n", |
39 | 35 | "\n", |
40 | | - "This notebook is created using the latest stable [merlin-hugectr](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-hugectr/tags), [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags), or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n", |
| 36 | + "This notebook was created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n", |
41 | 37 | "\n", |
42 | 38 | "## Overview\n", |
43 | 39 | "\n", |
|
80 | 76 | }, |
81 | 77 | { |
82 | 78 | "cell_type": "code", |
83 | | - "execution_count": 2, |
84 | | - "metadata": { |
85 | | - "jupyter": { |
86 | | - "outputs_hidden": false |
87 | | - } |
88 | | - }, |
| 79 | + "execution_count": null, |
| 80 | + "metadata": {}, |
89 | 81 | "outputs": [], |
90 | 82 | "source": [ |
91 | 83 | "# Standard Libraries\n", |
|
122 | 114 | }, |
123 | 115 | { |
124 | 116 | "cell_type": "code", |
125 | | - "execution_count": 5, |
126 | | - "metadata": { |
127 | | - "jupyter": { |
128 | | - "outputs_hidden": false |
129 | | - } |
130 | | - }, |
| 117 | + "execution_count": null, |
| 118 | + "metadata": {}, |
131 | 119 | "outputs": [], |
132 | 120 | "source": [ |
133 | 121 | "# define some information about where to get our data\n", |
134 | 122 | "BASE_DIR = os.environ.get(\"BASE_DIR\", \"/raid/data/criteo\")\n", |
135 | 123 | "INPUT_DATA_DIR = os.environ.get(\"INPUT_DATA_DIR\", BASE_DIR + \"/converted/criteo\")\n", |
136 | 124 | "OUTPUT_DATA_DIR = os.environ.get(\"OUTPUT_DATA_DIR\", BASE_DIR + \"/test_dask/output\")\n", |
137 | | - "USE_HUGECTR = bool(os.environ.get(\"USE_HUGECTR\", \"\"))\n", |
138 | 125 | "stats_path = os.path.join(OUTPUT_DATA_DIR, \"test_dask/stats\")\n", |
139 | 126 | "dask_workdir = os.path.join(OUTPUT_DATA_DIR, \"test_dask/workdir\")\n", |
140 | 127 | "\n", |
|
163 | 150 | }, |
164 | 151 | { |
165 | 152 | "cell_type": "code", |
166 | | - "execution_count": 7, |
| 153 | + "execution_count": null, |
167 | 154 | "metadata": {}, |
168 | 155 | "outputs": [], |
169 | 156 | "source": [ |
|
179 | 166 | }, |
180 | 167 | { |
181 | 168 | "cell_type": "code", |
182 | | - "execution_count": 8, |
183 | | - "metadata": { |
184 | | - "jupyter": { |
185 | | - "outputs_hidden": false |
186 | | - } |
187 | | - }, |
| 169 | + "execution_count": null, |
| 170 | + "metadata": {}, |
188 | 171 | "outputs": [ |
189 | 172 | { |
190 | 173 | "name": "stdout", |
|
216 | 199 | }, |
217 | 200 | { |
218 | 201 | "cell_type": "code", |
219 | | - "execution_count": 9, |
220 | | - "metadata": { |
221 | | - "jupyter": { |
222 | | - "outputs_hidden": false |
223 | | - } |
224 | | - }, |
| 202 | + "execution_count": null, |
| 203 | + "metadata": {}, |
225 | 204 | "outputs": [ |
226 | 205 | { |
227 | 206 | "name": "stderr", |
|
465 | 444 | "<Client: 'tcp://127.0.0.1:44059' processes=2 threads=2, memory=100.00 GiB>" |
466 | 445 | ] |
467 | 446 | }, |
468 | | - "execution_count": 9, |
| 447 | + "execution_count": null, |
469 | 448 | "metadata": {}, |
470 | 449 | "output_type": "execute_result" |
471 | 450 | } |
|
537 | 516 | }, |
538 | 517 | { |
539 | 518 | "cell_type": "code", |
540 | | - "execution_count": 10, |
541 | | - "metadata": { |
542 | | - "jupyter": { |
543 | | - "outputs_hidden": false |
544 | | - } |
545 | | - }, |
| 519 | + "execution_count": null, |
| 520 | + "metadata": {}, |
546 | 521 | "outputs": [], |
547 | 522 | "source": [ |
548 | 523 | "# define our dataset schema\n", |
|
568 | 543 | "cell_type": "markdown", |
569 | 544 | "metadata": {}, |
570 | 545 | "source": [ |
571 | | - "We need to enforce the required HugeCTR data types, so we set them in a dictionary and give as an argument when creating our dataset. The dictionary defines the output datatypes of our datasets." |
| 546 | + "Optionally, we can define the output data types of our datasets in a dictionary keyed by column name." |
572 | 547 | ] |
573 | 548 | }, |
574 | 549 | { |
575 | 550 | "cell_type": "code", |
576 | | - "execution_count": 11, |
577 | | - "metadata": { |
578 | | - "jupyter": { |
579 | | - "outputs_hidden": false |
580 | | - } |
581 | | - }, |
| 551 | + "execution_count": null, |
| 552 | + "metadata": {}, |
582 | 553 | "outputs": [], |
583 | 554 | "source": [ |
584 | 555 | "dict_dtypes = {}\n", |
585 | 556 | "\n", |
586 | | - "# The environment variable USE_HUGECTR defines, if we want to use the output for HugeCTR or another framework\n", |
587 | 557 | "for col in CATEGORICAL_COLUMNS:\n", |
588 | | - " dict_dtypes[col] = np.int64 if USE_HUGECTR else np.int32\n", |
| 558 | + " dict_dtypes[col] = np.int32\n", |
589 | 559 | "\n", |
590 | 560 | "for col in CONTINUOUS_COLUMNS:\n", |
591 | 561 | " dict_dtypes[col] = np.float32\n", |
|
603 | 573 | }, |
604 | 574 | { |
605 | 575 | "cell_type": "code", |
606 | | - "execution_count": 13, |
607 | | - "metadata": { |
608 | | - "jupyter": { |
609 | | - "outputs_hidden": false |
610 | | - } |
611 | | - }, |
| 576 | + "execution_count": null, |
| 577 | + "metadata": {}, |
612 | 578 | "outputs": [], |
613 | 579 | "source": [ |
614 | 580 | "train_dataset = nvt.Dataset(train_paths, engine=\"parquet\", part_size=part_size)\n", |
|
624 | 590 | }, |
625 | 591 | { |
626 | 592 | "cell_type": "code", |
627 | | - "execution_count": 14, |
628 | | - "metadata": { |
629 | | - "jupyter": { |
630 | | - "outputs_hidden": false |
631 | | - } |
632 | | - }, |
| 593 | + "execution_count": null, |
| 594 | + "metadata": {}, |
633 | 595 | "outputs": [], |
634 | 596 | "source": [ |
635 | 597 | "output_train_dir = os.path.join(OUTPUT_DATA_DIR, \"train/\")\n", |
|
647 | 609 | }, |
648 | 610 | { |
649 | 611 | "cell_type": "code", |
650 | | - "execution_count": 15, |
| 612 | + "execution_count": null, |
651 | 613 | "metadata": {}, |
652 | 614 | "outputs": [ |
653 | 615 | { |
|
661 | 623 | { |
662 | 624 | "data": { |
663 | 625 | "text/plain": [ |
664 | | - "<nvtabular.workflow.workflow.Workflow at 0x7fdacec4fdc0>" |
| 626 | + "<nvtabular.workflow.workflow.Workflow>" |
665 | 627 | ] |
666 | 628 | }, |
667 | | - "execution_count": 15, |
| 629 | + "execution_count": null, |
668 | 630 | "metadata": {}, |
669 | 631 | "output_type": "execute_result" |
670 | 632 | } |
|
676 | 638 | }, |
677 | 639 | { |
678 | 640 | "cell_type": "code", |
679 | | - "execution_count": 16, |
680 | | - "metadata": { |
681 | | - "jupyter": { |
682 | | - "outputs_hidden": false |
683 | | - } |
684 | | - }, |
| 641 | + "execution_count": null, |
| 642 | + "metadata": {}, |
685 | 643 | "outputs": [ |
686 | 644 | { |
687 | 645 | "name": "stdout", |
|
708 | 666 | }, |
709 | 667 | { |
710 | 668 | "cell_type": "code", |
711 | | - "execution_count": 17, |
712 | | - "metadata": { |
713 | | - "jupyter": { |
714 | | - "outputs_hidden": false |
715 | | - } |
716 | | - }, |
| 669 | + "execution_count": null, |
| 670 | + "metadata": {}, |
717 | 671 | "outputs": [ |
718 | 672 | { |
719 | 673 | "name": "stdout", |
|
745 | 699 | }, |
746 | 700 | { |
747 | 701 | "cell_type": "code", |
748 | | - "execution_count": 18, |
| 702 | + "execution_count": null, |
749 | 703 | "metadata": {}, |
750 | 704 | "outputs": [], |
751 | 705 | "source": [ |
|
754 | 708 | } |
755 | 709 | ], |
756 | 710 | "metadata": { |
757 | | - "file_extension": ".py", |
758 | 711 | "kernelspec": { |
759 | | - "display_name": "Python 3 (ipykernel)", |
| 712 | + "display_name": "python3", |
760 | 713 | "language": "python", |
761 | 714 | "name": "python3" |
762 | | - }, |
763 | | - "language_info": { |
764 | | - "codemirror_mode": { |
765 | | - "name": "ipython", |
766 | | - "version": 3 |
767 | | - }, |
768 | | - "file_extension": ".py", |
769 | | - "mimetype": "text/x-python", |
770 | | - "name": "python", |
771 | | - "nbconvert_exporter": "python", |
772 | | - "pygments_lexer": "ipython3", |
773 | | - "version": "3.8.10" |
774 | | - }, |
775 | | - "merlin": { |
776 | | - "containers": [ |
777 | | - "nvcr.io/nvidia/merlin/merlin-hugectr:latest", |
778 | | - "nvcr.io/nvidia/merlin/merlin-tensorflow:latest", |
779 | | - "nvcr.io/nvidia/merlin/merlin-pytorch:latest" |
780 | | - ] |
781 | | - }, |
782 | | - "mimetype": "text/x-python", |
783 | | - "npconvert_exporter": "python", |
784 | | - "pygments_lexer": "ipython3", |
785 | | - "version": 3 |
| 715 | + } |
786 | 716 | }, |
787 | 717 | "nbformat": 4, |
788 | 718 | "nbformat_minor": 4 |
|
0 commit comments