From 9368b498ac1d2f5461a5afdc068a831bc242a899 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 1 Feb 2022 16:12:18 +0100 Subject: [PATCH 1/7] add files to compute basic stats on pseudo crawl dataset --- dashboard/python_scripts/compute_stats.py | 135 ++++++++++++++++++ .../compute_stats_on_pseudo_crawl.slurm | 43 ++++++ 2 files changed, 178 insertions(+) create mode 100644 dashboard/python_scripts/compute_stats.py create mode 100644 dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py new file mode 100644 index 00000000..3871c2f2 --- /dev/null +++ b/dashboard/python_scripts/compute_stats.py @@ -0,0 +1,135 @@ +import json +import logging +import subprocess +import sys +from argparse import ArgumentParser +from pathlib import Path +from statistics import mean + +import datasets +from bs4 import BeautifulSoup +from bs4.dammit import EncodingDetector +from datasets import config, load_from_disk +from datasets.utils.logging import set_verbosity_info + +set_verbosity_info() +logger = logging.getLogger(__name__) + +# For `soup.decode_content` that can hit the limit +sys.setrecursionlimit(10000) + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--dataset-path", + type=str, + required=True, + help="path to the parquet dataset folder", + ) + parser.add_argument("--save-path-stats-json", type=str, help="Where to save the stats json.") + parser.add_argument("--save-path-stats-full-json", type=str, help="Where to save the stats json.") + parser.add_argument("--save-batch-size", type=int, required=True, help="Batch size when writing.") + parser.add_argument("--use-datasets-caching", action="store_true") + parser.add_argument("--num-proc", type=int, default=1, help="Number of procs use for preprocessing.") + parser.add_argument( + "--seed-id", + type=int, + required=True, + help="Value of the seed id.", + ) + parser.add_argument( + "--num-examples", + type=int, + default=None, + help="Optional argument to select a subset (used for debugging purposes). 
Example `10`.", + ) + args = parser.parse_args() + + return args + + +def main(): + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + args = get_args() + logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + + if not args.use_datasets_caching: + datasets.set_caching_enabled(False) + else: + logger.info(f"the datasets results will be cached at {config.HF_DATASETS_CACHE}.") + + ds = load_from_disk(args.dataset_path) + + if args.num_examples: + ds = ds.select([i for i in range(args.num_examples)]) + + selected_mime_types = ["text/html"] + splits = { + **{ + mime_type: ds.filter( + lambda mime_types_: [mime_type_ == mime_type for mime_type_ in mime_types_], + input_columns="content_mime_detected", + batched=True, + num_proc=args.num_proc, + ) + for mime_type in selected_mime_types + }, + "others": ds.filter( + lambda mime_types_: [mime_type_ not in selected_mime_types for mime_type_ in mime_types_], + input_columns="content_mime_detected", + batched=True, + num_proc=args.num_proc, + ), + } + + data_stats = {f"{split_name}_total": len(ds) for split_name, ds in splits.items()} + + ds_html = splits[selected_mime_types[0]] + + def get_length_text(example): + example["length_text"] = len(example["text"]) + return example + + cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]] + ds_html = ds_html.map( + get_length_text, + batched=False, + num_proc=args.num_proc, + remove_columns=cols_to_remove, + ) + + data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) + data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0]) + data_stats["seed_id"] = args.seed_id + + logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") + + save_path = Path(args.save_path_stats_json) + save_path_tmp = f"{str(save_path.absolute())}.tmp" + logger.info(f"Saving the dataset at {save_path_tmp}") + with open(save_path_tmp, "w", encoding="utf-8") as f: + json.dump(data_stats, f, ensure_ascii=False, indent=4) + logger.info(f"Moving the saved dataset to {str(save_path.absolute())}") + subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) + + save_path = Path(args.save_path_stats_full_json) + save_path_tmp = f"{str(save_path.absolute())}.tmp" + logger.info(f"Saving the dataset at {save_path_tmp}") + ds_html.to_json( + save_path_tmp, + batch_size=args.save_batch_size, + num_proc=args.num_proc, + compression="gzip", + ) + logger.info(f"Moving the saved dataset to {str(save_path.absolute())}") + subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) + + +if __name__ == "__main__": + main() diff --git a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm new file mode 100644 index 00000000..5e4a3867 --- /dev/null +++ b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=pseudo_crawl_compute_stats +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=4 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --partition=cpu_p1 +#SBATCH --time 10:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out # output file name +#SBATCH --array=1-604 +#SBATCH --account=six@cpu + +set -x -e + +source $six_ALL_CCFRWORK/start-prod +conda activate thomas_data_tooling # Debug deepspeed temporarily + +CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/cc +DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling + +pushd $DATA_TOOLING_REPO + +SEED_ID=$(python cc_pseudo_crawl/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID) + +echo "Computing stats on seed id ${SEED_ID}" + +DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" +SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json +SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full + +mkdir -p $SAVE_STATS_PATH_FULL + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=$SCRATCH/to_delete + +python dashboard/python_scripts/compute_stats.py \ + --dataset-path $DATASET_PATH \ + --num-proc 4 \ + --save-path-stats-json $SAVE_STATS_PATH \ + --save-path-stats-full-json $SAVE_STATS_PATH_FULL \ + --use-datasets-caching\ + --seed-id $SEED_ID\ + --save-batch-size 2 # 100 # 10 # 2 From 19c004e7c2633f149d973a60da07e30627206d74 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 2 Feb 2022 09:23:11 +0100 Subject: [PATCH 2/7] update statistics --- dashboard/python_scripts/compute_stats.py | 23 +++++++++++-------- .../compute_stats_on_pseudo_crawl.slurm | 9 ++++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 3871c2f2..07175251 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -1,23 +1,18 @@ +import os import json import logging import subprocess -import sys from argparse import ArgumentParser from pathlib import Path from statistics import mean import datasets -from bs4 import BeautifulSoup -from bs4.dammit import EncodingDetector from datasets import config, load_from_disk from datasets.utils.logging import set_verbosity_info set_verbosity_info() logger = logging.getLogger(__name__) -# For `soup.decode_content` that can hit the limit -sys.setrecursionlimit(10000) - def get_args(): parser = ArgumentParser() @@ -59,6 +54,11 @@ def main(): args = get_args() logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + if os.path.isfile(args.save_path_stats_json): + logger.info(f" --- Statistics already computed for seed id {args.seed_id} ") + return + + logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") if not args.use_datasets_caching: datasets.set_caching_enabled(False) else: @@ -92,8 +92,10 @@ def main(): ds_html = splits[selected_mime_types[0]] + logger.info(f"the currents splits are {data_stats}.") + def get_length_text(example): - example["length_text"] = len(example["text"]) + example["length_text"] = len(example["text"]) if example["text"] is not None else 0 return example cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", 
"url_host_tld"]] @@ -105,7 +107,9 @@ def get_length_text(example): ) data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) - data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0]) + + non_empty_texts = [e for e in ds_html["length_text"] if e != 0] + data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None data_stats["seed_id"] = args.seed_id logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") @@ -119,7 +123,8 @@ def get_length_text(example): subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) save_path = Path(args.save_path_stats_full_json) - save_path_tmp = f"{str(save_path.absolute())}.tmp" + tmp_file_name = f"tmp-{str(save_path.name)}" + save_path_tmp = os.path.join(save_path.parent, tmp_file_name) logger.info(f"Saving the dataset at {save_path_tmp}") ds_html.to_json( save_path_tmp, diff --git a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm index 5e4a3867..acca9c78 100644 --- a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm +++ b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm @@ -1,12 +1,12 @@ #!/bin/bash -#SBATCH --job-name=pseudo_crawl_compute_stats +#SBATCH --job-name=pseudo_crawl_compute_stats_v5 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=4 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --partition=cpu_p1 #SBATCH --time 10:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out # output file name +#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats_v5/%x-%j.out # output file name #SBATCH --array=1-604 #SBATCH --account=six@cpu @@ -26,9 +26,10 @@ echo "Computing stats on seed id ${SEED_ID}" DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json -SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full +SAVE_STATS_PATH_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full +SAVE_STATS_PATH_FULL=$SAVE_STATS_PATH_DIR/full.jsonl.gz -mkdir -p $SAVE_STATS_PATH_FULL +mkdir -p $SAVE_STATS_PATH_DIR export HF_DATASETS_OFFLINE=1 export HF_DATASETS_CACHE=$SCRATCH/to_delete From 11d0fd129fd3f3559c16b6e18c72e9a28c715bdb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Feb 2022 08:25:40 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dashboard/python_scripts/compute_stats.py | 54 +++++++++++++++++------ 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 07175251..970dd718 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -22,11 +22,19 @@ def get_args(): required=True, help="path to the parquet dataset folder", ) - parser.add_argument("--save-path-stats-json", type=str, help="Where to 
save the stats json.") - parser.add_argument("--save-path-stats-full-json", type=str, help="Where to save the stats json.") - parser.add_argument("--save-batch-size", type=int, required=True, help="Batch size when writing.") + parser.add_argument( + "--save-path-stats-json", type=str, help="Where to save the stats json." + ) + parser.add_argument( + "--save-path-stats-full-json", type=str, help="Where to save the stats json." + ) + parser.add_argument( + "--save-batch-size", type=int, required=True, help="Batch size when writing." + ) parser.add_argument("--use-datasets-caching", action="store_true") - parser.add_argument("--num-proc", type=int, default=1, help="Number of procs use for preprocessing.") + parser.add_argument( + "--num-proc", type=int, default=1, help="Number of procs use for preprocessing." + ) parser.add_argument( "--seed-id", type=int, @@ -52,17 +60,21 @@ def main(): level=logging.INFO, ) args = get_args() - logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + logger.info( + f"** The job is runned with the following arguments: **\n{args}\n **** " + ) if os.path.isfile(args.save_path_stats_json): logger.info(f" --- Statistics already computed for seed id {args.seed_id} ") return - + logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") if not args.use_datasets_caching: datasets.set_caching_enabled(False) else: - logger.info(f"the datasets results will be cached at {config.HF_DATASETS_CACHE}.") + logger.info( + f"the datasets results will be cached at {config.HF_DATASETS_CACHE}." + ) ds = load_from_disk(args.dataset_path) @@ -73,7 +85,9 @@ def main(): splits = { **{ mime_type: ds.filter( - lambda mime_types_: [mime_type_ == mime_type for mime_type_ in mime_types_], + lambda mime_types_: [ + mime_type_ == mime_type for mime_type_ in mime_types_ + ], input_columns="content_mime_detected", batched=True, num_proc=args.num_proc, @@ -81,7 +95,9 @@ def main(): for mime_type in selected_mime_types }, "others": ds.filter( - lambda mime_types_: [mime_type_ not in selected_mime_types for mime_type_ in mime_types_], + lambda mime_types_: [ + mime_type_ not in selected_mime_types for mime_type_ in mime_types_ + ], input_columns="content_mime_detected", batched=True, num_proc=args.num_proc, @@ -95,10 +111,16 @@ def main(): logger.info(f"the currents splits are {data_stats}.") def get_length_text(example): - example["length_text"] = len(example["text"]) if example["text"] is not None else 0 + example["length_text"] = ( + len(example["text"]) if example["text"] is not None else 0 + ) return example - cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]] + cols_to_remove = [ + col + for col in ds.column_names + if col not in ["content_languages", "url_host_tld"] + ] ds_html = ds_html.map( get_length_text, batched=False, @@ -109,10 +131,14 @@ def get_length_text(example): data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) non_empty_texts = [e for e in ds_html["length_text"] if e != 0] - data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None + data_stats["html_mean_length_non_empty_text"] = ( + mean(non_empty_texts) if non_empty_texts != [] else None + ) data_stats["seed_id"] = args.seed_id - logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") + logger.info( + f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows." 
+ ) save_path = Path(args.save_path_stats_json) save_path_tmp = f"{str(save_path.absolute())}.tmp" @@ -124,7 +150,7 @@ def get_length_text(example): save_path = Path(args.save_path_stats_full_json) tmp_file_name = f"tmp-{str(save_path.name)}" - save_path_tmp = os.path.join(save_path.parent, tmp_file_name) + save_path_tmp = os.path.join(save_path.parent, tmp_file_name) logger.info(f"Saving the dataset at {save_path_tmp}") ds_html.to_json( save_path_tmp, From d4f38a56ca687f693d88cda6bbe6c63ef2ce4579 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:33:47 +0100 Subject: [PATCH 4/7] Update dashboard/python_scripts/compute_stats.py Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- dashboard/python_scripts/compute_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 970dd718..f765e973 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -23,7 +23,7 @@ def get_args(): help="path to the parquet dataset folder", ) parser.add_argument( - "--save-path-stats-json", type=str, help="Where to save the stats json." + "--save-path-stats-json", type=str, required=True, help="Where to save the stats json." ) parser.add_argument( "--save-path-stats-full-json", type=str, help="Where to save the stats json." From f644acab0e6815204b18a2f6b5b3cd197d1970ae Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:35:18 +0100 Subject: [PATCH 5/7] Update dashboard/python_scripts/compute_stats.py Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- dashboard/python_scripts/compute_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index f765e973..ed0a9112 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -78,7 +78,7 @@ def main(): ds = load_from_disk(args.dataset_path) - if args.num_examples: + if args.num_examples is not None: ds = ds.select([i for i in range(args.num_examples)]) selected_mime_types = ["text/html"] From 81f0062d87ee58d265e52e5628ec37f3a9d7be8b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 14:35:36 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dashboard/python_scripts/compute_stats.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index ed0a9112..012e48db 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -23,7 +23,10 @@ def get_args(): help="path to the parquet dataset folder", ) parser.add_argument( - "--save-path-stats-json", type=str, required=True, help="Where to save the stats json." + "--save-path-stats-json", + type=str, + required=True, + help="Where to save the stats json.", ) parser.add_argument( "--save-path-stats-full-json", type=str, help="Where to save the stats json." 
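A side note on PATCH 5/7 above: `--num-examples` defaults to None, so the original `if args.num_examples:` also skipped the `select` call whenever a caller explicitly passed 0, because 0 is falsy; the explicit `is not None` test only skips the subset selection when the flag is genuinely absent. Below is a minimal sketch of the difference, using a hypothetical value rather than anything taken from the SLURM array jobs.

# Minimal sketch (not part of any patch): why the explicit `is not None` test
# in PATCH 5/7 behaves differently from a bare truthiness check.
num_examples = 0  # hypothetical: a user debugging with `--num-examples 0`

if num_examples:
    print("truthiness check: would select a subset")  # never reached for 0

if num_examples is not None:
    print(f"`is not None` check: selects the first {num_examples} examples")
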
From e17c42de586869085df138df3d3aac10d650e0f9 Mon Sep 17 00:00:00 2001
From: SaulLu <55560583+SaulLu@users.noreply.github.com>
Date: Tue, 8 Feb 2022 15:37:21 +0100
Subject: [PATCH 7/7] Update dashboard/python_scripts/compute_stats.py

Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
---
 dashboard/python_scripts/compute_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py
index 012e48db..505b3775 100644
--- a/dashboard/python_scripts/compute_stats.py
+++ b/dashboard/python_scripts/compute_stats.py
@@ -109,7 +109,7 @@ def main():
 
     data_stats = {f"{split_name}_total": len(ds) for split_name, ds in splits.items()}
 
-    ds_html = splits[selected_mime_types[0]]
+    ds_html = splits["text/html"]
 
     logger.info(f"the currents splits are {data_stats}.")
 
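
Taken together, the series leaves compute_stats.py splitting each seed dataset by detected MIME type and reporting, for the text/html split, the number of empty texts and the mean length of the non-empty ones. The sketch below reproduces that computation on a tiny in-memory dataset: the column names mirror the real pseudo-crawl schema, but the rows and values are invented for illustration, and none of the argparse, caching, or SLURM plumbing is included.

# Illustrative sketch only: a toy dataset standing in for one pseudo-crawl seed.
from statistics import mean

from datasets import Dataset

ds = Dataset.from_dict(
    {
        "content_mime_detected": ["text/html", "text/html", "application/pdf"],
        "text": ["<html>hello</html>", None, None],
    }
)

# Same filter call the script uses: batched, on the detected-MIME column.
ds_html = ds.filter(
    lambda mime_types_: [m == "text/html" for m in mime_types_],
    input_columns="content_mime_detected",
    batched=True,
)

lengths = [len(t) if t is not None else 0 for t in ds_html["text"]]
non_empty = [length for length in lengths if length != 0]

data_stats = {
    "text/html_total": len(ds_html),
    "others_total": len(ds) - len(ds_html),  # simplification of the script's second filter
    "html_empty_text": sum(1 for length in lengths if length == 0),
    "html_mean_length_non_empty_text": mean(non_empty) if non_empty else None,
}
print(data_stats)

In the patched script itself the resulting dictionary is written atomically: it is first dumped to a temporary file and only moved to the final stats.json with mv once the write has finished, so an interrupted array task never leaves a truncated file at the destination.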