From 9368b498ac1d2f5461a5afdc068a831bc242a899 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 1 Feb 2022 16:12:18 +0100 Subject: [PATCH 1/7] add files to compute basic stats on pseudo crawl dataset --- dashboard/python_scripts/compute_stats.py | 135 ++++++++++++++++++ .../compute_stats_on_pseudo_crawl.slurm | 43 ++++++ 2 files changed, 178 insertions(+) create mode 100644 dashboard/python_scripts/compute_stats.py create mode 100644 dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py new file mode 100644 index 00000000..3871c2f2 --- /dev/null +++ b/dashboard/python_scripts/compute_stats.py @@ -0,0 +1,135 @@ +import json +import logging +import subprocess +import sys +from argparse import ArgumentParser +from pathlib import Path +from statistics import mean + +import datasets +from bs4 import BeautifulSoup +from bs4.dammit import EncodingDetector +from datasets import config, load_from_disk +from datasets.utils.logging import set_verbosity_info + +set_verbosity_info() +logger = logging.getLogger(__name__) + +# For `soup.decode_content` that can hit the limit +sys.setrecursionlimit(10000) + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--dataset-path", + type=str, + required=True, + help="path to the parquet dataset folder", + ) + parser.add_argument("--save-path-stats-json", type=str, help="Where to save the stats json.") + parser.add_argument("--save-path-stats-full-json", type=str, help="Where to save the stats json.") + parser.add_argument("--save-batch-size", type=int, required=True, help="Batch size when writing.") + parser.add_argument("--use-datasets-caching", action="store_true") + parser.add_argument("--num-proc", type=int, default=1, help="Number of procs use for preprocessing.") + parser.add_argument( + "--seed-id", + type=int, + required=True, + help="Value of the seed id.", + ) + parser.add_argument( + "--num-examples", + type=int, + default=None, + help="Optional argument to select a subset (used for debugging purposes). 
Example `10`.", + ) + args = parser.parse_args() + + return args + + +def main(): + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + args = get_args() + logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + + if not args.use_datasets_caching: + datasets.set_caching_enabled(False) + else: + logger.info(f"the datasets results will be cached at {config.HF_DATASETS_CACHE}.") + + ds = load_from_disk(args.dataset_path) + + if args.num_examples: + ds = ds.select([i for i in range(args.num_examples)]) + + selected_mime_types = ["text/html"] + splits = { + **{ + mime_type: ds.filter( + lambda mime_types_: [mime_type_ == mime_type for mime_type_ in mime_types_], + input_columns="content_mime_detected", + batched=True, + num_proc=args.num_proc, + ) + for mime_type in selected_mime_types + }, + "others": ds.filter( + lambda mime_types_: [mime_type_ not in selected_mime_types for mime_type_ in mime_types_], + input_columns="content_mime_detected", + batched=True, + num_proc=args.num_proc, + ), + } + + data_stats = {f"{split_name}_total": len(ds) for split_name, ds in splits.items()} + + ds_html = splits[selected_mime_types[0]] + + def get_length_text(example): + example["length_text"] = len(example["text"]) + return example + + cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]] + ds_html = ds_html.map( + get_length_text, + batched=False, + num_proc=args.num_proc, + remove_columns=cols_to_remove, + ) + + data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) + data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0]) + data_stats["seed_id"] = args.seed_id + + logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") + + save_path = Path(args.save_path_stats_json) + save_path_tmp = f"{str(save_path.absolute())}.tmp" + logger.info(f"Saving the dataset at {save_path_tmp}") + with open(save_path_tmp, "w", encoding="utf-8") as f: + json.dump(data_stats, f, ensure_ascii=False, indent=4) + logger.info(f"Moving the saved dataset to {str(save_path.absolute())}") + subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) + + save_path = Path(args.save_path_stats_full_json) + save_path_tmp = f"{str(save_path.absolute())}.tmp" + logger.info(f"Saving the dataset at {save_path_tmp}") + ds_html.to_json( + save_path_tmp, + batch_size=args.save_batch_size, + num_proc=args.num_proc, + compression="gzip", + ) + logger.info(f"Moving the saved dataset to {str(save_path.absolute())}") + subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) + + +if __name__ == "__main__": + main() diff --git a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm new file mode 100644 index 00000000..5e4a3867 --- /dev/null +++ b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=pseudo_crawl_compute_stats +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=4 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --partition=cpu_p1 +#SBATCH --time 10:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out # output file name +#SBATCH --array=1-604 +#SBATCH --account=six@cpu + +set -x -e + +source $six_ALL_CCFRWORK/start-prod +conda activate thomas_data_tooling # Debug deepspeed temporarily + +CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/cc +DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling + +pushd $DATA_TOOLING_REPO + +SEED_ID=$(python cc_pseudo_crawl/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID) + +echo "Computing stats on seed id ${SEED_ID}" + +DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" +SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json +SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full + +mkdir -p $SAVE_STATS_PATH_FULL + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=$SCRATCH/to_delete + +python dashboard/python_scripts/compute_stats.py \ + --dataset-path $DATASET_PATH \ + --num-proc 4 \ + --save-path-stats-json $SAVE_STATS_PATH \ + --save-path-stats-full-json $SAVE_STATS_PATH_FULL \ + --use-datasets-caching\ + --seed-id $SEED_ID\ + --save-batch-size 2 # 100 # 10 # 2 From 19c004e7c2633f149d973a60da07e30627206d74 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 2 Feb 2022 09:23:11 +0100 Subject: [PATCH 2/7] update statistics --- dashboard/python_scripts/compute_stats.py | 23 +++++++++++-------- .../compute_stats_on_pseudo_crawl.slurm | 9 ++++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 3871c2f2..07175251 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -1,23 +1,18 @@ +import os import json import logging import subprocess -import sys from argparse import ArgumentParser from pathlib import Path from statistics import mean import datasets -from bs4 import BeautifulSoup -from bs4.dammit import EncodingDetector from datasets import config, load_from_disk from datasets.utils.logging import set_verbosity_info set_verbosity_info() logger = logging.getLogger(__name__) -# For `soup.decode_content` that can hit the limit -sys.setrecursionlimit(10000) - def get_args(): parser = ArgumentParser() @@ -59,6 +54,11 @@ def main(): args = get_args() logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + if os.path.isfile(args.save_path_stats_json): + logger.info(f" --- Statistics already computed for seed id {args.seed_id} ") + return + + logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") if not args.use_datasets_caching: datasets.set_caching_enabled(False) else: @@ -92,8 +92,10 @@ def main(): ds_html = splits[selected_mime_types[0]] + logger.info(f"the currents splits are {data_stats}.") + def get_length_text(example): - example["length_text"] = len(example["text"]) + example["length_text"] = len(example["text"]) if example["text"] is not None else 0 return example cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", 
"url_host_tld"]] @@ -105,7 +107,9 @@ def get_length_text(example): ) data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) - data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0]) + + non_empty_texts = [e for e in ds_html["length_text"] if e != 0] + data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None data_stats["seed_id"] = args.seed_id logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") @@ -119,7 +123,8 @@ def get_length_text(example): subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) save_path = Path(args.save_path_stats_full_json) - save_path_tmp = f"{str(save_path.absolute())}.tmp" + tmp_file_name = f"tmp-{str(save_path.name)}" + save_path_tmp = os.path.join(save_path.parent, tmp_file_name) logger.info(f"Saving the dataset at {save_path_tmp}") ds_html.to_json( save_path_tmp, diff --git a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm index 5e4a3867..acca9c78 100644 --- a/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm +++ b/dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm @@ -1,12 +1,12 @@ #!/bin/bash -#SBATCH --job-name=pseudo_crawl_compute_stats +#SBATCH --job-name=pseudo_crawl_compute_stats_v5 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=4 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --partition=cpu_p1 #SBATCH --time 10:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out # output file name +#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats_v5/%x-%j.out # output file name #SBATCH --array=1-604 #SBATCH --account=six@cpu @@ -26,9 +26,10 @@ echo "Computing stats on seed id ${SEED_ID}" DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json -SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full +SAVE_STATS_PATH_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full +SAVE_STATS_PATH_FULL=$SAVE_STATS_PATH_DIR/full.jsonl.gz -mkdir -p $SAVE_STATS_PATH_FULL +mkdir -p $SAVE_STATS_PATH_DIR export HF_DATASETS_OFFLINE=1 export HF_DATASETS_CACHE=$SCRATCH/to_delete From 11d0fd129fd3f3559c16b6e18c72e9a28c715bdb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Feb 2022 08:25:40 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dashboard/python_scripts/compute_stats.py | 54 +++++++++++++++++------ 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 07175251..970dd718 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -22,11 +22,19 @@ def get_args(): required=True, help="path to the parquet dataset folder", ) - parser.add_argument("--save-path-stats-json", type=str, help="Where to 
save the stats json.") - parser.add_argument("--save-path-stats-full-json", type=str, help="Where to save the stats json.") - parser.add_argument("--save-batch-size", type=int, required=True, help="Batch size when writing.") + parser.add_argument( + "--save-path-stats-json", type=str, help="Where to save the stats json." + ) + parser.add_argument( + "--save-path-stats-full-json", type=str, help="Where to save the stats json." + ) + parser.add_argument( + "--save-batch-size", type=int, required=True, help="Batch size when writing." + ) parser.add_argument("--use-datasets-caching", action="store_true") - parser.add_argument("--num-proc", type=int, default=1, help="Number of procs use for preprocessing.") + parser.add_argument( + "--num-proc", type=int, default=1, help="Number of procs use for preprocessing." + ) parser.add_argument( "--seed-id", type=int, @@ -52,17 +60,21 @@ def main(): level=logging.INFO, ) args = get_args() - logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ") + logger.info( + f"** The job is runned with the following arguments: **\n{args}\n **** " + ) if os.path.isfile(args.save_path_stats_json): logger.info(f" --- Statistics already computed for seed id {args.seed_id} ") return - + logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") if not args.use_datasets_caching: datasets.set_caching_enabled(False) else: - logger.info(f"the datasets results will be cached at {config.HF_DATASETS_CACHE}.") + logger.info( + f"the datasets results will be cached at {config.HF_DATASETS_CACHE}." + ) ds = load_from_disk(args.dataset_path) @@ -73,7 +85,9 @@ def main(): splits = { **{ mime_type: ds.filter( - lambda mime_types_: [mime_type_ == mime_type for mime_type_ in mime_types_], + lambda mime_types_: [ + mime_type_ == mime_type for mime_type_ in mime_types_ + ], input_columns="content_mime_detected", batched=True, num_proc=args.num_proc, @@ -81,7 +95,9 @@ def main(): for mime_type in selected_mime_types }, "others": ds.filter( - lambda mime_types_: [mime_type_ not in selected_mime_types for mime_type_ in mime_types_], + lambda mime_types_: [ + mime_type_ not in selected_mime_types for mime_type_ in mime_types_ + ], input_columns="content_mime_detected", batched=True, num_proc=args.num_proc, @@ -95,10 +111,16 @@ def main(): logger.info(f"the currents splits are {data_stats}.") def get_length_text(example): - example["length_text"] = len(example["text"]) if example["text"] is not None else 0 + example["length_text"] = ( + len(example["text"]) if example["text"] is not None else 0 + ) return example - cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]] + cols_to_remove = [ + col + for col in ds.column_names + if col not in ["content_languages", "url_host_tld"] + ] ds_html = ds_html.map( get_length_text, batched=False, @@ -109,10 +131,14 @@ def get_length_text(example): data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0]) non_empty_texts = [e for e in ds_html["length_text"] if e != 0] - data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None + data_stats["html_mean_length_non_empty_text"] = ( + mean(non_empty_texts) if non_empty_texts != [] else None + ) data_stats["seed_id"] = args.seed_id - logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.") + logger.info( + f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows." 
+ ) save_path = Path(args.save_path_stats_json) save_path_tmp = f"{str(save_path.absolute())}.tmp" @@ -124,7 +150,7 @@ def get_length_text(example): save_path = Path(args.save_path_stats_full_json) tmp_file_name = f"tmp-{str(save_path.name)}" - save_path_tmp = os.path.join(save_path.parent, tmp_file_name) + save_path_tmp = os.path.join(save_path.parent, tmp_file_name) logger.info(f"Saving the dataset at {save_path_tmp}") ds_html.to_json( save_path_tmp, From d4f38a56ca687f693d88cda6bbe6c63ef2ce4579 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:33:47 +0100 Subject: [PATCH 4/7] Update dashboard/python_scripts/compute_stats.py Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- dashboard/python_scripts/compute_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index 970dd718..f765e973 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -23,7 +23,7 @@ def get_args(): help="path to the parquet dataset folder", ) parser.add_argument( - "--save-path-stats-json", type=str, help="Where to save the stats json." + "--save-path-stats-json", type=str, required=True, help="Where to save the stats json." ) parser.add_argument( "--save-path-stats-full-json", type=str, help="Where to save the stats json." From f644acab0e6815204b18a2f6b5b3cd197d1970ae Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:35:18 +0100 Subject: [PATCH 5/7] Update dashboard/python_scripts/compute_stats.py Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- dashboard/python_scripts/compute_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index f765e973..ed0a9112 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -78,7 +78,7 @@ def main(): ds = load_from_disk(args.dataset_path) - if args.num_examples: + if args.num_examples is not None: ds = ds.select([i for i in range(args.num_examples)]) selected_mime_types = ["text/html"] From 81f0062d87ee58d265e52e5628ec37f3a9d7be8b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 14:35:36 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dashboard/python_scripts/compute_stats.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py index ed0a9112..012e48db 100644 --- a/dashboard/python_scripts/compute_stats.py +++ b/dashboard/python_scripts/compute_stats.py @@ -23,7 +23,10 @@ def get_args(): help="path to the parquet dataset folder", ) parser.add_argument( - "--save-path-stats-json", type=str, required=True, help="Where to save the stats json." + "--save-path-stats-json", + type=str, + required=True, + help="Where to save the stats json.", ) parser.add_argument( "--save-path-stats-full-json", type=str, help="Where to save the stats json." 
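A side note on PATCH 5/7 above: `--num-examples` defaults to None, so the original `if args.num_examples:` also skipped the `select` call whenever a caller explicitly passed 0, because 0 is falsy; the explicit `is not None` test only skips the subset selection when the flag is genuinely absent. Below is a minimal sketch of the difference, using a hypothetical value rather than anything taken from the SLURM array jobs.

# Minimal sketch (not part of any patch): why the explicit `is not None` test
# in PATCH 5/7 behaves differently from a bare truthiness check.
num_examples = 0  # hypothetical: a user debugging with `--num-examples 0`

if num_examples:
    print("truthiness check: would select a subset")  # never reached for 0

if num_examples is not None:
    print(f"`is not None` check: selects the first {num_examples} examples")
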
From e17c42de586869085df138df3d3aac10d650e0f9 Mon Sep 17 00:00:00 2001
From: SaulLu <55560583+SaulLu@users.noreply.github.com>
Date: Tue, 8 Feb 2022 15:37:21 +0100
Subject: [PATCH 7/7] Update dashboard/python_scripts/compute_stats.py

Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
---
 dashboard/python_scripts/compute_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dashboard/python_scripts/compute_stats.py b/dashboard/python_scripts/compute_stats.py
index 012e48db..505b3775 100644
--- a/dashboard/python_scripts/compute_stats.py
+++ b/dashboard/python_scripts/compute_stats.py
@@ -109,7 +109,7 @@ def main():
 
     data_stats = {f"{split_name}_total": len(ds) for split_name, ds in splits.items()}
 
-    ds_html = splits[selected_mime_types[0]]
+    ds_html = splits["text/html"]
 
     logger.info(f"the currents splits are {data_stats}.")
 
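
Taken together, the series leaves compute_stats.py splitting each seed dataset by detected MIME type and reporting, for the text/html split, the number of empty texts and the mean length of the non-empty ones. The sketch below reproduces that computation on a tiny in-memory dataset: the column names mirror the real pseudo-crawl schema, but the rows and values are invented for illustration, and none of the argparse, caching, or SLURM plumbing is included.

# Illustrative sketch only: a toy dataset standing in for one pseudo-crawl seed.
from statistics import mean

from datasets import Dataset

ds = Dataset.from_dict(
    {
        "content_mime_detected": ["text/html", "text/html", "application/pdf"],
        "text": ["<html>hello</html>", None, None],
    }
)

# Same filter call the script uses: batched, on the detected-MIME column.
ds_html = ds.filter(
    lambda mime_types_: [m == "text/html" for m in mime_types_],
    input_columns="content_mime_detected",
    batched=True,
)

lengths = [len(t) if t is not None else 0 for t in ds_html["text"]]
non_empty = [length for length in lengths if length != 0]

data_stats = {
    "text/html_total": len(ds_html),
    "others_total": len(ds) - len(ds_html),  # simplification of the script's second filter
    "html_empty_text": sum(1 for length in lengths if length == 0),
    "html_mean_length_non_empty_text": mean(non_empty) if non_empty else None,
}
print(data_stats)

In the patched script itself the resulting dictionary is written atomically: it is first dumped to a temporary file and only moved to the final stats.json with mv once the write has finished, so an interrupted array task never leaves a truncated file at the destination.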