Inist-CNRS · leogail · Nov 26, 2025
diff --git a/services/text-clustering/v1/clustering.py b/services/text-clustering/v1/clustering.py
@@ -7,10 +7,11 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
-from sklearn.decomposition import PCA
 import random
 import umap
 import os
+import numpy as np
+
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 model = SentenceTransformer("./v1/all-MiniLM-L6-v2")
@@ -199,66 +200,79 @@ def truncate_text_for_teeft(text):
     except Exception:
         indice_out_cluster.append(i)
 
-try:
-    # Dimension reduction
-    umap_model = umap.UMAP(
-        n_neighbors=max(10, min(30, int(len_data/20))),
-        n_components=10,
-        metric="cosine",
-        random_state=42,
-        min_dist=0.0,
-        n_jobs=1
-    )
-    reduced_embeddings = umap_model.fit_transform(texts)
-except Exception:
-    try:
-        pca_model = PCA(n_components=min(10, texts.shape[1]))
-        reduced_embeddings = pca_model.fit_transform(texts)
-    except Exception:
-        indice_out_cluster = [i for i in range(len_data)]
-        reduced_embeddings = []
 
+texts = np.array(texts)  # convert to an ndarray before the emptiness check and dimension reduction
 
-if nb_cluster == 0:
-    nb_cluster = find_optimal_k(reduced_embeddings, max_k=min(21, len(texts)-2))
+if len(texts) == 0:
+    indice_out_cluster = list(range(len_data))  # nothing to reduce: every row is out of cluster
+    reduced_embeddings = np.empty((0, 10))  # empty placeholder with n_components columns
+else:
+    try:
+        umap_model = umap.UMAP(
+            n_neighbors=max(10, min(30, len(texts) // 20)),
+            n_components=10,
+            metric="cosine",
+            random_state=42,
+            min_dist=0.0,
+            n_jobs=1
+        )
+        reduced_embeddings = umap_model.fit_transform(texts)
+    except Exception as e:
+        sys.stderr.write(f"Error in textClustering while UMAP processing : {e}\n")
+        reduced_embeddings = center_reduce(texts)  # fallback when UMAP fails
+
+n_reduced = reduced_embeddings.shape[0]  # row count of the reduced matrix
+if n_reduced < nb_cluster:
+    nb_cluster = max(1, n_reduced - 1)  # fewer rows than requested clusters: shrink, never below 1
+elif nb_cluster == 0:
+    nb_cluster = find_optimal_k(reduced_embeddings, max_k=min(21, len(texts)-2))
+
+try:
+    clusterer = KMeans(n_clusters=nb_cluster, random_state=42)
+    clusterer.fit(reduced_embeddings)
+    clustering_done = True
+except Exception as e:
+    sys.stderr.write(f"Error in textClustering while KMEANS processing : {e}\n")
+    clustering_done = False  # downstream code skips keyword extraction on failure
 
-# Clustering
-clusterer = KMeans(n_clusters=nb_cluster, random_state=42)
-clusterer.fit(reduced_embeddings)
 
 
 # Create datas for teeft
 indice_in_cluster = 0
 keywords = (
     {}
 )  # keywords is a dictionary, the key is the cluster and value texts from clus
-for i in range(len_data):
-    if i not in indice_out_cluster:
-        label = int(clusterer.labels_[indice_in_cluster] + 1)
-        if label != 0:
-            if label in keywords:
-                keywords[label] += "\n\n" + str(all_data[i]["value"])
-            else:
-                keywords[label] = str(all_data[i]["value"])
-        indice_in_cluster += 1
 
-# Execute teeft
-n_clusters = len(keywords)
-for i in range(n_clusters):
-    if i+1 in keywords:
-        keywords[i+1] = truncate_text_for_teeft(keywords[i+1])
-        data = {"id": i + 1, "value": keywords[i + 1]}
-        keywords[i + 1] = teeft(data, n_keywords)
-    else:
-        continue
+if clustering_done:
+    # A set keeps the per-row membership test O(1) instead of scanning the list.
+    rows_out_of_cluster = set(indice_out_cluster)
+    for i in range(len_data):
+        if i not in rows_out_of_cluster:
+            # labels_ only covers clustered rows; +1 leaves key 0 free for the noise cluster.
+            label = int(clusterer.labels_[indice_in_cluster] + 1)
+            if label != 0:
+                if label in keywords:
+                    keywords[label] += "\n\n" + str(all_data[i]["value"])
+                else:
+                    keywords[label] = str(all_data[i]["value"])
+            indice_in_cluster += 1
+
+    # Execute teeft on every cluster present, even when cluster ids are not contiguous
+    n_clusters = len(keywords)
+    for cluster_id in sorted(keywords):
+        keywords[cluster_id] = truncate_text_for_teeft(keywords[cluster_id])
+        data = {"id": cluster_id, "value": keywords[cluster_id]}
+        keywords[cluster_id] = teeft(data, n_keywords)
 
-# Filter dict : delete every keywords who has a to big frequency
-try:
-    keywords = filter_keywords(keywords, threshold=0.5)
-except Exception:
-    pass
-# Add res for noise cluster
-keywords[0] = []
+    # Filter dict: drop every keyword whose frequency is too high
+    try:
+        keywords = filter_keywords(keywords, threshold=0.5)
+    except Exception as e:
+        sys.stderr.write(f"Error in textClustering while filtering keywords : {e}\n")
+    # Add an empty result for the noise cluster
+    keywords[0] = []
+else:
+    indice_out_cluster = list(range(len_data))  # clustering failed: every row is out of cluster
 
 # extract infos and return teeft res
 indice_in_cluster = 0

diff --git a/services/text-clustering/v1/noise.py b/services/text-clustering/v1/noise.py
@@ -24,6 +24,8 @@ def center_reduce(matrix):
     array-like: The centered and reduced matrix.
     """
     # center and reduce
+    if matrix is None or len(matrix) == 0:
+        return matrix
     scaler = StandardScaler()
     scaler.fit(matrix)
     matrix_center_reduce = scaler.transform(matrix) 
@@ -83,8 +85,11 @@ def center_reduce(matrix):
     random_state=42,
     n_jobs=1)
 
-reduced_embeddings = umap_model.fit_transform(texts)
-
+try:
+    reduced_embeddings = umap_model.fit_transform(texts)
+except Exception as e:
+    sys.stderr.write(f"Error in noiseDetect while UMAP processing : {e}\n")
+    reduced_embeddings = center_reduce(texts)  # fallback when UMAP fails
 
 # HDBSCAN with scikit-learn
 clusterer = HDBSCAN(
@@ -96,7 +101,11 @@ def center_reduce(matrix):
     cluster_selection_method="eom",
     n_jobs=-1) 
 
-clusterer.fit(reduced_embeddings)
+try:
+    clusterer.fit(reduced_embeddings)
+except Exception as e:
+    sys.stderr.write(f"Error in noiseDetect while HDBSCAN processing : {e}\n")
+    indice_out_cluster = list(range(len_data))  # clustering failed: every row is out of cluster
 
 
 # extract infos
@@ -114,7 +123,5 @@ def center_reduce(matrix):
             line["value"] = "relevant"
         # Increment only if the row isn't noise (they aren't count in "clusterer model")
         indice_in_cluster += 1
-    text_output += json.dumps(line)
-    text_output += "\n"
-
-sys.stdout.write(text_output)
+    sys.stdout.write(json.dumps(line))
+    sys.stdout.write("\n")