From c9fdaa6421733e64c785afe87428628383360c96 Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Tue, 11 Mar 2025 14:28:19 -0700
Subject: [PATCH 1/8] add tfidf

---
 dask_ml/feature_extraction/text.py    | 96 +++++++++++++++++++++++++++
 tests/feature_extraction/test_text.py | 16 +++++
 2 files changed, 112 insertions(+)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 9710d03a6..22ffcf111 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -3,6 +3,8 @@
 """
 import itertools
+import math
+from typing import Literal, Optional
 
 import dask
 import dask.array as da
@@ -13,10 +15,14 @@
 import scipy.sparse
 import sklearn.base
 import sklearn.feature_extraction.text
+import sparse
 from dask.delayed import Delayed
 from distributed import get_client, wait
+from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
+from ..utils import check_array
+
 
 class _BaseHasher(sklearn.base.BaseEstimator):
     @property
@@ -280,3 +286,93 @@ def _merge_vocabulary(*vocabularies):
         )
     }
     return vocabulary
+
+
+def _handle_zeros_in_scale(scale: da.Array) -> da.Array:
+
+    constant_mask = scale < 10 * np.finfo(scale.dtype).eps
+
+    scale[constant_mask] = 1.0
+
+    return scale
+
+
+def _normalize(X: da.Array, norm: Literal["l2", "l1", "max"] = "l2") -> da.Array:
+    if norm not in ("l1", "l2", "max"):
+        raise ValueError("'%s' is not a supported norm" % norm)
+
+    if norm == "l1":
+        norms = da.abs(X).sum(axis=1)
+    elif norm == "l2":
+        norms = da.sqrt(da.square(X).sum(axis=1))
+    elif norm == "max":
+        norms = da.max(da.abs(X), axis=1)
+
+    norms = _handle_zeros_in_scale(norms)
+    X = X / norms[:, np.newaxis]
+
+    return X
+
+
+class TfidfTransformer(
+    OneToOneFeatureMixin, TransformerMixin, BaseEstimator, auto_wrap_output_keys=None
+):
+
+    def __init__(
+        self,
+        *,
+        norm: Optional[Literal["l2", "l1"]] = "l2",
+        use_idf: bool = True,
+        smooth_idf: bool = True,
+        sublinear_tf: bool = False,
+    ):
+        self.norm = norm
+        self.use_idf = use_idf
+        self.smooth_idf = smooth_idf
+        self.sublinear_tf = sublinear_tf
+
+    def fit(self, X, y=None):
+        X = check_array(X)
+
+        if self.use_idf:
+
+            X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
+
+            if math.isnan(
+                X.shape[0]
+            ):  # if the number of rows is not currently known, get it
+                X.compute_chunk_sizes()
+
+            n_samples = X.shape[0]
+
+            df = da.count_nonzero(X, axis=0)
+
+            if self.smooth_idf:
+                n_samples = n_samples + 1
+                df = df + 1
+
+            idf_ = da.log(n_samples / df) + 1
+
+            self.idf_ = idf_.compute()
+
+        return self
+
+    def transform(self, X):
+
+        if self.use_idf:
+            check_is_fitted(self, "idf_")
+
+        X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
+
+        if self.sublinear_tf:
+            X = da.where(X != 0, da.log(X) + 1, X)
+
+        if self.use_idf:
+            tf_idf = X * self.idf_
+        else:
+            tf_idf = X
+
+        if self.norm:
+            tf_idf = _normalize(tf_idf, norm=self.norm)
+
+        return tf_idf
diff --git a/tests/feature_extraction/test_text.py b/tests/feature_extraction/test_text.py
index e833a30d3..6e7ece133 100644
--- a/tests/feature_extraction/test_text.py
+++ b/tests/feature_extraction/test_text.py
@@ -188,3 +188,19 @@ def test_count_vectorizer_remote_vocabulary():
     )
     m.fit_transform(b)
     assert m.vocabulary_ is remote_vocabulary
+
+@pytest.mark.parametrize("norm", [None, "l2","l1"])
+@pytest.mark.parametrize("smooth_idf", [True, False])
+@pytest.mark.parametrize("sublinear_tf", [True, False])
+@pytest.mark.parametrize("use_idf", [True, False])
+def test_tf_idf(data,norm,smooth_idf,sublinear_tf,use_idf):
+    x = sklearn.feature_extraction.text.CountVectorizer().fit(JUNK_FOOD_DOCS)
+
+    d_tf_idf = dask_ml.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
+    d_tf_idf_result = d_tf_idf.fit_transform(x).compute()
+
+
+    sk_tf_idf = sklearn.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
+    sk_tf_idf_result = sk_tf_idf.fit_transform(x)
+
+    np.testing.assert_array_almost_equal(d_tf_idf_result.todense().astype(np.float64),sk_tf_idf_result.todense().astype(np.float64))
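Note on the idf convention in fit() above: it follows scikit-learn's
TfidfTransformer, where idf = ln(n / df) + 1 and smooth_idf=True adds one to
both counts, giving ln((1 + n) / (1 + df)) + 1. A tiny dense sanity sketch
with a made-up count matrix (illustrative only, not part of the patch):

    import numpy as np

    counts = np.array([[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0]])
    n = counts.shape[0]                    # 4 documents
    df = np.count_nonzero(counts, axis=0)  # document frequency: [4, 0, 1]
    idf = np.log((n + 1) / (df + 1)) + 1   # smooth variant computed in fit()
    print(idf)                             # term in every doc -> idf == 1.0
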
From 9b973069b49486b69b8a7bf4a515a71f07d89357 Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Wed, 12 Mar 2025 16:50:32 -0700
Subject: [PATCH 2/8] fix _normalize and tests

---
 dask_ml/feature_extraction/text.py    |  5 +----
 tests/feature_extraction/test_text.py | 11 ++++++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 22ffcf111..003876b22 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -292,7 +292,7 @@ def _handle_zeros_in_scale(scale: da.Array) -> da.Array:
 
     constant_mask = scale < 10 * np.finfo(scale.dtype).eps
 
-    scale[constant_mask] = 1.0
+    scale = da.where(constant_mask,1.0, scale)
 
     return scale
 
@@ -332,8 +332,6 @@ def __init__(
         self.sublinear_tf = sublinear_tf
 
     def fit(self, X, y=None):
-        X = check_array(X)
-
         if self.use_idf:
 
             X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
@@ -358,7 +356,6 @@ def fit(self, X, y=None):
 
         return self
 
     def transform(self, X):
-
         if self.use_idf:
             check_is_fitted(self, "idf_")
diff --git a/tests/feature_extraction/test_text.py b/tests/feature_extraction/test_text.py
index 6e7ece133..9eaa5bfe5 100644
--- a/tests/feature_extraction/test_text.py
+++ b/tests/feature_extraction/test_text.py
@@ -193,14 +193,15 @@ def test_count_vectorizer_remote_vocabulary():
 @pytest.mark.parametrize("smooth_idf", [True, False])
 @pytest.mark.parametrize("sublinear_tf", [True, False])
 @pytest.mark.parametrize("use_idf", [True, False])
-def test_tf_idf(data,norm,smooth_idf,sublinear_tf,use_idf):
-    x = sklearn.feature_extraction.text.CountVectorizer().fit(JUNK_FOOD_DOCS)
+def test_tf_idf(norm,smooth_idf,sublinear_tf,use_idf):
+    corpus = db.from_sequence(JUNK_FOOD_DOCS)
+    x_d= dask_ml.feature_extraction.text.CountVectorizer().fit_transform(corpus)
 
     d_tf_idf = dask_ml.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
-    d_tf_idf_result = d_tf_idf.fit_transform(x).compute()
-
+    d_tf_idf_result = d_tf_idf.fit_transform(x_d).compute()
+    x_n = x_d.compute()
 
     sk_tf_idf = sklearn.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
-    sk_tf_idf_result = sk_tf_idf.fit_transform(x)
+    sk_tf_idf_result = sk_tf_idf.fit_transform(x_n)
 
     np.testing.assert_array_almost_equal(d_tf_idf_result.todense().astype(np.float64),sk_tf_idf_result.todense().astype(np.float64))

From ae5aeb7dd712c022b8d878d84fc6ab4f268c6b01 Mon Sep 17 00:00:00 2001
From: pr38 <57053686+pr38@users.noreply.github.com>
Date: Thu, 13 Mar 2025 17:28:02 -0700
Subject: [PATCH 3/8] Update dask_ml/feature_extraction/text.py

Co-authored-by: Scott Sievert
---
 dask_ml/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 003876b22..3224e8120 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -346,7 +346,7 @@ def fit(self, X, y=None):
             df = da.count_nonzero(X, axis=0)
 
             if self.smooth_idf:
-                n_samples = n_samples + 1
+                n_samples += 1
                 df = df + 1
 
             idf_ = da.log(n_samples / df) + 1
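Note on the da.where rewrite in _handle_zeros_in_scale above: rows whose norm
is (near) zero would otherwise be divided by zero, so their scale is swapped
for 1.0 and they pass through unchanged. A numpy stand-in for the same logic
(illustrative sketch, not part of the series):

    import numpy as np

    X = np.array([[3.0, 4.0], [0.0, 0.0]])
    norms = np.sqrt(np.square(X).sum(axis=1))  # [5.0, 0.0]
    norms = np.where(norms < 10 * np.finfo(norms.dtype).eps, 1.0, norms)
    print(X / norms[:, np.newaxis])            # [[0.6, 0.8], [0.0, 0.0]]
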
From f900275bb2750a0a6b17856fd94cd3f02d7821e9 Mon Sep 17 00:00:00 2001
From: pr38 <57053686+pr38@users.noreply.github.com>
Date: Thu, 13 Mar 2025 17:28:11 -0700
Subject: [PATCH 4/8] Update dask_ml/feature_extraction/text.py

Co-authored-by: Scott Sievert
---
 dask_ml/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 3224e8120..5b19e8b26 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -347,7 +347,7 @@ def fit(self, X, y=None):
 
             if self.smooth_idf:
                 n_samples += 1
-                df = df + 1
+                df += 1
 
             idf_ = da.log(n_samples / df) + 1
 

From 8b597d536c57ba9fcb35341ff0bf5c66eff741c6 Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Thu, 13 Mar 2025 17:45:10 -0700
Subject: [PATCH 5/8] formatting fixes

---
 dask_ml/feature_extraction/text.py    |  9 ++-------
 tests/feature_extraction/test_text.py | 26 +++++++++++++++++---------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 5b19e8b26..6671fe9d5 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -21,8 +21,6 @@
 from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
-from ..utils import check_array
-
 
 class _BaseHasher(sklearn.base.BaseEstimator):
     @property
@@ -292,7 +290,7 @@ def _handle_zeros_in_scale(scale: da.Array) -> da.Array:
 
     constant_mask = scale < 10 * np.finfo(scale.dtype).eps
 
-    scale = da.where(constant_mask,1.0, scale)
+    scale = da.where(constant_mask, 1.0, scale)
 
     return scale
 
@@ -364,10 +362,7 @@ def transform(self, X):
         if self.sublinear_tf:
             X = da.where(X != 0, da.log(X) + 1, X)
 
-        if self.use_idf:
-            tf_idf = X * self.idf_
-        else:
-            tf_idf = X
+        tf_idf = X * self.idf_ if self.use_idf else X
 
         if self.norm:
             tf_idf = _normalize(tf_idf, norm=self.norm)
diff --git a/tests/feature_extraction/test_text.py b/tests/feature_extraction/test_text.py
index 9eaa5bfe5..cee74f818 100644
--- a/tests/feature_extraction/test_text.py
+++ b/tests/feature_extraction/test_text.py
@@ -189,19 +189,27 @@ def test_count_vectorizer_remote_vocabulary():
     m.fit_transform(b)
     assert m.vocabulary_ is remote_vocabulary
 
-@pytest.mark.parametrize("norm", [None, "l2","l1"])
+
+@pytest.mark.parametrize("norm", [None, "l2", "l1"])
 @pytest.mark.parametrize("smooth_idf", [True, False])
 @pytest.mark.parametrize("sublinear_tf", [True, False])
 @pytest.mark.parametrize("use_idf", [True, False])
-def test_tf_idf(norm,smooth_idf,sublinear_tf,use_idf):
+def test_tf_idf(norm, smooth_idf, sublinear_tf, use_idf):
     corpus = db.from_sequence(JUNK_FOOD_DOCS)
-    x_d= dask_ml.feature_extraction.text.CountVectorizer().fit_transform(corpus)
+    x_d = dask_ml.feature_extraction.text.CountVectorizer().fit_transform(corpus)
 
-    d_tf_idf = dask_ml.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
-    d_tf_idf_result = d_tf_idf.fit_transform(x_d).compute()
+    d_tf_idf = dask_ml.feature_extraction.text.TfidfTransformer(
+        norm=norm, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf, use_idf=use_idf
+    )
+
+    d_tf_idf_result = d_tf_idf.fit_transform(x_d).compute()
     x_n = x_d.compute()
-    sk_tf_idf = sklearn.feature_extraction.text.TfidfTransformer(norm=norm,smooth_idf=smooth_idf,sublinear_tf=sublinear_tf,use_idf=use_idf)
-    sk_tf_idf_result = sk_tf_idf.fit_transform(x_n)
-
-    np.testing.assert_array_almost_equal(d_tf_idf_result.todense().astype(np.float64),sk_tf_idf_result.todense().astype(np.float64))
+    sk_tf_idf = sklearn.feature_extraction.text.TfidfTransformer(
+        norm=norm, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf, use_idf=use_idf
+    )
+    sk_tf_idf_result = sk_tf_idf.fit_transform(x_n)
+
+    np.testing.assert_array_almost_equal(
+        d_tf_idf_result.todense().astype(np.float64),
+        sk_tf_idf_result.todense().astype(np.float64),
+    )
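For reference, the sublinear_tf branch kept as context in the transform() hunk
above dampens raw term frequencies: every nonzero tf becomes 1 + ln(tf) while
zeros stay zero. A one-line numpy equivalent (illustrative only, not part of
the series):

    import numpy as np

    tf = np.array([[1.0, 4.0, 0.0]])
    print(np.where(tf != 0, np.log(tf) + 1, tf))  # [[1.0, 2.3862..., 0.0]]
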
From fd8469e426bc271e0c85d9ffd35c8d5e8828b91d Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Sun, 23 Mar 2025 20:51:22 -0700
Subject: [PATCH 6/8] add lazy row counts

---
 dask_ml/feature_extraction/text.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 6671fe9d5..89fb00a6e 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -312,6 +312,22 @@ def _normalize(X: da.Array, norm: Literal["l2", "l1", "max"] = "l2") -> da.Array
     return X
 
 
+def _get_chunk_len(a):
+    s = np.asarray(a.shape[0], dtype=int)
+    return s.reshape(1, 1, 1)
+
+
+def _get_lazy_row_count_as_dask_array(a: da.Array) -> da.Array:
+    chunk_len = a.map_blocks(
+        _get_chunk_len,
+        dtype=int,
+        chunks=tuple(len(c) * (1,) for c in a.chunks) + ((a.ndim,),),
+        new_axis=1,
+    )
+
+    return chunk_len.sum()
+
+
 class TfidfTransformer(
     OneToOneFeatureMixin, TransformerMixin, BaseEstimator, auto_wrap_output_keys=None
 ):
@@ -336,10 +352,10 @@ def fit(self, X, y=None):
 
             if math.isnan(
                 X.shape[0]
-            ):  # if the number of rows is not currently known, get it
-                X.compute_chunk_sizes()
-
-            n_samples = X.shape[0]
+            ):  # if the number of rows is not currently known, get a lazy reference to row counts
+                n_samples = _get_lazy_row_count_as_dask_array(X)
+            else:
+                n_samples = X.shape[0]
 
             df = da.count_nonzero(X, axis=0)
 

From f63f09072e40c2ff6fb94d786574f0f3fe75cc92 Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Sun, 30 Mar 2025 15:00:22 -0700
Subject: [PATCH 7/8] more style + lazy row count update

---
 dask_ml/feature_extraction/text.py | 37 +++++++++++-------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 89fb00a6e..10b9ab58e 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -312,19 +312,12 @@ def _normalize(X: da.Array, norm: Literal["l2", "l1", "max"] = "l2") -> da.Array
     return X
 
 
-def _get_chunk_len(a):
-    s = np.asarray(a.shape[0], dtype=int)
-    return s.reshape(1, 1, 1)
-
-
 def _get_lazy_row_count_as_dask_array(a: da.Array) -> da.Array:
     chunk_len = a.map_blocks(
-        _get_chunk_len,
+        lambda a: np.asarray(a.shape[0], dtype=int).reshape(1, 1),
         dtype=int,
-        chunks=tuple(len(c) * (1,) for c in a.chunks) + ((a.ndim,),),
-        new_axis=1,
+        chunks=tuple(len(c) * (1,) for c in a.chunks),
     )
-
-    return chunk_len.sum()
+    return chunk_len.sum()
 
 
@@ -341,25 +334,21 @@ def __init__(
         self.sublinear_tf = sublinear_tf
 
     def fit(self, X, y=None):
-        if self.use_idf:
+        if not self.use_idf:
+            return self
 
-            X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
+        X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
 
-            if math.isnan(
-                X.shape[0]
-            ):  # if the number of rows is not currently known, get a lazy reference to row counts
-                n_samples = _get_lazy_row_count_as_dask_array(X)
-            else:
-                n_samples = X.shape[0]
+        n_samples = X.shape[0] if not math.isnan(X.shape[0]) else _get_lazy_row_count_as_dask_array(X)
 
-            df = da.count_nonzero(X, axis=0)
+        df = da.count_nonzero(X, axis=0)
 
-            if self.smooth_idf:
-                n_samples += 1
-                df += 1
+        if self.smooth_idf:
+            n_samples += 1
+            df += 1
 
-            idf_ = da.log(n_samples / df) + 1
+        idf_ = da.log(n_samples / df) + 1
 
-            self.idf_ = idf_.compute()
+        self.idf_ = idf_.compute()
 
         return self
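A back-of-the-envelope demo of the lazy row count introduced in the two
patches above: each block reports its own row count through map_blocks, and
summing those 1x1 blocks yields the total number of rows without eagerly
computing chunk sizes. The array below is made up, and a single column chunk
is assumed (with several column chunks every block would contribute a count,
which is what the next patch's chunk_len[:, 0].sum() addresses):

    import dask.array as da
    import numpy as np

    a = da.ones((10, 3), chunks=(4, 3))  # row chunks: (4, 4, 2)
    chunk_len = a.map_blocks(
        lambda b: np.asarray(b.shape[0], dtype=int).reshape(1, 1),
        dtype=int,
        chunks=tuple(len(c) * (1,) for c in a.chunks),  # one 1x1 block per chunk
    )
    print(chunk_len.sum().compute())  # 4 + 4 + 2 == 10
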
From a5ca024684ae5a13f872712ecfd67886f0687ada Mon Sep 17 00:00:00 2001
From: Pouya Rezvanipour
Date: Mon, 14 Apr 2025 13:15:34 -0700
Subject: [PATCH 8/8] lazy row counts for odd chunks + test updates

---
 dask_ml/feature_extraction/text.py    | 11 ++++++---
 tests/feature_extraction/test_text.py |  8 +++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index 10b9ab58e..2c261fa8b 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -318,7 +318,8 @@ def _get_lazy_row_count_as_dask_array(a: da.Array) -> da.Array:
         dtype=int,
         chunks=tuple(len(c) * (1,) for c in a.chunks),
     )
-    return chunk_len.sum()
+
+    return chunk_len[:, 0].sum()
 
 
 class TfidfTransformer(
@@ -341,10 +342,14 @@ def fit(self, X, y=None):
         if not self.use_idf:
            return self
 
         X = X.map_blocks(lambda a: sparse.as_coo(a).astype(np.float64))
 
-        n_samples = X.shape[0] if not math.isnan(X.shape[0]) else _get_lazy_row_count_as_dask_array(X)
+        n_samples = (
+            X.shape[0]
+            if not math.isnan(X.shape[0])
+            else _get_lazy_row_count_as_dask_array(X)
+        )
 
         df = da.count_nonzero(X, axis=0)
 
         if self.smooth_idf:
diff --git a/tests/feature_extraction/test_text.py b/tests/feature_extraction/test_text.py
index cee74f818..2560fcb0c 100644
--- a/tests/feature_extraction/test_text.py
+++ b/tests/feature_extraction/test_text.py
@@ -194,9 +194,15 @@ def test_count_vectorizer_remote_vocabulary():
 @pytest.mark.parametrize("smooth_idf", [True, False])
 @pytest.mark.parametrize("sublinear_tf", [True, False])
 @pytest.mark.parametrize("use_idf", [True, False])
-def test_tf_idf(norm, smooth_idf, sublinear_tf, use_idf):
+@pytest.mark.parametrize("rechunk", [True, False])
+def test_tf_idf(norm, smooth_idf, sublinear_tf, use_idf, rechunk):
     corpus = db.from_sequence(JUNK_FOOD_DOCS)
     x_d = dask_ml.feature_extraction.text.CountVectorizer().fit_transform(corpus)
+    if rechunk:
+        x_d.compute_chunk_sizes()
+        x_d = x_d.rechunk((1, 6))
+        # forget chunk_sizes/shape, to trigger _get_lazy_row_count_as_dask_array
+        x_d._chunks = ((np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), (6,))
 
     d_tf_idf = dask_ml.feature_extraction.text.TfidfTransformer(
         norm=norm, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf, use_idf=use_idf