From 8cffeb15c4e377d5e0194ead7c180c9cc06102cb Mon Sep 17 00:00:00 2001 From: Dmitrii Cherkasov Date: Tue, 15 Jul 2025 12:10:24 -0700 Subject: [PATCH 1/4] Relaxes matplotlib dependency --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 195f9d573..edd13f350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,8 +68,8 @@ dependencies = [ "fsspec>=0.8.7", "gitpython>=3.1.2", "jinja2>=2.11.2", - "matplotlib>=3.1.3,<=3.8.4", - "numpy>=1.19.2,<2.0.0", + "matplotlib>=3.1.3", + "numpy>=1.19.2", "oci>=2.148.0", "ocifs>=1.1.3", "pandas>=2.2.0", From f8474bfe1bdbe6151615512ba3aaa8ad6d639a45 Mon Sep 17 00:00:00 2001 From: Dmitrii Cherkasov Date: Tue, 15 Jul 2025 12:11:55 -0700 Subject: [PATCH 2/4] Relaxes scikit-learn --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index edd13f350..c560374fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dependencies = [ "psutil>=5.7.2", "python_jsonschema_objects>=0.3.13", "requests", - "scikit-learn>=1.0,<1.6.0", + "scikit-learn>=1.0", "tabulate>=0.8.9", "tqdm>=4.59.0", "pydantic>=2.6.3", From 1f639ee104269010597bca36c5cf51f045611199 Mon Sep 17 00:00:00 2001 From: Dmitrii Cherkasov Date: Tue, 15 Jul 2025 12:26:03 -0700 Subject: [PATCH 3/4] Fixes numpy --- .../feature_type/boolean.py | 29 ++++++++-------- .../feature_type/category.py | 23 ++++++------- .../feature_type/continuous.py | 25 +++++++------- .../feature_type/datetime.py | 23 ++++++------- ads/feature_engineering/feature_type/gis.py | 28 ++++++++-------- .../feature_type/integer.py | 21 ++++++------ .../feature_type/ip_address.py | 16 +++++---- .../feature_type/ip_address_v4.py | 13 ++++---- .../feature_type/ip_address_v6.py | 13 ++++---- .../feature_type/lat_long.py | 29 ++++++++-------- .../feature_type/phone_number.py | 15 +++++---- .../feature_type/string.py | 24 +++++++------- ads/feature_engineering/feature_type/text.py | 13 ++++---- .../feature_type/zip_code.py | 20 ++++++----- ads/feature_engineering/utils.py | 26 ++++++++------- ads/model/transformer/onnx_transformer.py | 15 ++++----- ads/templates/score.jinja2 | 6 ++-- ads/templates/score_onnx.jinja2 | 6 ++-- ads/templates/score_onnx_new.jinja2 | 6 ++-- .../test_feature_domain_schema.py | 25 +++++++------- .../feature_types/test_feature_stat.py | 24 +++++++------- .../model/test_model_metadata.py | 33 ++++++++++--------- .../feature_engineering/test_feature_plot.py | 32 +++++++++--------- .../feature_types/test_feature_types.py | 17 +++++----- 24 files changed, 248 insertions(+), 234 deletions(-) diff --git a/ads/feature_engineering/feature_type/boolean.py b/ads/feature_engineering/feature_type/boolean.py index b8b108c0e..7b3d46dd4 100644 --- a/ads/feature_engineering/feature_type/boolean.py +++ b/ads/feature_engineering/feature_type/boolean.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -15,19 +14,21 @@ default_handler(data: pd.Series) -> pd.Series Processes given data and indicates if the data matches requirements. """ + import matplotlib.pyplot as plt import pandas as pd + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( + SchemeTeal, _count_unique_missing, - is_boolean, _set_seaborn_theme, - SchemeTeal, -) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, + is_boolean, ) @@ -44,7 +45,7 @@ def default_handler(data: pd.Series, *args, **kwargs) -> pd.Series: :class:`pandas.Series` The logical list indicating if the data matches requirements. """ - return pd.Series((is_boolean(value) for value in data.values)) + return pd.Series(is_boolean(value) for value in data.values) class Boolean(FeatureType): @@ -74,7 +75,7 @@ class Boolean(FeatureType): >>> from ads.feature_engineering.feature_type.boolean import Boolean >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series([True, False, True, False, np.NaN, None], name='bool') + >>> s = pd.Series([True, False, True, False, np.nan, None], name='bool') >>> s.ads.feature_type = ['boolean'] >>> Boolean.validator.is_boolean(s) 0 True @@ -106,7 +107,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> s = pd.Series([True, False, True, False, np.NaN, None], name='bool') + >>> s = pd.Series([True, False, True, False, np.nan, None], name='bool') >>> s.ads.feature_type = ['boolean'] >>> s.ads.feature_stat() Metric Value @@ -134,7 +135,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- - >>> s = pd.Series([True, False, True, False, np.NaN, None], name='bool') + >>> s = pd.Series([True, False, True, False, np.nan, None], name='bool') >>> s.ads.feature_type = ['boolean'] >>> s.ads.feature_plot() """ @@ -155,7 +156,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series([True, False, True, False, np.NaN, None], name='bool') + >>> s = pd.Series([True, False, True, False, np.nan, None], name='bool') >>> s.ads.feature_type = ['boolean'] >>> s.ads.feature_domain() constraints: diff --git a/ads/feature_engineering/feature_type/category.py b/ads/feature_engineering/feature_type/category.py index 61ed1e3ef..7dfaa6a3d 100644 --- a/ads/feature_engineering/feature_type/category.py +++ b/ads/feature_engineering/feature_type/category.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,18 +10,20 @@ Category The Category feature type. """ + import matplotlib.pyplot as plt import pandas as pd + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( + SchemeTeal, _count_unique_missing, _set_seaborn_theme, - SchemeTeal, -) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, ) @@ -71,7 +72,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- >>> cat = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='сategory') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='сategory') >>> cat.ads.feature_type = ['сategory'] >>> cat.ads.feature_stat() Metric Value @@ -100,7 +101,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- >>> cat = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='сategory') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='сategory') >>> cat.ads.feature_type = ['сategory'] >>> cat.ads.feature_plot() """ @@ -121,7 +122,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- >>> cat = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='category') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='category') >>> cat.ads.feature_type = ['category'] >>> cat.ads.feature_domain() constraints: diff --git a/ads/feature_engineering/feature_type/continuous.py b/ads/feature_engineering/feature_type/continuous.py index 69a359fe2..4cb4b9b41 100644 --- a/ads/feature_engineering/feature_type/continuous.py +++ b/ads/feature_engineering/feature_type/continuous.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,19 +10,21 @@ Continuous The Continuous feature type. """ + import matplotlib.pyplot as plt import pandas as pd + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( - _add_missing, - _set_seaborn_theme, SchemeTeal, + _add_missing, _format_stat, -) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, + _set_seaborn_theme, ) @@ -62,7 +63,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, - 4.43, 3.26, np.NaN, None], name='continuous') + 4.43, 3.26, np.nan, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feature_stat() Metric Value @@ -99,7 +100,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, - 4.43, 3.26, np.NaN, None], name='continuous') + 4.43, 3.26, np.nan, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feture_plot() @@ -125,7 +126,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, - 4.43, 3.26, np.NaN, None], name='continuous') + 4.43, 3.26, np.nan, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/datetime.py b/ads/feature_engineering/feature_type/datetime.py index 2131618e3..1e4a4f503 100644 --- a/ads/feature_engineering/feature_type/datetime.py +++ b/ads/feature_engineering/feature_type/datetime.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,21 +10,19 @@ DateTime The DateTime feature type. """ + import matplotlib.pyplot as plt import numpy as np import pandas as pd import pandas.api.types as pdtypes -from ads.feature_engineering.feature_type.base import FeatureType -from ads.feature_engineering.utils import ( - _add_missing, - _set_seaborn_theme, - SchemeTeal, -) -from ads.feature_engineering import schema + from ads.common.decorator.runtime_dependency import ( - runtime_dependency, OptionalDependency, + runtime_dependency, ) +from ads.feature_engineering import schema +from ads.feature_engineering.feature_type.base import FeatureType +from ads.feature_engineering.utils import SchemeTeal, _add_missing, _set_seaborn_theme def default_handler(data: pd.Series, *args, **kwargs) -> pd.Series: @@ -123,12 +120,12 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: df_stat = pd.Series( { "count": len(x), - "sample maximum": x.replace(r"", np.NaN).dropna().max(), - "sample minimum": x.replace(r"", np.NaN).dropna().min(), + "sample maximum": x.replace(r"", np.nan).dropna().max(), + "sample minimum": x.replace(r"", np.nan).dropna().min(), }, name=x.name, ).to_frame() - return _add_missing(x.replace(r"", np.NaN), df_stat) + return _add_missing(x.replace(r"", np.nan), df_stat) @staticmethod @runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ) diff --git a/ads/feature_engineering/feature_type/gis.py b/ads/feature_engineering/feature_type/gis.py index 44e805aba..06589c9f4 100644 --- a/ads/feature_engineering/feature_type/gis.py +++ b/ads/feature_engineering/feature_type/gis.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,20 +10,23 @@ GIS The GIS feature type. """ + +import re + import matplotlib.pyplot as plt import pandas as pd -import re + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( - _count_unique_missing, - _str_lat_long_to_point, SchemeNeutral, SchemeTeal, -) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, + _count_unique_missing, + _str_lat_long_to_point, ) PATTERN = re.compile(r"^[(]?(\-?\d+\.\d+?),\s*(\-?\d+\.\d+?)[)]?$", re.VERBOSE) @@ -126,7 +128,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='gis' @@ -165,7 +167,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='gis' @@ -221,7 +223,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='gis' diff --git a/ads/feature_engineering/feature_type/integer.py b/ads/feature_engineering/feature_type/integer.py index 8e8ec473f..2fc8a5513 100644 --- a/ads/feature_engineering/feature_type/integer.py +++ b/ads/feature_engineering/feature_type/integer.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,19 +10,21 @@ Integer The Integer feature type. """ + import matplotlib.pyplot as plt import pandas as pd + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( - _add_missing, - _set_seaborn_theme, SchemeTeal, + _add_missing, _format_stat, -) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, + _set_seaborn_theme, ) @@ -120,7 +121,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series([True, False, True, False, np.NaN, None], name='integer') + >>> s = pd.Series([True, False, True, False, np.nan, None], name='integer') >>> s.ads.feature_type = ['integer'] >>> s.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/ip_address.py b/ads/feature_engineering/feature_type/ip_address.py index 4be6d95da..9f001566d 100644 --- a/ads/feature_engineering/feature_type/ip_address.py +++ b/ads/feature_engineering/feature_type/ip_address.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,11 +10,14 @@ IpAddress The IpAddress feature type. """ -import pandas as pd + import re + +import pandas as pd + +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import _count_unique_missing -from ads.feature_engineering import schema PATTERNV4 = re.compile( r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)", @@ -75,7 +77,7 @@ class IpAddress(FeatureType): >>> from ads.feature_engineering.feature_type.ip_address import IpAddress >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.NaN, None], name='ip_address') + >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address'] >>> IpAddress.validator.is_ip_address(s) 0 True @@ -96,7 +98,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.NaN, None], name='ip_address') + >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address'] >>> s.ads.feature_stat() Metric Value @@ -118,7 +120,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.NaN, None], name='ip_address') + >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address'] >>> s.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/ip_address_v4.py b/ads/feature_engineering/feature_type/ip_address_v4.py index d698a3b83..069af2b03 100644 --- a/ads/feature_engineering/feature_type/ip_address_v4.py +++ b/ads/feature_engineering/feature_type/ip_address_v4.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,12 +10,14 @@ IpAddressV4 The IpAddressV4 feature type. """ + import re import pandas as pd + +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import _count_unique_missing -from ads.feature_engineering import schema PATTERN = re.compile( r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)", @@ -69,7 +70,7 @@ class IpAddressV4(FeatureType): >>> from ads.feature_engineering.feature_type.ip_address_v4 import IpAddressV4 >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.NaN, None], name='ip_address') + >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address_v4'] >>> IpAddressV4.validator.is_ip_address_v4(s) 0 True @@ -90,7 +91,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> s = pd.Series(['192.168.0.1', '192.168.0.2', '192.168.0.3', '192.168.0.4', np.NaN, None], name='ip_address') + >>> s = pd.Series(['192.168.0.1', '192.168.0.2', '192.168.0.3', '192.168.0.4', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address_v4'] >>> s.ads.feature_stat() Metric Value @@ -112,7 +113,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series(['192.168.0.1', '192.168.0.2', '192.168.0.3', '192.168.0.4', np.NaN, None], name='ip_address_v4') + >>> s = pd.Series(['192.168.0.1', '192.168.0.2', '192.168.0.3', '192.168.0.4', np.nan, None], name='ip_address_v4') >>> s.ads.feature_type = ['ip_address_v4'] >>> s.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/ip_address_v6.py b/ads/feature_engineering/feature_type/ip_address_v6.py index ba1e57191..d7b3c92b0 100644 --- a/ads/feature_engineering/feature_type/ip_address_v6.py +++ b/ads/feature_engineering/feature_type/ip_address_v6.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,12 +10,14 @@ IpAddressV6 The IpAddressV6 feature type. """ + import re import pandas as pd + +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import _count_unique_missing -from ads.feature_engineering import schema PATTERN = re.compile( r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?>> from ads.feature_engineering.feature_type.ip_address_v6 import IpAddressV6 >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.NaN, None], name='ip_address') + >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address_v6'] >>> IpAddressV6.validator.is_ip_address_v6(s) 0 False @@ -90,7 +91,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> s = pd.Series(['2002:db8::', '2001:db8::', '2001:db8::', '2002:db8::', np.NaN, None], name='ip_address') + >>> s = pd.Series(['2002:db8::', '2001:db8::', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address') >>> s.ads.feature_type = ['ip_address_v6'] >>> s.ads.feature_stat() Metric Value @@ -112,7 +113,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series(['2002:db8::', '2001:db8::', '2001:db8::', '2002:db8::', np.NaN, None], name='ip_address_v6') + >>> s = pd.Series(['2002:db8::', '2001:db8::', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address_v6') >>> s.ads.feature_type = ['ip_address_v6'] >>> s.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/lat_long.py b/ads/feature_engineering/feature_type/lat_long.py index af6d4ac74..8c790bb0a 100644 --- a/ads/feature_engineering/feature_type/lat_long.py +++ b/ads/feature_engineering/feature_type/lat_long.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -15,22 +14,24 @@ default_handler(data: pd.Series) -> pd.Series Processes given data and indicates if the data matches requirements. """ + +import re + import matplotlib.pyplot as plt import pandas as pd -import re + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.string import String from ads.feature_engineering.utils import ( - _count_unique_missing, - _str_lat_long_to_point, SchemeNeutral, SchemeTeal, + _count_unique_missing, + _str_lat_long_to_point, ) -from ads.feature_engineering import schema -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, -) - PATTERN = re.compile(r"^[(]?(\-?\d+\.\d+?),\s*(\-?\d+\.\d+?)[)]?$", re.VERBOSE) @@ -131,7 +132,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='latlong' @@ -170,7 +171,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='latlong' @@ -226,7 +227,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None ], name='latlong' diff --git a/ads/feature_engineering/feature_type/phone_number.py b/ads/feature_engineering/feature_type/phone_number.py index 8cdec6f38..e9522a973 100644 --- a/ads/feature_engineering/feature_type/phone_number.py +++ b/ads/feature_engineering/feature_type/phone_number.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -15,12 +14,14 @@ default_handler(data: pd.Series) -> pd.Series Processes given data and indicates if the data matches requirements. """ -import pandas as pd + import re + +import pandas as pd + +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.string import String from ads.feature_engineering.utils import _count_unique_missing -from ads.feature_engineering import schema - PATTERN = re.compile( r"^(\+?\d{1,2}[\s-])?\(?(\d{3})\)?[\s.-]?\d{3}[\s.-]?\d{4}$", re.VERBOSE @@ -91,7 +92,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> s = pd.Series(['2068866666', '6508866666', '2068866666', '', np.NaN, np.nan, None], name='phone') + >>> s = pd.Series(['2068866666', '6508866666', '2068866666', '', np.nan, np.nan, None], name='phone') >>> s.ads.feature_type = ['phone_number'] >>> s.ads.feature_stat() Metric Value @@ -113,7 +114,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> s = pd.Series(['2068866666', '6508866666', '2068866666', '', np.NaN, np.nan, None], name='phone') + >>> s = pd.Series(['2068866666', '6508866666', '2068866666', '', np.nan, np.nan, None], name='phone') >>> s.ads.feature_type = ['phone_number'] >>> s.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/string.py b/ads/feature_engineering/feature_type/string.py index e1408c634..b46c89169 100644 --- a/ads/feature_engineering/feature_type/string.py +++ b/ads/feature_engineering/feature_type/string.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,19 +10,20 @@ String The feature type that represents string values. """ + import matplotlib.pyplot as plt import pandas as pd + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.base import FeatureType from ads.feature_engineering.utils import ( + SchemeNeutral, _count_unique_missing, random_color_func, - SchemeNeutral, -) -from ads.feature_engineering import schema -from ads.common import utils, logger -from ads.common.decorator.runtime_dependency import ( - runtime_dependency, - OptionalDependency, ) @@ -89,7 +89,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- >>> string = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='string') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='string') >>> string.ads.feature_type = ['string'] >>> string.ads.feature_stat() Metric Value @@ -113,7 +113,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- >>> string = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='string') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='string') >>> string.ads.feature_type = ['string'] >>> string.ads.feature_plot() @@ -149,7 +149,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- >>> string = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='string') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='string') >>> string.ads.feature_type = ['string'] >>> string.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/feature_type/text.py b/ads/feature_engineering/feature_type/text.py index 3dfe805b2..eb65abcb7 100644 --- a/ads/feature_engineering/feature_type/text.py +++ b/ads/feature_engineering/feature_type/text.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,16 +10,16 @@ Text The Text feature type. """ + import matplotlib.pyplot as plt import pandas as pd -from ads.feature_engineering.feature_type.string import String -from ads.feature_engineering.utils import random_color_func, SchemeNeutral -from ads.common import utils, logger from ads.common.decorator.runtime_dependency import ( - runtime_dependency, OptionalDependency, + runtime_dependency, ) +from ads.feature_engineering.feature_type.string import String +from ads.feature_engineering.utils import SchemeNeutral, random_color_func class Text(String): @@ -53,7 +52,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- >>> text = pd.Series(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C', 'S', 'S', 'S', - 'S', 'S', 'S', 'Q', 'S', 'S', '', np.NaN, None], name='text') + 'S', 'S', 'S', 'Q', 'S', 'S', '', np.nan, None], name='text') >>> text.ads.feature_type = ['text'] >>> text.ads.feature_plot() diff --git a/ads/feature_engineering/feature_type/zip_code.py b/ads/feature_engineering/feature_type/zip_code.py index 6349592f0..b4ece207b 100644 --- a/ads/feature_engineering/feature_type/zip_code.py +++ b/ads/feature_engineering/feature_type/zip_code.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -15,17 +14,20 @@ default_handler(data: pd.Series) -> pd.Series Processes given data and indicates if the data matches requirements. """ + +import re + import matplotlib.pyplot as plt import pandas as pd -import re + +from ads.feature_engineering import schema from ads.feature_engineering.feature_type.string import String from ads.feature_engineering.utils import ( _count_unique_missing, - _to_lat_long, _plot_gis_scatter, + _to_lat_long, _zip_code, ) -from ads.feature_engineering import schema PATTERN = re.compile(r"^[0-9]{5}(?:-[0-9]{4})?$", re.VERBOSE) @@ -78,7 +80,7 @@ class ZipCode(String): >>> from ads.feature_engineering.feature_type.zip_code import ZipCode >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(["94065", "90210", np.NaN, None], name='zipcode') + >>> s = pd.Series(["94065", "90210", np.nan, None], name='zipcode') >>> ZipCode.validator.is_zip_code(s) 0 True 1 True @@ -97,7 +99,7 @@ def feature_stat(x: pd.Series) -> pd.DataFrame: Examples -------- - >>> zipcode = pd.Series([94065, 90210, np.NaN, None], name='zipcode') + >>> zipcode = pd.Series([94065, 90210, np.nan, None], name='zipcode') >>> zipcode.ads.feature_type = ['zip_code'] >>> zipcode.ads.feature_stat() Metric Value @@ -119,7 +121,7 @@ def feature_plot(x: pd.Series) -> plt.Axes: Examples -------- - >>> zipcode = pd.Series([94065, 90210, np.NaN, None], name='zipcode') + >>> zipcode = pd.Series([94065, 90210, np.nan, None], name='zipcode') >>> zipcode.ads.feature_type = ['zip_code'] >>> zipcode.ads.feature_plot() Returns @@ -138,7 +140,7 @@ def feature_domain(cls, x: pd.Series) -> schema.Domain: Examples -------- - >>> zipcode = pd.Series([94065, 90210, np.NaN, None], name='zipcode') + >>> zipcode = pd.Series([94065, 90210, np.nan, None], name='zipcode') >>> zipcode.ads.feature_type = ['zip_code'] >>> zipcode.ads.feature_domain() constraints: [] diff --git a/ads/feature_engineering/utils.py b/ads/feature_engineering/utils.py index cd85667ee..51a8a7074 100644 --- a/ads/feature_engineering/utils.py +++ b/ads/feature_engineering/utils.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -11,18 +10,21 @@ is_boolean(value: Any) -> bool Checks if value type is boolean. """ -import numpy as np + +import re +from functools import lru_cache +from typing import Any + import matplotlib.pyplot as plt +import numpy as np import pandas as pd -import re + from ads.common.card_identifier import card_identify from ads.common.decorator.runtime_dependency import ( - runtime_dependency, OptionalDependency, + runtime_dependency, ) from ads.feature_engineering.dataset.zip_code_data import zip_code_dict -from functools import lru_cache -from typing import Any class SchemeNeutral(str): @@ -67,7 +69,7 @@ def _add_missing(x, df): """ Adds count of missing values. """ - n_missing = pd.isnull(x.replace(r"", np.NaN)).sum() + n_missing = pd.isnull(x.replace(r"", np.nan)).sum() if n_missing > 0: df.loc["missing"] = n_missing return df @@ -78,7 +80,7 @@ def _count_unique_missing(x): Returns the total count, unique count and count of missing values of a series. """ df_stat = pd.Series( - {"count": len(x), "unique": len(x.replace(r"", np.NaN).dropna().unique())}, + {"count": len(x), "unique": len(x.replace(r"", np.nan).dropna().unique())}, name=x.name, ).to_frame() return _add_missing(x, df_stat) @@ -122,7 +124,7 @@ def random_color_func( h = 179 s = 23 l = int(100.0 * float(random_state.randint(60, 120)) / 255.0) - return "hsl({}, {}%, {}%)".format(h, s, l) + return f"hsl({h}, {s}%, {l}%)" def _is_float(s: str): @@ -135,7 +137,7 @@ def _is_float(s: str): def _str_lat_long_to_point(s): """ Converts input data into formated geometry point - Return formated geometry point string or np.NaN if input string is not valid + Return formated geometry point string or np.nan if input string is not valid """ if isinstance(s, str): coords = s.split(",") @@ -147,7 +149,7 @@ def _str_lat_long_to_point(s): long = long[:-1] if _is_float(lat) and _is_float(long): return "POINT(" + long + " " + lat + ")" - return np.NaN + return np.nan @runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO) diff --git a/ads/model/transformer/onnx_transformer.py b/ads/model/transformer/onnx_transformer.py index 7ac5b7c4c..e0bc11394 100644 --- a/ads/model/transformer/onnx_transformer.py +++ b/ads/model/transformer/onnx_transformer.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2022 Oracle and/or its affiliates. +# Copyright (c) 2022, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import json @@ -16,7 +15,7 @@ # Note to developers: If you make any changes to this class, copy and paste those changes over to # templates/score_onnx.jinja2 and templates/score_onnx_new.jinja2. We do not yet have an automatic way of doing this. -class ONNXTransformer(object): +class ONNXTransformer: """ This is a transformer to convert X [pandas.Dataframe, pd.Series] data into Onnx readable dtypes and formats. It is Serializable, so it can be reloaded at another time. @@ -199,7 +198,7 @@ def _handle_missing_value( X, impute_values=impute_values ) elif isinstance(X, pd.Series): - X = X.replace(r"^\s*$", np.NaN, regex=True) + X = X.replace(r"^\s*$", np.nan, regex=True) if len(impute_values.keys()) == 1: for key, val in impute_values.items(): X = X.fillna(val) @@ -208,7 +207,7 @@ def _handle_missing_value( "Multiple imputed values are provided, but `X` has only one dim." ) else: - raise NotImplemented( + raise NotImplementedError( f"{type(X)} is not supported. Convert `X` to pandas dataframe or numpy array." ) return X @@ -218,11 +217,11 @@ def _handle_missing_value_dataframe(X: pd.DataFrame, impute_values: Dict): for idx, val in impute_values.items(): if isinstance(idx, int): X.iloc[:, idx] = ( - X.iloc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.iloc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) else: X.loc[:, idx] = ( - X.loc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.loc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) return X @@ -294,7 +293,7 @@ def load(filename, **kwargs): The loaded model """ # Make sure you have pandas, numpy, and sklearn imported - with open(filename, "r") as f: + with open(filename) as f: export_dict = json.load(f) onnx_transformer = ONNXTransformer() diff --git a/ads/templates/score.jinja2 b/ads/templates/score.jinja2 index f81122ba2..e068548e4 100644 --- a/ads/templates/score.jinja2 +++ b/ads/templates/score.jinja2 @@ -207,7 +207,7 @@ class ONNXTransformer(object): X, impute_values=impute_values ) elif isinstance(X, pd.Series): - X = X.replace(r"^\s*$", np.NaN, regex=True) + X = X.replace(r"^\s*$", np.nan, regex=True) if len(impute_values.keys()) == 1: for key, val in impute_values.items(): X = X.fillna(val) @@ -226,11 +226,11 @@ class ONNXTransformer(object): for idx, val in impute_values.items(): if isinstance(idx, int): X.iloc[:, idx] = ( - X.iloc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.iloc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) else: X.loc[:, idx] = ( - X.loc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.loc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) return X diff --git a/ads/templates/score_onnx.jinja2 b/ads/templates/score_onnx.jinja2 index fded33300..6548e3865 100644 --- a/ads/templates/score_onnx.jinja2 +++ b/ads/templates/score_onnx.jinja2 @@ -282,7 +282,7 @@ class ONNXTransformer(object): X, impute_values=impute_values ) elif isinstance(X, pd.Series): - X = X.replace(r"^\s*$", np.NaN, regex=True) + X = X.replace(r"^\s*$", np.nan, regex=True) if len(impute_values.keys()) == 1: for key, val in impute_values.items(): X = X.fillna(val) @@ -301,11 +301,11 @@ class ONNXTransformer(object): for idx, val in impute_values.items(): if isinstance(idx, int): X.iloc[:, idx] = ( - X.iloc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.iloc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) else: X.loc[:, idx] = ( - X.loc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.loc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) return X diff --git a/ads/templates/score_onnx_new.jinja2 b/ads/templates/score_onnx_new.jinja2 index aed2dbc65..9ed1bb067 100644 --- a/ads/templates/score_onnx_new.jinja2 +++ b/ads/templates/score_onnx_new.jinja2 @@ -348,7 +348,7 @@ class ONNXTransformer(object): X, impute_values=impute_values ) elif isinstance(X, pd.Series): - X = X.replace(r"^\s*$", np.NaN, regex=True) + X = X.replace(r"^\s*$", np.nan, regex=True) if len(impute_values.keys()) == 1: for key, val in impute_values.items(): X = X.fillna(val) @@ -367,11 +367,11 @@ class ONNXTransformer(object): for idx, val in impute_values.items(): if isinstance(idx, int): X.iloc[:, idx] = ( - X.iloc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.iloc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) else: X.loc[:, idx] = ( - X.loc[:, idx].replace(r"^\s*$", np.NaN, regex=True).fillna(val) + X.loc[:, idx].replace(r"^\s*$", np.nan, regex=True).fillna(val) ) return X diff --git a/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py b/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py index 252570822..fab24808f 100644 --- a/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py +++ b/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (c) 2021, 2024 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os @@ -11,6 +11,8 @@ import numpy as np import pandas as pd import pytest +from sklearn.datasets import load_iris + from ads.common import logger from ads.common.data import ADSData from ads.common.model_artifact import ModelArtifact @@ -21,10 +23,9 @@ Expression, JsonSchemaLoader, Schema, - YamlSchemaLoader, SchemaSizeTooLarge, + YamlSchemaLoader, ) -from sklearn.datasets import load_iris @patch("ads.model.common.utils.fetch_manifest_from_conda_location") @@ -96,7 +97,7 @@ class TestFeatureDomainSchema: ### Phone Number phonenumber = pd.Series( - ["2068866666", "6508866666", "2068866666", "", np.NaN, np.nan, None], + ["2068866666", "6508866666", "2068866666", "", np.nan, np.nan, None], name="phone", ) @@ -114,16 +115,16 @@ class TestFeatureDomainSchema: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None, ], name="latlon", ) ### zip code - zipcode = pd.Series([94065, 90210, np.NaN, None], name="zipcode") + zipcode = pd.Series([94065, 90210, np.nan, None], name="zipcode") ### boolean - boolean = pd.Series([True, False, True, False, np.NaN, None], name="bool") + boolean = pd.Series([True, False, True, False, np.nan, None], name="bool") ### string string = pd.Series( @@ -148,7 +149,7 @@ class TestFeatureDomainSchema: "S", "S", "", - np.NaN, + np.nan, None, ], name="string", @@ -185,7 +186,7 @@ class TestFeatureDomainSchema: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None, ], name="gis", @@ -193,19 +194,19 @@ class TestFeatureDomainSchema: ### ipaddress ip_address = pd.Series( - ["2002:db8::", "192.168.0.1", "2001:db8::", "2002:db8::", np.NaN, None], + ["2002:db8::", "192.168.0.1", "2001:db8::", "2002:db8::", np.nan, None], name="ip_address", ) ### ipaddressv4 ip_address_v4 = pd.Series( - ["192.168.0.1", "192.168.0.2", "192.168.0.3", "192.168.0.4", np.NaN, None], + ["192.168.0.1", "192.168.0.2", "192.168.0.3", "192.168.0.4", np.nan, None], name="ip_address_v4", ) ### ipaddressv6 ip_address_v6 = pd.Series( - ["2002:db8::", "2001:db8::", "2001:db8::", "2002:db8::", np.NaN, None], + ["2002:db8::", "2001:db8::", "2001:db8::", "2002:db8::", np.nan, None], name="ip_address_v6", ) diff --git a/tests/unitary/default_setup/feature_types/test_feature_stat.py b/tests/unitary/default_setup/feature_types/test_feature_stat.py index 0eab7b946..b1fd6d80a 100644 --- a/tests/unitary/default_setup/feature_types/test_feature_stat.py +++ b/tests/unitary/default_setup/feature_types/test_feature_stat.py @@ -1,19 +1,19 @@ #!/usr/bin/env python -# Copyright (c) 2021, 2023 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os + import numpy as np import pandas as pd import pytest - from sklearn.datasets import load_iris +from ads.feature_engineering.feature_type.string import String from ads.feature_engineering.feature_type_manager import ( FeatureTypeManager as feature_type_manager, ) -from ads.feature_engineering.feature_type.string import String class TestFeatureStat: @@ -66,7 +66,7 @@ class TestFeatureStat: ### Phone Number phonenumber = pd.Series( - ["2068866666", "6508866666", "2068866666", "", np.NaN, np.nan, None], + ["2068866666", "6508866666", "2068866666", "", np.nan, np.nan, None], name="phone", ) @@ -84,16 +84,16 @@ class TestFeatureStat: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None, ], name="latlon", ) ### zip code - zipcode = pd.Series([94065, 90210, np.NaN, None], name="zipcode") + zipcode = pd.Series([94065, 90210, np.nan, None], name="zipcode") ### boolean - boolean = pd.Series([True, False, True, False, np.NaN, None], name="bool") + boolean = pd.Series([True, False, True, False, np.nan, None], name="bool") ### string string = pd.Series( @@ -118,7 +118,7 @@ class TestFeatureStat: "S", "S", "", - np.NaN, + np.nan, None, ], name="string", @@ -155,7 +155,7 @@ class TestFeatureStat: "-44.510428,-169.269477", "-56.3344375,-166.407038", "", - np.NaN, + np.nan, None, ], name="gis", @@ -163,19 +163,19 @@ class TestFeatureStat: ### ipaddress ip_address = pd.Series( - ["2002:db8::", "192.168.0.1", "2001:db8::", "2002:db8::", np.NaN, None], + ["2002:db8::", "192.168.0.1", "2001:db8::", "2002:db8::", np.nan, None], name="ip_address", ) ### ipaddressv4 ip_address_v4 = pd.Series( - ["192.168.0.1", "192.168.0.2", "192.168.0.3", "192.168.0.4", np.NaN, None], + ["192.168.0.1", "192.168.0.2", "192.168.0.3", "192.168.0.4", np.nan, None], name="ip_address_v4", ) ### ipaddressv6 ip_address_v6 = pd.Series( - ["2002:db8::", "2001:db8::", "2001:db8::", "2002:db8::", np.NaN, None], + ["2002:db8::", "2001:db8::", "2001:db8::", "2002:db8::", np.nan, None], name="ip_address_v6", ) diff --git a/tests/unitary/default_setup/model/test_model_metadata.py b/tests/unitary/default_setup/model/test_model_metadata.py index ca2f2b812..1f992939a 100644 --- a/tests/unitary/default_setup/model/test_model_metadata.py +++ b/tests/unitary/default_setup/model/test_model_metadata.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (c) 2021, 2024 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """Unit tests for model metadata module. Includes tests for: @@ -13,35 +13,36 @@ import json from unittest.mock import MagicMock, mock_open, patch +import numpy as np import pytest import yaml -import numpy as np +from oci.data_science.models import Metadata as OciMetadataItem + +from ads.model.datascience_model import ( + CustomerNotificationType, + ModelBackupOperationDetails, + ModelBackupSetting, + ModelRetentionOperationDetails, + ModelRetentionSetting, + SettingStatus, +) from ads.model.model_metadata import ( _METADATA_EMPTY_VALUE, + METADATA_DESCRIPTION_LENGTH_LIMIT, METADATA_SIZE_LIMIT, METADATA_VALUE_LENGTH_LIMIT, - METADATA_DESCRIPTION_LENGTH_LIMIT, - MetadataCustomCategory, Framework, + MetadataCustomCategory, + MetadataDescriptionTooLong, MetadataSizeTooLarge, + MetadataTaxonomyKeys, MetadataValueTooLong, - MetadataDescriptionTooLong, ModelCustomMetadata, ModelCustomMetadataItem, ModelTaxonomyMetadata, ModelTaxonomyMetadataItem, - MetadataTaxonomyKeys, UseCaseType, ) -from ads.model.datascience_model import ( - ModelRetentionSetting, - CustomerNotificationType, - SettingStatus, - ModelBackupSetting, - ModelRetentionOperationDetails, - ModelBackupOperationDetails, -) -from oci.data_science.models import Metadata as OciMetadataItem try: from yaml import CDumper as dumper @@ -409,7 +410,7 @@ def test_to_json_file_success(self): ({"key": "test_key"}, json.dumps({"key": "test_key"})), (None, _METADATA_EMPTY_VALUE), ("", _METADATA_EMPTY_VALUE), - ({"key": np.NaN}, json.dumps({"key": np.NaN}).replace("NaN", "null")), + ({"key": np.nan}, json.dumps({"key": np.nan}).replace("NaN", "null")), ], ) def test__to_oci_metadata(self, test_value, expected_value): diff --git a/tests/unitary/with_extras/feature_engineering/test_feature_plot.py b/tests/unitary/with_extras/feature_engineering/test_feature_plot.py index 094f2ad6b..a57e44d0f 100644 --- a/tests/unitary/with_extras/feature_engineering/test_feature_plot.py +++ b/tests/unitary/with_extras/feature_engineering/test_feature_plot.py @@ -1,18 +1,18 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (c) 2021, 2023 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import numpy as np import matplotlib as mpl +import numpy as np import pandas as pd from sklearn.datasets import load_iris from ads.feature_engineering.utils import ( _is_float, - _str_lat_long_to_point, _plot_gis_scatter, + _str_lat_long_to_point, _to_lat_long, _zip_code, ) @@ -34,16 +34,16 @@ class TestFeaturePlot: ) address_invalid = pd.Series( - ["1 Miller Drive, New York, NY 0987", "", None, np.NaN], name="address" + ["1 Miller Drive, New York, NY 0987", "", None, np.nan], name="address" ) ### boolean - boolean = pd.Series([True, False, True, False, np.NaN, None], name="bool") - boolean_invalid = pd.Series([np.NaN, None], name="bool") + boolean = pd.Series([True, False, True, False, np.nan, None], name="bool") + boolean_invalid = pd.Series([np.nan, None], name="bool") ### constant constant = pd.Series([1, 1, 1, 1, 1], name="constant") - constant_invalid = pd.Series([np.NaN, None], name="constant") + constant_invalid = pd.Series([np.nan, None], name="constant") ### continuous cts = pd.Series([123.32, 23.243, 324.342, np.nan], name="cts") @@ -72,7 +72,7 @@ class TestFeaturePlot: "S", "S", "", - np.NaN, + np.nan, None, ], name="category", @@ -150,14 +150,14 @@ class TestFeaturePlot: "-44.510428,-169.269477", "-56,-166", "", - np.NaN, + np.nan, None, ], name="latlon", ) latlong_invalid = pd.Series( - ["[-56.3344375,-166.407038]", "", np.NaN, None], name="latlon" + ["[-56.3344375,-166.407038]", "", np.nan, None], name="latlon" ) ### string @@ -183,12 +183,12 @@ class TestFeaturePlot: "S", "S", "", - np.NaN, + np.nan, None, ], name="string", ) - string_invalid = pd.Series([123, "", np.NaN, None], name="string") + string_invalid = pd.Series([123, "", np.nan, None], name="string") ### text text = pd.Series( @@ -214,16 +214,16 @@ class TestFeaturePlot: "S", 123, 1.5, - np.NaN, + np.nan, None, ], name="text", ) - text_invalid = pd.Series(["", np.NaN, None], name="text") + text_invalid = pd.Series(["", np.nan, None], name="text") ### zip code - zipcode = pd.Series(["94065", "90210", np.NaN, None], name="zipcode") - zipcode_invalid = pd.Series([94065, "cat", np.NaN, None], name="zipcode") + zipcode = pd.Series(["94065", "90210", np.nan, None], name="zipcode") + zipcode_invalid = pd.Series([94065, "cat", np.nan, None], name="zipcode") zipcode_coord = _zip_code() def test_feature_plot_return_type(self): diff --git a/tests/unitary/with_extras/feature_types/test_feature_types.py b/tests/unitary/with_extras/feature_types/test_feature_types.py index 108b62ef3..3e5c12060 100644 --- a/tests/unitary/with_extras/feature_types/test_feature_types.py +++ b/tests/unitary/with_extras/feature_types/test_feature_types.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (c) 2021, 2023 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os @@ -9,10 +9,13 @@ import nltk import numpy as np import pandas as pd +from oci.ai_language import AIServiceLanguageClient + from ads.common.card_identifier import card_identify from ads.feature_engineering.accessor.dataframe_accessor import ADSDataFrameAccessor from ads.feature_engineering.accessor.series_accessor import ADSSeriesAccessor from ads.feature_engineering.feature_type.address import Address +from ads.feature_engineering.feature_type.adsstring.oci_language import OCILanguage from ads.feature_engineering.feature_type.adsstring.parsers.nltk_parser import ( NLTKParser, ) @@ -34,8 +37,6 @@ from ads.feature_engineering.feature_type.string import String from ads.feature_engineering.feature_type.zip_code import ZipCode from ads.feature_engineering.utils import is_boolean -from oci.ai_language import AIServiceLanguageClient -from ads.feature_engineering.feature_type.adsstring.oci_language import OCILanguage class TestFeatureTypes: @@ -105,7 +106,7 @@ def test_string_type(self): "S", "S", "", - np.NaN, + np.nan, None, ], name="string", @@ -136,7 +137,7 @@ def test_string_type(self): def test_ip_address_v4_type(self): ip_address = pd.Series( - ["192.168.0.1", "2001:db8::", "", np.NaN, None], name="ip_address" + ["192.168.0.1", "2001:db8::", "", np.nan, None], name="ip_address" ) ip_address.ads.feature_type = ["ip_address_v4"] assert ( @@ -146,7 +147,7 @@ def test_ip_address_v4_type(self): def test_ip_address_v6_type(self): ip_address = pd.Series( - ["192.168.0.1", "2001:db8::", "", np.NaN, None], name="ip_address" + ["192.168.0.1", "2001:db8::", "", np.nan, None], name="ip_address" ) ip_address.ads.feature_type = ["ip_address_v6"] assert ( @@ -156,7 +157,7 @@ def test_ip_address_v6_type(self): def test_ip_address_type(self): ip_address = pd.Series( - ["192.168.0.1", "2001:db8::", "", np.NaN, None], name="ip_address" + ["192.168.0.1", "2001:db8::", "", np.nan, None], name="ip_address" ) ip_address.ads.feature_type = ["ip_address"] assert ( @@ -243,7 +244,7 @@ def test_phone_number_type(self): def test_zip_code_type(self): assert ZipCode.name == "zip_code" - zipcode = pd.Series(["94065", 90210, np.NaN, None], name="zipcode") + zipcode = pd.Series(["94065", 90210, np.nan, None], name="zipcode") zipcode.ads.feature_type = ["zip_code"] assert ( From be8f45f7078aea6c98289be38db4a83501d083ea Mon Sep 17 00:00:00 2001 From: Dmitrii Cherkasov Date: Tue, 15 Jul 2025 17:24:23 -0700 Subject: [PATCH 4/4] Fixes unit tests --- ads/common/utils.py | 4 +--- pyproject.toml | 8 ++++++- .../default_setup/common/test_common_utils.py | 8 +++---- .../test_model_framework_sklearn_model.py | 23 +++++++++++++++---- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/ads/common/utils.py b/ads/common/utils.py index 09a11f401..e0226739d 100644 --- a/ads/common/utils.py +++ b/ads/common/utils.py @@ -782,9 +782,7 @@ def default(self, obj): ), ): return int(obj) - elif isinstance( - obj, (np.float_, np.float16, np.float32, np.float64, np.double) - ): + elif isinstance(obj, (np.float16, np.float32, np.float64, np.double)): return float(obj) elif isinstance(obj, (np.ndarray,)): return obj.tolist() diff --git a/pyproject.toml b/pyproject.toml index c560374fe..2cf15bb9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ bds = ["hdfs[kerberos]", "ibis-framework[impala]", "sqlalchemy"] boosted = [ "lightgbm", "xgboost", + "scikit-learn>=1.0,<1.6.0", ] data = [ "datefinder>=0.7.1", @@ -109,7 +110,11 @@ huggingface = [ "transformers", "tf-keras" # Keras 3 installed in py3.11+, but this is not yet supported in Transformers. Need to install the backwards-compatible tf-keras ] -notebook = ["ipython>=7.23.1, <8.0", "ipywidgets~=7.6.3"] +notebook = [ + "ipython>=7.23.1, <8.0", + "ipywidgets~=7.6.3", + "scikit-learn>=1.0,<1.6.0" +] onnx = [ "lightgbm", "onnx>=1.12.0,<=1.15.0; python_version < '3.12'", # v 1.15.0 set base on onnxrutime version and onnx opset support - https://onnxruntime.ai/docs/reference/compatibility.html#onnx-opset-support @@ -123,6 +128,7 @@ onnx = [ "skl2onnx~=1.18.0; python_version >= '3.12'", "tf2onnx", "xgboost<=1.7", + "scikit-learn>=1.0,<1.6.0", ] opctl = [ "conda-pack", diff --git a/tests/unitary/default_setup/common/test_common_utils.py b/tests/unitary/default_setup/common/test_common_utils.py index 045bcbeff..5409d3da0 100644 --- a/tests/unitary/default_setup/common/test_common_utils.py +++ b/tests/unitary/default_setup/common/test_common_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (c) 2021, 2023 Oracle and/or its affiliates. +# Copyright (c) 2021, 2025 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os @@ -10,11 +10,12 @@ import sys import tempfile from datetime import datetime -from unittest.mock import MagicMock, patch, ANY +from unittest.mock import ANY, MagicMock, patch import numpy as np import pandas as pd import pytest +from oci import object_storage from sklearn import datasets from sklearn.model_selection import train_test_split @@ -32,7 +33,6 @@ remove_file, upload_to_os, ) -from oci import object_storage DEFAULT_SIGNER_CONF = {"config": {}} @@ -197,7 +197,7 @@ def test_json_converter(self): json = self.json_conv.default(int_a) assert json == 10 - float_b = np.float_(10.0) + float_b = np.float64(10.0) json = self.json_conv.default(float_b) assert json == 10.0 diff --git a/tests/unitary/with_extras/model/test_model_framework_sklearn_model.py b/tests/unitary/with_extras/model/test_model_framework_sklearn_model.py index ce72f4ecf..7a9d0e0f8 100644 --- a/tests/unitary/with_extras/model/test_model_framework_sklearn_model.py +++ b/tests/unitary/with_extras/model/test_model_framework_sklearn_model.py @@ -10,15 +10,15 @@ import base64 import os import shutil +import sys from io import BytesIO +import mock import numpy as np import onnx import onnxruntime as rt import pandas as pd -import pytest, sys -from ads.model.framework.sklearn_model import SklearnModel -from ads.model.serde.model_serializer import SklearnOnnxModelSerializer +import pytest from joblib import load from lightgbm import LGBMClassifier, LGBMRegressor from skl2onnx.common.data_types import ( @@ -34,7 +34,9 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from xgboost import XGBClassifier -import sys, mock + +from ads.model.framework.sklearn_model import SklearnModel +from ads.model.serde.model_serializer import SklearnOnnxModelSerializer tmp_model_dir = "/tmp/model" @@ -336,7 +338,12 @@ def test_serialize_and_load_model_as_onnx_xgboost_pipeline(self): self.xgb_pipe.artifact_dir = target_dir target_path = os.path.join(target_dir, "test_pipeline.onnx") self.xgb_pipe.model_file_name = "test_pipeline.onnx" - self.xgb_pipe.serialize_model(as_onnx=True, X_sample=self.X_iris[:10]) + + initial_types = [("input", FloatTensorType([None, self.X_iris.shape[1]]))] + + self.xgb_pipe.serialize_model( + as_onnx=True, X_sample=self.X_iris[:10], initial_types=initial_types + ) assert os.path.exists(target_path) sess = rt.InferenceSession(target_path) @@ -543,3 +550,9 @@ def test_to_onnx_with_skl2onnx_uninstalled(self): def teardown_class(cls): shutil.rmtree(tmp_model_dir, ignore_errors=True) + + def teardown_class(cls): + shutil.rmtree(tmp_model_dir, ignore_errors=True) + + def teardown_class(cls): + shutil.rmtree(tmp_model_dir, ignore_errors=True)