From ae8cc9f4d8c10ef973c8b9ae20fa546e36cd19ea Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 23 May 2025 16:59:56 -0400 Subject: [PATCH 01/20] Remove stubs --- docs/source/matlab/symktensor.rst | 8 -------- docs/source/matlab/symtensor.rst | 8 -------- pyttb/__init__.py | 6 ------ pyttb/sptensor3.py | 12 ------------ pyttb/symktensor.py | 12 ------------ pyttb/symtensor.py | 12 ------------ tests/test_sptensor3.py | 13 ------------- tests/test_symktensor.py | 13 ------------- tests/test_symtensor.py | 13 ------------- 9 files changed, 97 deletions(-) delete mode 100644 docs/source/matlab/symktensor.rst delete mode 100644 docs/source/matlab/symtensor.rst delete mode 100644 pyttb/sptensor3.py delete mode 100644 pyttb/symktensor.py delete mode 100644 pyttb/symtensor.py delete mode 100644 tests/test_sptensor3.py delete mode 100644 tests/test_symktensor.py delete mode 100644 tests/test_symtensor.py diff --git a/docs/source/matlab/symktensor.rst b/docs/source/matlab/symktensor.rst deleted file mode 100644 index 19e215a8..00000000 --- a/docs/source/matlab/symktensor.rst +++ /dev/null @@ -1,8 +0,0 @@ -``symktensor`` --------------------- - -Data members -^^^^^^^^^^^^ - -Methods -^^^^^^^ \ No newline at end of file diff --git a/docs/source/matlab/symtensor.rst b/docs/source/matlab/symtensor.rst deleted file mode 100644 index 8d673c32..00000000 --- a/docs/source/matlab/symtensor.rst +++ /dev/null @@ -1,8 +0,0 @@ -``symtensor`` -------------------- - -Data members -^^^^^^^^^^^^ - -Methods -^^^^^^^ \ No newline at end of file diff --git a/pyttb/__init__.py b/pyttb/__init__.py index dbec0889..87a70514 100644 --- a/pyttb/__init__.py +++ b/pyttb/__init__.py @@ -22,10 +22,7 @@ from pyttb.matlab import matlab_support from pyttb.sptenmat import sptenmat from pyttb.sptensor import sptendiag, sptenrand, sptensor -from pyttb.sptensor3 import sptensor3 from pyttb.sumtensor import sumtensor -from pyttb.symktensor import symktensor 
-from pyttb.symtensor import symtensor from pyttb.tenmat import tenmat from pyttb.tensor import tendiag, teneye, tenones, tenrand, tensor, tenzeros from pyttb.ttensor import ttensor @@ -55,10 +52,7 @@ def ignore_warnings(ignore=True): sptendiag.__name__, sptenrand.__name__, sptensor.__name__, - sptensor3.__name__, sumtensor.__name__, - symktensor.__name__, - symtensor.__name__, teneye.__name__, tenmat.__name__, tendiag.__name__, diff --git a/pyttb/sptensor3.py b/pyttb/sptensor3.py deleted file mode 100644 index 2d469b06..00000000 --- a/pyttb/sptensor3.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Sparse Tensor 3 Class Placeholder.""" - -# Copyright 2025 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. - - -class sptensor3: - """A sparse tensor variant.""" - - def __init__(self): - assert False, "SPTENSOR3 class not yet implemented" diff --git a/pyttb/symktensor.py b/pyttb/symktensor.py deleted file mode 100644 index 67a05e26..00000000 --- a/pyttb/symktensor.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Symmetric Kruskal Tensor Class Placeholder.""" - -# Copyright 2025 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. - - -class symktensor: - """Class for symmetric Kruskal tensors (decomposed).""" - - def __init__(self): - assert False, "SYMKTENSOR class not yet implemented" diff --git a/pyttb/symtensor.py b/pyttb/symtensor.py deleted file mode 100644 index bd57e5c0..00000000 --- a/pyttb/symtensor.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Symmetric Tensor Class Placeholder.""" - -# Copyright 2025 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. 
- - -class symtensor: - """Class for storing only unique entries of symmetric tensor.""" - - def __init__(self): - assert False, "SYMTENSOR class not yet implemented" diff --git a/tests/test_sptensor3.py b/tests/test_sptensor3.py deleted file mode 100644 index fd7cd94f..00000000 --- a/tests/test_sptensor3.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. - -import pytest - -import pyttb as ttb - - -def test_sptensor3_initialization_empty(): - with pytest.raises(AssertionError) as excinfo: - ttb.sptensor3() - assert "SPTENSOR3 class not yet implemented" in str(excinfo) diff --git a/tests/test_symktensor.py b/tests/test_symktensor.py deleted file mode 100644 index 0265d6a7..00000000 --- a/tests/test_symktensor.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. - -import pytest - -import pyttb as ttb - - -def test_symktensor_initialization_empty(): - with pytest.raises(AssertionError) as excinfo: - ttb.symktensor() - assert "SYMKTENSOR class not yet implemented" in str(excinfo) diff --git a/tests/test_symtensor.py b/tests/test_symtensor.py deleted file mode 100644 index 5ee45bcf..00000000 --- a/tests/test_symtensor.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 National Technology & Engineering Solutions of Sandia, -# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the -# U.S. Government retains certain rights in this software. 
- -import pytest - -import pyttb as ttb - - -def test_symtensor_initialization_empty(): - with pytest.raises(AssertionError) as excinfo: - ttb.symtensor() - assert "SYMTENSOR class not yet implemented" in str(excinfo) From fb6f006bf0be37dc5efceaf6123710c821677b89 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 23 May 2025 17:07:08 -0400 Subject: [PATCH 02/20] Fix rst keyword typo --- pyttb/sptenmat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyttb/sptenmat.py b/pyttb/sptenmat.py index 8670da5d..9a0a143c 100644 --- a/pyttb/sptenmat.py +++ b/pyttb/sptenmat.py @@ -40,7 +40,7 @@ def __init__( # noqa: PLR0913 and values (vals) along with the mappings of the row (rdims) and column indices (cdims) and the shape of the original tensor (tshape). - If you already have an sparse tensor see :method:`pyttb.sptensor.to_sptenmat`. + If you already have an sparse tensor see :meth:`pyttb.sptensor.to_sptenmat`. Parameters ---------- From c6e091e611037249a75b3a9c09b98d7de618f4e3 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 23 May 2025 17:07:37 -0400 Subject: [PATCH 03/20] Handle numerical precision error seen locally --- pyttb/ktensor.py | 27 ++++++++++++++++++--------- tests/test_ktensor.py | 16 ++++++++++------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/pyttb/ktensor.py b/pyttb/ktensor.py index 11ed3f19..a2637f8b 100644 --- a/pyttb/ktensor.py +++ b/pyttb/ktensor.py @@ -1606,7 +1606,7 @@ def score( component :class:`pyttb.ktensor` instances that have been normalized so that their weights are `self.weights` and `other.weights`, and their factor matrices are single column vectors containing [a1,a2,...,an] and - [b1,b2,...bn], rescpetively, then the score is defined as + [b1,b2,...bn], respectively, then the score is defined as score = penalty * (a1.T*b1) * (a2.T*b2) * ... 
* (an.T*bn), @@ -1653,23 +1653,31 @@ def score( Create two :class:`pyttb.ktensor` instances and compute the score between them: - >>> factors = [np.ones((3, 3)), np.ones((4, 3)), np.ones((5, 3))] + >>> factors = [ + ... np.ones((3, 3)) + 0.1, + ... np.ones((4, 3)) + 0.2, + ... np.ones((5, 3)) + 0.3, + ... ] >>> weights = np.array([2.0, 1.0, 3.0]) >>> K = ttb.ktensor(factors, weights) - >>> factors_2 = [np.ones((3, 2)), np.ones((4, 2)), np.ones((5, 2))] + >>> factors_2 = [ + ... np.ones((3, 2)) + 0.1, + ... np.ones((4, 2)) + 0.2, + ... np.ones((5, 2)) + 0.3, + ... ] >>> weights_2 = np.array([2.0, 4.0]) >>> K2 = ttb.ktensor(factors_2, weights_2) >>> score, Kperm, flag, perm = K.score(K2) - >>> print(score) - 0.875 + >>> print(np.isclose(score, 0.875)) + True >>> print(perm) [0 2 1] Compute score without using weights: >>> score, Kperm, flag, perm = K.score(K2, weight_penalty=False) - >>> print(score) - 1.0 + >>> print(np.isclose(score, 1.0)) + True >>> print(perm) [0 1 2] """ @@ -1733,8 +1741,9 @@ def score( best_perm = -1 * np.ones((RA), dtype=int) best_score = 0.0 for _ in range(RB): - idx = np.argmax(C.reshape(prod(C.shape), order=self.order)) - ij = tt_ind2sub((RA, RB), np.array(idx)) + flatten_C = C.reshape(prod(C.shape), order=self.order) + idx = np.argmax(flatten_C) + ij = tt_ind2sub((RA, RB), np.array(idx, dtype=int), order=self.order) best_score = best_score + C[ij[0], ij[1]] C[ij[0], :] = -10 C[:, ij[1]] = -10 diff --git a/tests/test_ktensor.py b/tests/test_ktensor.py index 6c560c26..6abda068 100644 --- a/tests/test_ktensor.py +++ b/tests/test_ktensor.py @@ -779,23 +779,27 @@ def test_ktensor_redistribute(sample_ktensor_2way): def test_ktensor_score(): A = ttb.ktensor( - [np.ones((3, 3)), np.ones((4, 3)), np.ones((5, 3))], np.array([2.0, 1.0, 3.0]) + [np.ones((3, 3)) + 0.1, np.ones((4, 3)) + 0.2, np.ones((5, 3)) + 0.3], + np.array([2.0, 1.0, 3.0]), ) B = ttb.ktensor( - [np.ones((3, 2)), np.ones((4, 2)), np.ones((5, 2))], np.array([2.0, 4.0]) + [np.ones((3, 
2)) + 0.1, np.ones((4, 2)) + 0.2, np.ones((5, 2)) + 0.3], + np.array([2.0, 4.0]), ) + A_norm = A.copy().normalize() + # defaults score, Aperm, flag, best_perm = A.score(B) - assert score == 0.875 - assert np.allclose(Aperm.weights, np.array([15.49193338, 23.23790008, 7.74596669])) + assert np.isclose(score, 0.875) + assert np.allclose(Aperm.weights, A_norm.weights[best_perm]) assert flag assert np.array_equal(best_perm, np.array([0, 2, 1])) # compare just factor matrices (i.e., do not use weights) score, Aperm, flag, best_perm = A.score(B, weight_penalty=False) - assert score == 1.0 - assert np.allclose(Aperm.weights, np.array([15.49193338, 7.74596669, 23.23790008])) + assert np.isclose(score, 1.0) + assert np.allclose(Aperm.weights, A_norm.weights[best_perm]) assert not flag assert np.array_equal(best_perm, np.array([0, 1, 2])) From cc17f7c91c20099eee08966211018d05bd3ba411 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 24 May 2025 17:44:56 -0400 Subject: [PATCH 04/20] Add coverage for our missed doc components --- docs/source/index.rst | 5 +++++ docs/source/io.rst | 6 ++++++ docs/source/matlab/additional_support.rst | 4 ++++ docs/source/pyttb_utils.rst | 6 ++++-- docs/source/reference.rst | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 docs/source/io.rst create mode 100644 docs/source/matlab/additional_support.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 930010c1..3183c0a8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,8 +47,13 @@ algorithms for computing low-rank tensor models. decompositions such as Poisson Tensor Factorization via alternating Poisson regression. +- `IO`_ + + Storing and retrieving tensors from disk. + .. _Tensor Classes: tensor_classes.html .. _Algorithms: algorithms.html +.. 
_IO: io.html Getting Started diff --git a/docs/source/io.rst b/docs/source/io.rst new file mode 100644 index 00000000..10ccc1fc --- /dev/null +++ b/docs/source/io.rst @@ -0,0 +1,6 @@ +Input/Output +------------ +Storing or reading tensors from disk. + +.. autofunction:: pyttb.import_data.import_data +.. autofunction:: pyttb.export_data.export_data \ No newline at end of file diff --git a/docs/source/matlab/additional_support.rst b/docs/source/matlab/additional_support.rst new file mode 100644 index 00000000..f1df8eab --- /dev/null +++ b/docs/source/matlab/additional_support.rst @@ -0,0 +1,4 @@ +Additional Utilities For MATLAB User Transition +----------------------------------------------- + +.. autofunction:: pyttb.matlab.matlab_support.matlab_print \ No newline at end of file diff --git a/docs/source/pyttb_utils.rst b/docs/source/pyttb_utils.rst index 7cc02f4c..0627df89 100644 --- a/docs/source/pyttb_utils.rst +++ b/docs/source/pyttb_utils.rst @@ -1,5 +1,7 @@ -Helper Functions (:mod:`pyttb_utils`) -------------------------------------- +Helper Functions (:mod:`pyttb_utils`, :mod:`khatrirao`) +-------------------------------------------------------- + +.. autofunction:: pyttb.khatrirao.khatrirao .. 
automodule:: pyttb.pyttb_utils :members: diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 46cab023..8d7529fc 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -6,3 +6,4 @@ Reference (:mod:`pyttb`) tensor_classes.rst algorithms.rst + io.rst From 54c12c6f7bf31a857b0aff3a7cabb122919606bb Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 24 May 2025 18:04:55 -0400 Subject: [PATCH 05/20] Add python 3.13 but only do coveralls for oldest supported --- .github/workflows/regression-tests.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/regression-tests.yml b/.github/workflows/regression-tests.yml index 72dda521..68c8f436 100644 --- a/.github/workflows/regression-tests.yml +++ b/.github/workflows/regression-tests.yml @@ -15,8 +15,8 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + matrix: # Keep these in ascending order for automagic with coverage + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 @@ -32,13 +32,18 @@ jobs: python -c "import pyttb" - name: Install dev dependencies run: | - python -m pip install --upgrade coverage coveralls sphinx_rtd_theme + python -m pip install --upgrade coverage sphinx_rtd_theme pip install ".[dev]" - name: Run tests run: | coverage run --source pyttb -m pytest tests/ coverage report + - name: Add coveralls dependencies + if: strategy.job-index == 0 + run: | + python -m pip install --upgrade coveralls - name: Upload coverage to Coveralls + if: strategy.job-index == 0 uses: coverallsapp/github-action@v2 #env: # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From ab81d8c2968eafcf851201756c133374ebaae7a2 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 30 May 2025 18:37:40 -0400 Subject: [PATCH 06/20] Bulk on non-missing data support 
--- pyproject.toml | 2 + pyttb/create_problem.py | 207 +++++++++++++++++++++++++++++++++++ tests/test_create_problem.py | 50 +++++++++ 3 files changed, 259 insertions(+) create mode 100644 pyttb/create_problem.py create mode 100644 tests/test_create_problem.py diff --git a/pyproject.toml b/pyproject.toml index 45604a84..41261b93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,8 @@ ignore = [ "B011", # There is ongoing discussion about logging/warning etc "B028", + # Personal preference on magic method + "D105", ] [tool.ruff.lint.pydocstyle] convention = "numpy" diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py new file mode 100644 index 00000000..0eae152c --- /dev/null +++ b/pyttb/create_problem.py @@ -0,0 +1,207 @@ +"""Create test problems for tensor factorizations.""" + +from dataclasses import dataclass +from typing import Callable, Optional, Tuple, Union, cast, overload + +import numpy as np + +import pyttb as ttb +from pyttb.pyttb_utils import Shape + +solution_generator = Callable[[Tuple[int, ...]], np.ndarray] + + +def randn(shape: Tuple[int, ...]) -> np.ndarray: + """Stub for MATLAB randn. + + TODO move somewhere shareable. 
+ """ + return np.random.normal(0, 1, size=shape) + + +@dataclass +class BaseProblem: + """Parameters general to all solutions.""" + + shape: Shape + factor_generator: solution_generator = randn + symmetric: Optional[list[Tuple[int, int]]] = None + num_factors: Union[int, list[int], None] = None + + def __post_init__(self): + self.shape = ttb.pyttb_utils.parse_shape(self.shape) + + +@dataclass +class CPProblem(BaseProblem): + """Parameters specifying CP Solutions.""" + + num_factors: int = 2 + # TODO probably rename weight generator for consistency + lambda_generator: solution_generator = np.random.random + + +@dataclass +class TuckerProblem(BaseProblem): + """Parameters specifying Tucker Solutions.""" + + # TODO post_init set to [2, 2, 2] + num_factors: Optional[list[int]] = None + core_generator: solution_generator = randn + + def __post_init__(self): + super().__post_init__() + self.num_factors = self.num_factors or [2, 2, 2] + + +@dataclass +class DataParams: + """Parameters to control data quality.""" + + noise: float = 0.10 + # TODO handle weird sparse_generation option + + def __post_init__( + self, + ): + if not 0.0 <= self.noise <= 1.0: + raise ValueError(f"Noise must be in [0,1] but got {self.noise}") + + +@dataclass +class MissingData: + """Parameters to control missing data.""" + + missing_ratio: float = 0.0 + sparse_model: bool = False + # TODO add spare pattern tensor + + def __post_init__(self): + if not 0.0 <= self.missing_ratio <= 1.0: + raise ValueError( + f"Missing ratio must be in [0,1] but got {self.missing_ratio}" + ) + + if self.sparse_model and self.missing_ratio > 0.0: + raise ValueError("Can't combine missing data and sparse generation.") + + def has_missing(self) -> bool: + """Check if any form of missing data is requested.""" + return self.sparse_model or self.missing_ratio > 0.0 + + def raise_symmetric(self): + """Raise for unsupported symmetry request.""" + if self.missing_ratio: + raise ValueError("Can't generate a symmetric problem 
with missing data.") + if self.sparse_model: + raise ValueError("Can't generate sparse symmetric problem.") + + +@overload +def create_problem( + problem_params: CPProblem, missing_params: MissingData, data_params: DataParams +) -> Tuple[ttb.ktensor, ttb.tensor]: ... # pragma: no cover see coveragepy/issues/970 + + +@overload +def create_problem( + problem_params: TuckerProblem, missing_params: MissingData, data_params: DataParams +) -> Tuple[ttb.ttensor, ttb.tensor]: ... # pragma: no cover see coveragepy/issues/970 + + +def create_problem( + problem_params: Union[CPProblem, TuckerProblem], + missing_params: MissingData, + data_params: DataParams, +) -> Tuple[Union[ttb.ktensor, ttb.ttensor], ttb.tensor]: + """Generate a problem and solution.""" + if problem_params.symmetric is not None: + missing_params.raise_symmetric() + + solution = generate_solution(problem_params) + + if missing_params.sparse_model: + raise NotImplementedError("Sparse generation not yet supported") + + data = generate_data(solution, problem_params, data_params) + return solution, data + + +def generate_solution_factors(base_params: BaseProblem) -> list[np.ndarray]: + """Generate the factor matrices for either type of solution.""" + # Get shape of final tensor + shape = cast(Tuple[int, ...], base_params.shape) + + # Get shape of factors + if isinstance(base_params.num_factors, int): + nfactors = [base_params.num_factors] * len(shape) + elif base_params.num_factors is not None: + nfactors = base_params.num_factors + else: + raise ValueError("Num_factors shouldn't be none.") + if len(nfactors) != len(shape): + raise ValueError( + "Num_factors should be the same dimensions as shape but got" + f"{nfactors} and {shape}" + ) + factor_matrices = [] + for shape_i, nfactors_i in zip(shape, nfactors): + factor_matrices.append(base_params.factor_generator((shape_i, nfactors_i))) + + if base_params.symmetric is not None: + for grp in base_params.symmetric: + # TODO see if this can be a single indexed op + 
for j in range(1, len(grp)): + factor_matrices[grp[j]] = factor_matrices[grp[0]] + + return factor_matrices + + +@overload +def generate_solution( + problem_params: TuckerProblem, +) -> ttb.ttensor: ... + + +@overload +def generate_solution( + problem_params: CPProblem, +) -> ttb.ktensor: ... + + +def generate_solution( + problem_params: Union[CPProblem, TuckerProblem], +) -> Union[ttb.ktensor, ttb.ttensor]: + """Generate problem solution.""" + factor_matrices = generate_solution_factors(problem_params) + # Create final model + if isinstance(problem_params, TuckerProblem): + nfactors = cast(list[int], problem_params.num_factors) + core = ttb.tensor(problem_params.core_generator(tuple(nfactors))) + return ttb.ttensor(core, factor_matrices) + elif isinstance(problem_params, CPProblem): + weights = problem_params.lambda_generator((problem_params.num_factors,)) + return ttb.ktensor(factor_matrices, weights) + raise ValueError(f"Unsupported problem parameter type: {type(problem_params)=}") + + +def generate_data( + solution: Union[ttb.ktensor, ttb.ttensor], + problem_params: BaseProblem, + data_params: DataParams, +) -> ttb.tensor: + """Generate problem data.""" + shape = solution.shape + # TODO handle the sparsity pattern + # TODO don't we already have a randn tensor method? 
+ Rdm = ttb.tensor(randn(shape)) + Z = solution.full() + if problem_params.symmetric is not None: + # TODO Note in MATLAB code to follow up + Rdm = Rdm.symmetrize(np.array(problem_params.symmetric)) + + D = Z + data_params.noise * Z.norm() * Rdm / Rdm.norm() + # Make sure the final result is definitely symmetric + if problem_params.symmetric is not None: + D = D.symmetrize(np.array(problem_params.symmetric)) + return D diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py new file mode 100644 index 00000000..3a7aac18 --- /dev/null +++ b/tests/test_create_problem.py @@ -0,0 +1,50 @@ +import pyttb as ttb +from pyttb.create_problem import ( + CPProblem, + DataParams, + TuckerProblem, + generate_data, + generate_solution, +) + + +def test_generate_solution_cp(): + # Smoke test with defaults + shape = (2, 2, 2) + cp_params = CPProblem(shape) + model = generate_solution(cp_params) + assert isinstance(model, ttb.ktensor) + assert model.shape == shape + + # TODO could test with different generators and enforce that they actually get used + + +def test_generate_data_cp(): + # Smoke test with defaults + shape = (2, 2, 2) + cp_params = CPProblem(shape) + model = generate_solution(cp_params) + data = generate_data(model, cp_params, data_params=DataParams()) + assert isinstance(data, ttb.tensor) + assert data.shape == model.shape + + +def test_generate_solution_tucker(): + # Smoke test with defaults + shape = (2, 2, 2) + tucker_params = TuckerProblem(shape) + model = generate_solution(tucker_params) + assert isinstance(model, ttb.ttensor) + assert model.shape == shape + + # TODO could test with different generators and enforce that they actually get used + + +def test_generate_data_tucker(): + # Smoke test with defaults + shape = (2, 2, 2) + tucker_params = TuckerProblem(shape) + model = generate_solution(tucker_params) + data = generate_data(model, tucker_params, data_params=DataParams()) + assert isinstance(data, ttb.tensor) + assert data.shape == 
model.shape From 476eab7f50d900fe45e36e0b91be4c2535a9f1d7 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 6 Jun 2025 08:00:26 -0400 Subject: [PATCH 07/20] Small cleanup and improv some testing --- pyttb/create_problem.py | 4 +-- tests/test_create_problem.py | 48 ++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index 0eae152c..d6c846f3 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -38,7 +38,7 @@ class CPProblem(BaseProblem): num_factors: int = 2 # TODO probably rename weight generator for consistency - lambda_generator: solution_generator = np.random.random + weight_generator: solution_generator = np.random.random @dataclass @@ -180,7 +180,7 @@ def generate_solution( core = ttb.tensor(problem_params.core_generator(tuple(nfactors))) return ttb.ttensor(core, factor_matrices) elif isinstance(problem_params, CPProblem): - weights = problem_params.lambda_generator((problem_params.num_factors,)) + weights = problem_params.weight_generator((problem_params.num_factors,)) return ttb.ktensor(factor_matrices, weights) raise ValueError(f"Unsupported problem parameter type: {type(problem_params)=}") diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index 3a7aac18..843ebed8 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -1,13 +1,50 @@ +import pytest + import pyttb as ttb from pyttb.create_problem import ( CPProblem, DataParams, + MissingData, TuckerProblem, + create_problem, generate_data, generate_solution, ) +class TestDataclasses: + def test_dataparams(self): + with pytest.raises(ValueError): + number_larger_than_one = 2.0 + DataParams(noise=number_larger_than_one) + with pytest.raises(ValueError): + number_less_than_zero = -2.0 + DataParams(noise=number_less_than_zero) + + def test_missingdata(self): + with pytest.raises(ValueError): + 
number_larger_than_one = 2.0 + MissingData(missing_ratio=number_larger_than_one) + with pytest.raises(ValueError): + number_less_than_zero = -2.0 + MissingData(missing_ratio=number_less_than_zero) + with pytest.raises(ValueError): + non_zero = 0.5 + MissingData(missing_ratio=non_zero, sparse_model=True) + + missing_params = MissingData(missing_ratio=0.1) + assert missing_params.has_missing() + with pytest.raises(ValueError): + missing_params.raise_symmetric() + missing_params = MissingData(sparse_model=True) + assert missing_params.has_missing() + with pytest.raises(ValueError): + missing_params.raise_symmetric() + missing_params = MissingData() + assert not missing_params.has_missing() + missing_params.raise_symmetric() + + def test_generate_solution_cp(): # Smoke test with defaults shape = (2, 2, 2) @@ -48,3 +85,14 @@ def test_generate_data_tucker(): data = generate_data(model, tucker_params, data_params=DataParams()) assert isinstance(data, ttb.tensor) assert data.shape == model.shape + + +def test_create_problem_smoke(): + shape = (2, 2, 2) + cp_params = CPProblem(shape) + data_params = DataParams() + missing_params = MissingData() + soln, data = create_problem(cp_params, missing_params, data_params) + assert soln.full().shape == data.shape + + # TODO hit edge cases and symmetric From ff2311be52b3d388548a50afe577b1b9bc3d4924 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 6 Jun 2025 14:16:03 -0400 Subject: [PATCH 08/20] Add basic support for sparse_generation --- pyttb/create_problem.py | 89 ++++++++++++++++++++++++++++++++++-- tests/test_create_problem.py | 14 ++++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index d6c846f3..af146a9c 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -1,9 +1,12 @@ """Create test problems for tensor factorizations.""" +import logging +import math from dataclasses import dataclass from 
typing import Callable, Optional, Tuple, Union, cast, overload import numpy as np +from numpy_groupies import aggregate as accumarray import pyttb as ttb from pyttb.pyttb_utils import Shape @@ -39,6 +42,7 @@ class CPProblem(BaseProblem): num_factors: int = 2 # TODO probably rename weight generator for consistency weight_generator: solution_generator = np.random.random + sparse_generation: Optional[float] = None @dataclass @@ -100,7 +104,9 @@ def raise_symmetric(self): @overload def create_problem( problem_params: CPProblem, missing_params: MissingData, data_params: DataParams -) -> Tuple[ttb.ktensor, ttb.tensor]: ... # pragma: no cover see coveragepy/issues/970 +) -> Tuple[ + ttb.ktensor, Union[ttb.tensor, ttb.sptensor] +]: ... # pragma: no cover see coveragepy/issues/970 @overload @@ -113,7 +119,7 @@ def create_problem( problem_params: Union[CPProblem, TuckerProblem], missing_params: MissingData, data_params: DataParams, -) -> Tuple[Union[ttb.ktensor, ttb.ttensor], ttb.tensor]: +) -> Tuple[Union[ttb.ktensor, ttb.ttensor], Union[ttb.tensor, ttb.sptensor]]: """Generate a problem and solution.""" if problem_params.symmetric is not None: missing_params.raise_symmetric() @@ -123,7 +129,15 @@ def create_problem( if missing_params.sparse_model: raise NotImplementedError("Sparse generation not yet supported") - data = generate_data(solution, problem_params, data_params) + data: Union[ttb.tensor, ttb.sptensor] + if ( + isinstance(problem_params, CPProblem) + and problem_params.sparse_generation is not None + ): + solution = cast(ttb.ktensor, solution) + solution, data = generate_data_sparse(solution, problem_params, data_params) + else: + data = generate_data(solution, problem_params, data_params) return solution, data @@ -205,3 +219,72 @@ def generate_data( if problem_params.symmetric is not None: D = D.symmetrize(np.array(problem_params.symmetric)) return D + + +def prosample(nsamples: int, prob: np.ndarray) -> np.ndarray: + """Proportional Sampling.""" + bins = 
np.minimum(np.cumsum(np.array([0, *prob])), 1) + bins[-1] = 1 + indices = np.digitize(np.random.random(nsamples), bins=bins) + return indices - 1 + + +def generate_data_sparse( + solution: ttb.ktensor, problem_params: CPProblem, data_params: DataParams +) -> Tuple[ttb.ktensor, ttb.sptensor]: + """Generate sparse CP data from a given solution.""" + # Error check on solution + if np.any(solution.weights < 0): + raise ValueError("All weights must be nonnegative.") + if any(np.any(factor < 0) for factor in solution.factor_matrices): + raise ValueError("All factor matrices must be nonnegative.") + if problem_params.symmetric is not None: + logging.warning("Summetric constraints have been ignored.") + if problem_params.sparse_generation is None: + raise ValueError("Cannot generate sparse data without sparse_generation set.") + + # Convert solution to probability tensor + P = solution.normalize(mode=0) + eta = np.sum(P.weights) + P.weights /= eta + + # Determine how many samples per component + nedges = problem_params.sparse_generation + if nedges < 1: + nedges = np.round(nedges * math.prod(P.shape)).astype(int) + nedges = int(nedges) + nd = P.ndims + nc = P.ncomponents + csample = prosample(nedges, P.weights) + # TODO check this + csums = accumarray(csample, 1, size=nc) + + # Determine the subscripts for each randomly sampled entry + shape = solution.shape + subs: list[np.ndarray] = [] + for c in range(nc): + nsample = csums[c] + if nsample == 0: + continue + subs.append(np.zeros((nsample, nd), dtype=int)) + for d in range(nd): + subs[-1][:, d] = prosample(nsample, P.factor_matrices[d][:, c]) + # TODO could sum csums and allocate in place with slicing + allsubs = np.vstack(subs) + # Assemble final tensor. Note that duplicates are summed. + # TODO should we have sptenones for purposes like this? 
+ Z = ttb.sptensor( + allsubs, + np.ones( + len(allsubs), + ), + shape=shape, + ) + + # Rescale S so that it is proportional to the number of edges inserted + solution = P + solution.weights *= nedges + + # TODO no noise introduced in this special case in MATLAB + + return solution, Z diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index 843ebed8..32ff943e 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pyttb as ttb @@ -96,3 +97,16 @@ def test_create_problem_smoke(): assert soln.full().shape == data.shape # TODO hit edge cases and symmetric + + +def test_create_problem_smoke_sparse(): + shape = (2, 2, 2) + cp_params = CPProblem( + shape, sparse_generation=0.99, factor_generator=np.random.random + ) + data_params = DataParams() + missing_params = MissingData() + soln, data = create_problem(cp_params, missing_params, data_params) + assert soln.full().shape == data.shape + + # TODO hit edge cases and symmetric From 12cf0679fcec8c183467c3feba7376dd00726409 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Mon, 9 Jun 2025 07:32:26 -0400 Subject: [PATCH 09/20] Fix a few comments --- pyttb/create_problem.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index af146a9c..312217dc 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -40,8 +40,8 @@ class CPProblem(BaseProblem): """Parameters specifying CP Solutions.""" num_factors: int = 2 - # TODO probably rename weight generator for consistency weight_generator: solution_generator = np.random.random + # TODO: This is in DataParams in MATLAB, but only works for CP problems sparse_generation: Optional[float] = None @@ -63,7 +63,6 @@ class DataParams: """Parameters to control data quality.""" noise: float = 0.10 - # TODO handle weird sparse_generation option def __post_init__( self, 
@@ -164,7 +163,6 @@ def generate_solution_factors(base_params: BaseProblem) -> list[np.ndarray]: if base_params.symmetric is not None: for grp in base_params.symmetric: - # TODO see if this can be a single indexed op for j in range(1, len(grp)): factor_matrices[grp[j]] = factor_matrices[grp[0]] From 8cd301ae22bbe4e9f28f31d3d1620acb364dea63 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Mon, 16 Jun 2025 08:39:41 -0400 Subject: [PATCH 10/20] Minor improvement on collapse typing --- pyttb/sptensor.py | 16 ++++++++++++++++ pyttb/tensor.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pyttb/sptensor.py b/pyttb/sptensor.py index 942a212e..2eddee5b 100644 --- a/pyttb/sptensor.py +++ b/pyttb/sptensor.py @@ -438,6 +438,20 @@ def allsubs(self) -> np.ndarray: return s.astype(int) + @overload + def collapse( + self, + dims: None, + function_handle: Callable[[np.ndarray], Union[float, np.ndarray]], + ) -> float: ... # pragma: no cover see coveragepy/issues/970 + + @overload + def collapse( + self, + dims: OneDArray, + function_handle: Callable[[np.ndarray], Union[float, np.ndarray]] = sum, + ) -> Union[np.ndarray, sptensor]: ... # pragma: no cover see coveragepy/issues/970 + def collapse( self, dims: Optional[OneDArray] = None, @@ -503,6 +517,8 @@ def collapse( size=newsize[0], func=function_handle, ) + # TODO think about if this makes sense + # complicates return typing return np.zeros((newsize[0],)) # Create Result diff --git a/pyttb/tensor.py b/pyttb/tensor.py index 1fe81dd3..1c2e8aa8 100644 --- a/pyttb/tensor.py +++ b/pyttb/tensor.py @@ -311,6 +311,20 @@ def __deepcopy__(self, memo): """Return deep copy of this tensor.""" return self.copy() + @overload + def collapse( + self, + dims: None, + fun: Callable[[np.ndarray], Union[float, np.ndarray]], + ) -> float: ... 
+ + @overload + def collapse( + self, + dims: OneDArray, + fun: Callable[[np.ndarray], Union[float, np.ndarray]] = np.sum, + ) -> Union[np.ndarray, tensor]: ... + def collapse( self, dims: Optional[OneDArray] = None, @@ -382,6 +396,8 @@ def collapse( Min value: -0.977277879876411 """ if self.data.size == 0: + # TODO verify this is the only thing that returns np array + # and remove return np.array([], order=self.order) if dims is None: From 60339247a905b55cd6dfbb9949494479974ac2ca Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Mon, 16 Jun 2025 10:00:22 -0400 Subject: [PATCH 11/20] Preliminary implementation for missing data generation, minimal testing --- pyttb/create_problem.py | 179 +++++++++++++++++++++++++++++++---- tests/test_create_problem.py | 18 +++- 2 files changed, 174 insertions(+), 23 deletions(-) diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index 312217dc..0cd19f65 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -9,7 +9,7 @@ from numpy_groupies import aggregate as accumarray import pyttb as ttb -from pyttb.pyttb_utils import Shape +from pyttb.pyttb_utils import Shape, parse_shape solution_generator = Callable[[Tuple[int, ...]], np.ndarray] @@ -77,20 +77,22 @@ class MissingData: missing_ratio: float = 0.0 sparse_model: bool = False - # TODO add spare pattern tensor + missing_pattern: Optional[Union[ttb.sptensor, ttb.tensor]] = None def __post_init__(self): if not 0.0 <= self.missing_ratio <= 1.0: raise ValueError( f"Missing ratio must be in [0,1] but got {self.missing_ratio}" ) - - if self.sparse_model and self.missing_ratio > 0.0: - raise ValueError("Can't combine missing data and sparse generation.") + if self.missing_ratio > 0.0 and self.missing_pattern is not None: + raise ValueError( + "Can't set ratio and explicit pattern to specify missing data. " + "Select one or the other." 
+ ) def has_missing(self) -> bool: """Check if any form of missing data is requested.""" - return self.sparse_model or self.missing_ratio > 0.0 + return self.missing_ratio > 0.0 or self.missing_pattern is not None def raise_symmetric(self): """Raise for unsupported symmetry request.""" @@ -99,6 +101,107 @@ def raise_symmetric(self): if self.sparse_model: raise ValueError("Can't generate sparse symmetric problem.") + def get_pattern(self, shape: Shape) -> Union[None, ttb.tensor, ttb.sptensor]: + """Generate a tensor pattern of missing data.""" + if self.missing_pattern is not None: + if self.missing_pattern.shape != shape: + raise ValueError( + "Missing pattern and problem shapes are not compatible." + ) + return self.missing_pattern + + if self.missing_ratio == 0.0: + # All usages of this are internal, should we just rule out this situation? + return None + if self.missing_ratio < 0.8 and self.sparse_model: + logging.warning( + "Setting sparse to false because there are" + " fewer than 80% missing elements." + ) + return _create_missing_data_pattern( + shape, self.missing_ratio, self.sparse_model + ) + + +def _create_missing_data_pattern( + shape: Shape, missing_ratio: float, sparse_model: bool = False +) -> Union[ttb.tensor, ttb.sptensor]: + """Create a randomly missing element indicator tensor. + + Creates a binary tensor of specified size with 0's indication missing data + and 1's indicating valid data. Will only return a tensor that has at least + one entry per N-1 dimensional slice. + """ + shape = parse_shape(shape) + ndim = len(shape) + P = math.prod(shape) + Q = math.ceil((1 - missing_ratio) * P) + W: Union[ttb.tensor, ttb.sptensor] + + # Create tensor + ## Keep iterating until tensor is created or we give up. + # TODO: make range configurable? 
+ for _ in range(20): + if sparse_model: + # Start with 50% more than Q random subs + # Note in original matlab to work out expected value of a*Q to guarantee + # Q unique entries + subs = np.unique( + np.floor( + np.random.random((int(np.ceil(1.5 * Q)), len(shape))).dot( + np.diag(shape) + ) + ), + axis=0, + ).astype(int) + # Check if there are too many unique subs + if len(subs) > Q: + # TODO: check if note from matlab still relevant + # Note in original matlab: unique orders the subs and would bias toward + # first subs with lower values, so we sample to cut back + idx = np.random.permutation(subs.shape[0]) + subs = subs[idx[:Q]] + elif subs.shape[0] < Q: + logging.warning( + f"Only generated {subs.shape[0]} of " f"{Q} desired subscripts" + ) + W = ttb.sptensor( + subs, + np.ones( + (len(subs), 1), + ), + shape=shape, + ) + else: + # Compute the linear indices of the missing entries. + idx = np.random.permutation(P) + idx = idx[:Q] + W = ttb.tenzeros(shape) + W[idx] = 1 + # return W + + # Check if W has any empty slices + isokay = True + for n in range(ndim): + all_but_n = np.arange(W.ndims) + all_but_n = np.delete(all_but_n, n) + collapse_W = W.collapse(all_but_n) + if isinstance(collapse_W, np.ndarray): + isokay &= bool(np.all(collapse_W)) + else: + isokay &= bool(np.all(collapse_W.double())) + + # Quit if okay + if isokay: + break + + if not isokay: + raise ValueError( + f"After {iter} iterations, cannot produce a tensor with" + f"{missing_ratio*100} missing data without an empty slice." 
+ ) + return W + @overload def create_problem( @@ -125,16 +228,21 @@ def create_problem( solution = generate_solution(problem_params) - if missing_params.sparse_model: - raise NotImplementedError("Sparse generation not yet supported") - data: Union[ttb.tensor, ttb.sptensor] if ( isinstance(problem_params, CPProblem) and problem_params.sparse_generation is not None ): + if missing_params.has_missing(): + raise ValueError( + f"Can't combine missing data {MissingData.__name__} and " + f" sparse generation {CPProblem.__name__}." + ) solution = cast(ttb.ktensor, solution) solution, data = generate_data_sparse(solution, problem_params, data_params) + elif missing_params.has_missing(): + pattern = missing_params.get_pattern(solution.shape) + data = generate_data(solution, problem_params, data_params, pattern) else: data = generate_data(solution, problem_params, data_params) return solution, data @@ -197,20 +305,55 @@ def generate_solution( raise ValueError(f"Unsupported problem parameter type: {type(problem_params)=}") +@overload +def generate_data( + solution: Union[ttb.ktensor, ttb.ttensor], + problem_params: BaseProblem, + data_params: DataParams, + pattern: Optional[ttb.tensor] = None, +) -> ttb.tensor: ... # pragma: no cover see coveragepy/issues/970 + + +@overload def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], problem_params: BaseProblem, data_params: DataParams, -) -> ttb.tensor: + pattern: ttb.sptensor, +) -> ttb.sptensor: ... # pragma: no cover see coveragepy/issues/970 + + +def generate_data( + solution: Union[ttb.ktensor, ttb.ttensor], + problem_params: BaseProblem, + data_params: DataParams, + pattern: Optional[Union[ttb.tensor, ttb.sptensor]] = None, +) -> Union[ttb.tensor, ttb.sptensor]: """Generate problem data.""" shape = solution.shape - # TODO handle the sparsity pattern - # TODO don't we already have a randn tensor method? 
- Rdm = ttb.tensor(randn(shape)) - Z = solution.full() - if problem_params.symmetric is not None: - # TODO Note in MATLAB code to follow up - Rdm = Rdm.symmetrize(np.array(problem_params.symmetric)) + Rdm: Union[ttb.tensor, ttb.sptensor] + if pattern is not None: + if isinstance(pattern, ttb.sptensor): + Rdm = ttb.sptensor(pattern.subs, randn((pattern.nnz, 1)), pattern.shape) + try: + Z = pattern * solution + except Exception as E: + raise ValueError( + f"{pattern.shape=}, {pattern.subs.shape}, {pattern.vals.shape}" + ) from E + + elif isinstance(pattern, ttb.tensor): + Rdm = pattern * ttb.tensor(randn(shape)) + Z = pattern * solution.full() + else: + raise ValueError(f"Unsupported sparsity pattern of type {type(pattern)}") + else: + # TODO don't we already have a randn tensor method? + Rdm = ttb.tensor(randn(shape)) + Z = solution.full() + if problem_params.symmetric is not None: + # TODO Note in MATLAB code to follow up + Rdm = Rdm.symmetrize(np.array(problem_params.symmetric)) D = Z + data_params.noise * Z.norm() * Rdm / Rdm.norm() # Make sure the final result is definitely symmetric @@ -274,7 +417,7 @@ def generate_data_sparse( Z = ttb.sptensor( allsubs, np.ones( - len(allsubs), + (len(allsubs), 1), ), shape=shape, ) diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index 32ff943e..996c785e 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -29,16 +29,11 @@ def test_missingdata(self): with pytest.raises(ValueError): number_less_than_zero = -2.0 MissingData(missing_ratio=number_less_than_zero) - with pytest.raises(ValueError): - non_zero = 0.5 - MissingData(missing_ratio=non_zero, sparse_model=True) missing_params = MissingData(missing_ratio=0.1) assert missing_params.has_missing() with pytest.raises(ValueError): missing_params.raise_symmetric() - missing_params = MissingData(sparse_model=True) - assert missing_params.has_missing() with pytest.raises(ValueError): missing_params.raise_symmetric() missing_params 
= MissingData() @@ -110,3 +105,16 @@ def test_create_problem_smoke_sparse(): assert soln.full().shape == data.shape # TODO hit edge cases and symmetric + + +def test_create_problem_smoke_missing(): + shape = (4, 5, 6) + cp_params = CPProblem(shape, factor_generator=np.random.random) + data_params = DataParams() + missing_params = MissingData(missing_ratio=0.8) + soln, data = create_problem(cp_params, missing_params, data_params) + assert soln.full().shape == data.shape + + missing_params = MissingData(missing_ratio=0.8, sparse_model=True) + soln, data = create_problem(cp_params, missing_params, data_params) + assert soln.full().shape == data.shape From b014292763d75ed6178c42eb70c4f666024fd5da Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Tue, 17 Jun 2025 12:59:49 -0400 Subject: [PATCH 12/20] Add further smoke tests and tutorial notebook --- .../tutorial/utility_test_problem.ipynb | 674 ++++++++++++++++++ pyttb/create_problem.py | 19 +- tests/test_create_problem.py | 28 + 3 files changed, 712 insertions(+), 9 deletions(-) create mode 100644 docs/source/tutorial/utility_test_problem.ipynb diff --git a/docs/source/tutorial/utility_test_problem.ipynb b/docs/source/tutorial/utility_test_problem.ipynb new file mode 100644 index 00000000..08e51035 --- /dev/null +++ b/docs/source/tutorial/utility_test_problem.ipynb @@ -0,0 +1,674 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f1c6d8db", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Creating Test Problems\n", + "```\n", + "Copyright 2025 National Technology & Engineering Solutions of Sandia,\n", + "LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the\n", + "U.S. 
Government retains certain rights in this software.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "31511b37", + "metadata": {}, + "source": [ + "We demonstrate how to use the `create_problem` function to create test problems for decomposition algorithms. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "afb832c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pyttb as ttb\n", + "from pyttb.create_problem import (\n", + " CPProblem,\n", + " TuckerProblem,\n", + " MissingData,\n", + " DataParams,\n", + " create_problem,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9451a579", + "metadata": {}, + "outputs": [], + "source": [ + "# Set global random seed for reproducibility of this notebook\n", + "import numpy as np\n", + "\n", + "np.random.seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "7771e8fe", + "metadata": {}, + "source": [ + "## Create a CP test problem\n", + "The `create_problem` function generates both the solution (as a `ktensor` for CP) and the test data (as a dense `tensor`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e6191ae4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a problem\n", + "cp_specific_params = CPProblem(shape=(5, 4, 3), num_factors=3)\n", + "data_params = DataParams(noise=0.1)\n", + "no_missing_data = MissingData()\n", + "solution, data = create_problem(cp_specific_params, no_missing_data, data_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8745779d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ktensor of shape (5, 4, 3) with order F\n", + "weights=[0.94416002 0.50183668 0.62395295]\n", + "factor_matrices[0] =\n", + "[[-1.0856306 0.99734545 0.2829785 ]\n", + " [-1.50629471 -0.57860025 1.65143654]\n", + " [-2.42667924 -0.42891263 1.26593626]\n", + " [-0.8667404 -0.67888615 -0.09470897]\n", + " [ 1.49138963 -0.638902 -0.44398196]]\n", + "factor_matrices[1] =\n", + "[[-0.43435128 2.20593008 2.18678609]\n", + " [ 1.0040539 0.3861864 0.73736858]\n", + " [ 1.49073203 -0.93583387 1.17582904]\n", + " [-1.25388067 -0.6377515 0.9071052 ]]\n", + "factor_matrices[2] =\n", + "[[-1.4286807 -0.14006872 -0.8617549 ]\n", + " [-0.25561937 -2.79858911 -1.7715331 ]\n", + " [-0.69987723 0.92746243 -0.17363568]]\n" + ] + } + ], + "source": [ + "# Display the solution\n", + "print(solution)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b0bc3232", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (5, 4, 3) with order F\n", + "data[:, :, 0] =\n", + "[[-1.18990893 1.28446351 2.07235179 -1.87633271]\n", + " [-3.12652349 1.07273265 2.34701048 -3.14030325]\n", + " [-2.81968366 2.67865791 4.10636867 -4.33460199]\n", + " [-0.49910248 1.58553609 1.67667918 -1.4803083 ]\n", + " [ 1.5935628 -1.73784063 -2.7256112 2.76967403]]\n", + "data[:, :, 1] =\n", + "[[-4.02748914 -0.53027464 1.39868896 0.35255157]\n", + " [-2.24482406 -0.51914665 
-2.34027329 -2.45371282]\n", + " [-2.02367801 -0.3794908 -1.16866717 -2.43337295]\n", + " [ 2.46562453 0.78956773 -0.26223999 -0.47003828]\n", + " [ 3.48686179 0.07186695 -1.21278825 0.24950518]]\n", + "data[:, :, 2] =\n", + "[[ 0.84583153 0.55670008 0.42026956 -0.99690908]\n", + " [-1.5567177 0.8349424 1.8725418 -1.14868937]\n", + " [-1.57718852 1.46198797 2.6604315 -2.05249945]\n", + " [-0.82259772 0.42556336 1.14869343 -0.65901074]\n", + " [-0.28411876 -1.17623054 -1.27449033 1.31403245]]\n" + ] + } + ], + "source": [ + "# Display the data\n", + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "14a85431", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1\n" + ] + } + ], + "source": [ + "# The difference between the true solution and measured data\n", + "# should match the specified noise setting\n", + "diff = (solution.full() - data).norm() / solution.full().norm()\n", + "print(diff)" + ] + }, + { + "cell_type": "markdown", + "id": "1b7abeb5", + "metadata": {}, + "source": [ + "## Creating a Tucker test problem\n", + "The `create_problem` function can also create Tucker problems by providing a `TuckerProblem` data class as the first argument to `create_problem` instead. In this case, the function generates the solution as a `ttensor`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f7af9632", + "metadata": {}, + "outputs": [], + "source": [ + "tucker_specific_params = TuckerProblem(shape=(5, 4, 3), num_factors=[3, 3, 2])\n", + "data_params = DataParams(noise=0.1)\n", + "no_missing_data = MissingData()\n", + "solution, data = create_problem(tucker_specific_params, no_missing_data, data_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "699c9ecc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TTensor of shape: (5, 4, 3)\n", + "\tCore is a\n", + "\t\ttensor of shape (3, 3, 2) with order F\n", + "\t\tdata[:, :, 0] =\n", + "\t\t[[ 2.29546945 0.8628987 -0.13287838]\n", + "\t\t [ 0.31529775 0.94012555 -1.24988658]\n", + "\t\t [-0.75751615 0.66752096 -1.84400643]]\n", + "\t\tdata[:, :, 1] =\n", + "\t\t[[ 0.82319976 0.06143129 -0.31048223]\n", + "\t\t [-0.71417742 1.06731682 0.3213871 ]\n", + "\t\t [ 0.33786152 -1.90931822 0.37383405]]\n", + "\tU[0] = \n", + "\t\t[[ 0.93898923 0.43781947 1.14109158]\n", + "\t\t [ 0.17145177 -1.54957884 -0.97402348]\n", + "\t\t [-1.0491106 -0.46483438 -0.49055989]\n", + "\t\t [ 1.0007457 2.14851419 1.43240926]\n", + "\t\t [-0.13335333 0.00577405 -0.66762081]]\n", + "\tU[1] = \n", + "\t\t[[-0.94061891 0.93080981 0.04634267]\n", + "\t\t [ 1.33673724 0.28026028 1.49663046]\n", + "\t\t [-0.68415163 0.335301 -1.12855526]\n", + "\t\t [-0.13372712 -0.78503925 -0.23590284]]\n", + "\tU[2] = \n", + "\t\t[[-1.41195749 -0.88776123]\n", + "\t\t [ 0.10426711 0.42249603]\n", + "\t\t [-0.20072189 -1.41672713]]\n", + "\n" + ] + } + ], + "source": [ + "# Display the solution\n", + "print(solution)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "48285087", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (5, 4, 3) with order F\n", + "data[:, :, 0] =\n", + "[[ 1.90571751 1.29306932 -2.66047991 0.4787608 
]\n", + " [ 3.32632534 -8.24046905 7.05868556 -0.94570443]\n", + " [-1.70172708 2.04521885 0.47297378 -1.76717467]\n", + " [-1.77933637 5.49652024 -7.81954496 2.61105222]\n", + " [-0.62849444 -2.47539421 1.61469082 0.71437041]]\n", + "data[:, :, 1] =\n", + "[[-0.90290826 0.53648692 0.06304186 0.10529605]\n", + " [-0.59241983 0.91173894 -0.68241772 0.38676663]\n", + " [ 0.40853234 -0.04163589 0.21205378 0.08396353]\n", + " [-0.53454083 0.26397327 0.43616478 -0.47223017]\n", + " [ 0.07478656 -0.04549533 0.20458064 -0.37257969]]\n", + "data[:, :, 2] =\n", + "[[ 3.01781992 -1.167676 1.59175537 -0.96841114]\n", + " [ 1.37702074 -0.87936349 0.47784026 -0.01377307]\n", + " [-1.51797541 1.40668289 -0.8199048 0.2912658 ]\n", + " [-0.00535056 -0.77270545 0.0753881 0.21781704]\n", + " [-1.98105208 0.16641742 -0.82378859 1.06506215]]\n" + ] + } + ], + "source": [ + "# Display the data\n", + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9305a0be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1\n" + ] + } + ], + "source": [ + "# The difference between the true solution and measured data\n", + "# should match the specified noise setting\n", + "diff = (solution.full() - data).norm() / solution.full().norm()\n", + "print(diff)" + ] + }, + { + "cell_type": "markdown", + "id": "a3cdffab", + "metadata": {}, + "source": [ + "## Recreating the same test problem\n", + "We are still relying on numpy's deprecated global random state. 
See [#441](https://github.com/sandialabs/pyttb/issues/441)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d836930", + "metadata": {}, + "outputs": [], + "source": [ + "# Problem details\n", + "shape = [5, 4, 3]\n", + "num_factors = 3\n", + "seed = 123\n", + "missing_params = MissingData()\n", + "data_params = DataParams()\n", + "cp_specific_params = CPProblem(shape, num_factors=num_factors)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "21c10394", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the first test problem\n", + "np.random.seed(seed)\n", + "solution_1, data_1 = create_problem(cp_specific_params, missing_params, data_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "749f8aae", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the second test problem\n", + "np.random.seed(seed)\n", + "solution_2, data_2 = create_problem(cp_specific_params, missing_params, data_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6c6dd4a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "solution_1.isequal(solution_2)=True\n", + "(data_1-data_2).norm()=0.0\n" + ] + } + ], + "source": [ + "# Check that the solutions are identical\n", + "print(f\"{solution_1.isequal(solution_2)=}\")\n", + "\n", + "# Check that the data are identical\n", + "print(f\"{(data_1-data_2).norm()=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "90a399d2", + "metadata": {}, + "source": [ + "## Options for creating factor matrices, core tensors, and weights\n", + "\n", + "User specified functions may be provided to generate the relevant components of `ktensors` or `ttensors`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7e20d77a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1. 
1.]\n" + ] + } + ], + "source": [ + "# Example custom weight generator for CP Problems\n", + "cp_specific_params = CPProblem(shape=[5, 4, 3], num_factors=2, weight_generator=np.ones)\n", + "solution, _ = create_problem(cp_specific_params, missing_params, data_params)\n", + "print(f\"{solution.weights}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "4f18ec86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (2, 2, 2) with order F\n", + "data[:, :, 0] =\n", + "[[1. 1.]\n", + " [1. 1.]]\n", + "data[:, :, 1] =\n", + "[[1. 1.]\n", + " [1. 1.]]\n" + ] + } + ], + "source": [ + "# Example custom core generator for Tucker\n", + "tucker_specific_params = TuckerProblem(\n", + " shape=[5, 4, 3], num_factors=[2, 2, 2], core_generator=ttb.tenones\n", + ")\n", + "solution, _ = create_problem(tucker_specific_params, missing_params, data_params)\n", + "print(f\"{solution.core}\")" + ] + }, + { + "cell_type": "markdown", + "id": "40db96b5", + "metadata": {}, + "source": [ + "## Create dense missing data problems\n", + "It's possible to create problems that have a percentage of missing data. The problem generator randomly creates the pattern of missing data." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e6ceafb2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (5, 4, 3) with order F\n", + "data[:, :, 0] =\n", + "[[1. 1. 1. 1.]\n", + " [1. 1. 1. 1.]\n", + " [1. 0. 0. 1.]\n", + " [1. 0. 1. 1.]\n", + " [0. 0. 1. 1.]]\n", + "data[:, :, 1] =\n", + "[[1. 0. 1. 1.]\n", + " [0. 1. 1. 1.]\n", + " [0. 0. 1. 0.]\n", + " [0. 1. 0. 1.]\n", + " [0. 1. 1. 1.]]\n", + "data[:, :, 2] =\n", + "[[1. 1. 1. 1.]\n", + " [1. 0. 1. 1.]\n", + " [1. 1. 1. 0.]\n", + " [1. 1. 1. 1.]\n", + " [1. 1. 1. 
1.]]\n" + ] + } + ], + "source": [ + "# Specify 25% missing data\n", + "missing_data_params = MissingData(missing_ratio=0.25)\n", + "\n", + "# Show an example of randomly generated pattern\n", + "# 1 is known 0 is unknown\n", + "print(missing_data_params.get_pattern(shape=[5, 4, 3]))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "de646ec4", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate problem using a newly sampled pattern\n", + "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a51a3e70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (5, 4, 3) with order F\n", + "data[:, :, 0] =\n", + "[[ 0.26328253 -0.10368023 2.55048192 -3.57426141]\n", + " [ 0.94610094 0. -0.33422528 0. ]\n", + " [-0.92754391 0.06078374 -0.58964057 1.05604786]\n", + " [ 0.09245559 0.09024844 -0.30026929 1.37588424]\n", + " [ 0. 0.28395231 1.72801315 -0.92447749]]\n", + "data[:, :, 1] =\n", + "[[ 9.52217582e+00 -0.00000000e+00 0.00000000e+00 -6.69297443e+00]\n", + " [ 1.15649571e+00 0.00000000e+00 5.55042375e-01 -1.65046604e+00]\n", + " [-4.51899793e+00 0.00000000e+00 5.78509093e-01 0.00000000e+00]\n", + " [-2.79055031e+00 0.00000000e+00 4.46173850e-01 2.02037594e+00]\n", + " [ 0.00000000e+00 -4.02815924e-01 -7.73108195e-01 8.60303664e-03]]\n", + "data[:, :, 2] =\n", + "[[ 3.79691232 -0.06051519 0.65215482 -0. ]\n", + " [ 0.88487369 -0.32951914 -0. -0.4502584 ]\n", + " [-2.0738586 -0.1541553 -0.01849825 0. ]\n", + " [-0.88031719 0. 0. 
1.15149304]\n", + " [-0.26446742 -0.16180758 0.39415731 -0.15164033]]\n" + ] + } + ], + "source": [ + "# Show data (including noise) with missing entries zeroed out\n", + "print(data)" + ] + }, + { + "cell_type": "markdown", + "id": "b318629f", + "metadata": {}, + "source": [ + "## Creating sparse missing data problems\n", + "If `sparse_model` is set to true then the returned data is sparse. This should only be used with `missing_ratio` >= 0.8." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "475f352b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sparse tensor of shape (5, 4, 3) with 12 nonzeros and order F\n", + "[2, 0, 0] = 1.0\n", + "[4, 1, 2] = 1.0\n", + "[0, 2, 1] = 1.0\n", + "[3, 1, 0] = 1.0\n", + "[0, 3, 2] = 1.0\n", + "[4, 1, 0] = 1.0\n", + "[2, 0, 2] = 1.0\n", + "[1, 0, 2] = 1.0\n", + "[0, 1, 2] = 1.0\n", + "[4, 2, 0] = 1.0\n", + "[4, 3, 0] = 1.0\n", + "[4, 1, 1] = 1.0\n" + ] + } + ], + "source": [ + "missing_data_params = MissingData(missing_ratio=0.8, sparse_model=True)\n", + "\n", + "# Here is a candidate pattern of known data\n", + "print(missing_data_params.get_pattern([5, 4, 3]))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "927d028b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sparse tensor of shape (5, 4, 3) with 12 nonzeros and order F\n", + "[0, 0, 2] = -5.383104265170353\n", + "[1, 0, 1] = 1.3205409642301527\n", + "[1, 3, 1] = 0.37245008604597707\n", + "[2, 1, 0] = 3.4968221275551286\n", + "[2, 3, 0] = -0.60505637068868\n", + "[3, 1, 0] = 1.2090679007381293\n", + "[3, 3, 0] = 0.465905565990883\n", + "[3, 3, 1] = -0.4776597676392981\n", + "[4, 2, 0] = 1.322753952503849\n", + "[4, 2, 2] = 4.164836676033628\n", + "[4, 3, 1] = 0.04320152879052623\n", + "[4, 3, 2] = 0.5475986467539911\n" + ] + } + ], + "source": [ + "# Here is the data (including noise) with zeros not explicitly 
represented.\n", + "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)\n", + "print(data)" + ] + }, + { + "cell_type": "markdown", + "id": "b0a4db7a", + "metadata": {}, + "source": [ + "## Create missing data problems with pre-specified pattern\n", + "A specific pattern (dense or sparse) can be used to represent missing data. This is also currently the recommended approach for reproducibility." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "499efc37", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor of shape (5, 4, 3) with order F\n", + "data[:, :, 0] =\n", + "[[ 1.12259246 -0.62712395 0.37444797 0.14341225]\n", + " [ 0. -0.23923868 -0.28106573 -0. ]\n", + " [-2.19406735 -0. -1.26176736 -0.96253911]\n", + " [ 1.19096803 0.73586963 0.82194128 0.71532815]\n", + " [-0.06070134 0.18508213 0.05135651 -0.09115959]]\n", + "data[:, :, 1] =\n", + "[[ 0. -2.17818307 0.00366178 0. ]\n", + " [-0.51123889 0. 0. -0.30924106]\n", + " [-2.75480765 -0.36658613 -1.36684341 -1.02292674]\n", + " [ 0.9916353 0. 0.72938433 0.66456863]\n", + " [-0.40295989 0.38817973 -0.07536029 -0.03630603]]\n", + "data[:, :, 2] =\n", + "[[-1.17821661 1.27948531 0.16695706 -0. ]\n", + " [-0. -0.15915173 -0.17588344 0.02034108]\n", + " [-0. 0. 0. -0.33177688]\n", + " [ 0.61206739 -0.17658631 0.1972258 0. ]\n", + " [ 0. 
-0.21265941 -0.00546545 0.07131428]]\n" + ] + } + ], + "source": [ + "# Grab a pattern from before\n", + "pattern = MissingData(missing_ratio=0.25).get_pattern([5, 4, 3])\n", + "missing_data_params = MissingData(missing_pattern=pattern)\n", + "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)\n", + "print(data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index 0cd19f65..fedaf5d5 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -12,6 +12,9 @@ from pyttb.pyttb_utils import Shape, parse_shape solution_generator = Callable[[Tuple[int, ...]], np.ndarray] +core_generator_t = Callable[ + [Tuple[int, ...]], Union[ttb.tensor, ttb.sptensor, np.ndarray] +] def randn(shape: Tuple[int, ...]) -> np.ndarray: @@ -51,7 +54,7 @@ class TuckerProblem(BaseProblem): # TODO post_init set to [2, 2, 2] num_factors: Optional[list[int]] = None - core_generator: solution_generator = randn + core_generator: core_generator_t = randn def __post_init__(self): super().__post_init__() @@ -297,7 +300,11 @@ def generate_solution( # Create final model if isinstance(problem_params, TuckerProblem): nfactors = cast(list[int], problem_params.num_factors) - core = ttb.tensor(problem_params.core_generator(tuple(nfactors))) + generated_core = problem_params.core_generator(tuple(nfactors)) + if isinstance(generated_core, (ttb.tensor, ttb.sptensor)): + core = generated_core + else: + core = ttb.tensor(generated_core) return ttb.ttensor(core, factor_matrices) elif isinstance(problem_params, CPProblem): weights = 
problem_params.weight_generator((problem_params.num_factors,)) @@ -335,13 +342,7 @@ def generate_data( if pattern is not None: if isinstance(pattern, ttb.sptensor): Rdm = ttb.sptensor(pattern.subs, randn((pattern.nnz, 1)), pattern.shape) - try: - Z = pattern * solution - except Exception as E: - raise ValueError( - f"{pattern.shape=}, {pattern.subs.shape}, {pattern.vals.shape}" - ) from E - + Z = pattern * solution elif isinstance(pattern, ttb.tensor): Rdm = pattern * ttb.tensor(randn(shape)) Z = pattern * solution.full() diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index 996c785e..d4c202da 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -3,6 +3,7 @@ import pyttb as ttb from pyttb.create_problem import ( + BaseProblem, CPProblem, DataParams, MissingData, @@ -91,6 +92,20 @@ def test_create_problem_smoke(): soln, data = create_problem(cp_params, missing_params, data_params) assert soln.full().shape == data.shape + cp_params.symmetric = [(0, 1)] + soln, data = create_problem(cp_params, missing_params, data_params) + assert soln.full().shape == data.shape + + with pytest.raises(ValueError): + empty_num_factors = BaseProblem(shape) + create_problem(empty_num_factors, missing_params, data_params) + with pytest.raises(ValueError): + inconsistent_num_factors = BaseProblem(shape, num_factors=[2, 2]) + create_problem(inconsistent_num_factors, missing_params, data_params) + with pytest.raises(ValueError): + bad_problem_type = BaseProblem(shape, num_factors=3) + create_problem(bad_problem_type, missing_params, data_params) + # TODO hit edge cases and symmetric @@ -104,6 +119,9 @@ def test_create_problem_smoke_sparse(): soln, data = create_problem(cp_params, missing_params, data_params) assert soln.full().shape == data.shape + with pytest.raises(ValueError): + missing_AND_sparse_generation = MissingData(missing_ratio=0.1) + create_problem(cp_params, missing_AND_sparse_generation, data_params) # TODO hit edge cases and 
symmetric @@ -118,3 +136,13 @@ def test_create_problem_smoke_missing(): missing_params = MissingData(missing_ratio=0.8, sparse_model=True) soln, data = create_problem(cp_params, missing_params, data_params) assert soln.full().shape == data.shape + + with pytest.raises(ValueError): + bad_pattern_shape = np.ones([dim + 1 for dim in soln.shape]) + missing_params = MissingData(missing_pattern=bad_pattern_shape) + create_problem(cp_params, missing_params, data_params) + + with pytest.raises(ValueError): + bad_pattern_type = np.ones(soln.shape) + missing_params = MissingData(missing_pattern=bad_pattern_type) + create_problem(cp_params, missing_params, data_params) From d2725a369e30164485734a792952bcdc91e7cd5f Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:00:12 -0400 Subject: [PATCH 13/20] Fix ttensor doc string --- pyttb/ttensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyttb/ttensor.py b/pyttb/ttensor.py index eb58d4f4..ba22ebec 100644 --- a/pyttb/ttensor.py +++ b/pyttb/ttensor.py @@ -200,7 +200,7 @@ def __repr__(self): # pragma: no cover str Contains the core, and factor matrices as strings on different lines. """ - display_string = f"Tensor of shape: {self.shape}\n" f"\tCore is a\n" + display_string = f"TTensor of shape: {self.shape}\n" f"\tCore is a\n" display_string += textwrap.indent(str(self.core), "\t\t") display_string += "\n" From aeb52fc3c8460694a2daaf3e9dbcec60b26e4c7f Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:00:34 -0400 Subject: [PATCH 14/20] Fix coverage in overload --- pyttb/tensor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyttb/tensor.py b/pyttb/tensor.py index 1c2e8aa8..4aaf9ca0 100644 --- a/pyttb/tensor.py +++ b/pyttb/tensor.py @@ -316,14 +316,14 @@ def collapse( self, dims: None, fun: Callable[[np.ndarray], Union[float, np.ndarray]], - ) -> float: ... 
+ ) -> float: ... # pragma: no cover see coveragepy/issues/970 @overload def collapse( self, dims: OneDArray, fun: Callable[[np.ndarray], Union[float, np.ndarray]] = np.sum, - ) -> Union[np.ndarray, tensor]: ... + ) -> Union[np.ndarray, tensor]: ... # pragma: no cover see coveragepy/issues/970 def collapse( self, From a0d85124766ae59eb4c4d40c58dc2b2febcc1298 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Wed, 18 Jun 2025 08:48:01 -0400 Subject: [PATCH 15/20] Add minimal validation to sptensor to avoid footgun --- pyttb/sptensor.py | 5 +++++ tests/test_sptensor.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pyttb/sptensor.py b/pyttb/sptensor.py index 2eddee5b..cca500ca 100644 --- a/pyttb/sptensor.py +++ b/pyttb/sptensor.py @@ -163,6 +163,11 @@ def __init__( if vals.size == 0: # In case user provides an empty array in weird format vals = np.array([], dtype=vals.dtype, ndmin=2) + elif len(vals.shape) == 1: + # Enforce column array + vals = vals.reshape((vals.shape[0], 1)) + elif len(vals.shape) > 2: + raise ValueError("Values should be a column vector") if copy: self.subs = subs.copy() diff --git a/tests/test_sptensor.py b/tests/test_sptensor.py index 0e41e23a..b7d32756 100644 --- a/tests/test_sptensor.py +++ b/tests/test_sptensor.py @@ -1357,7 +1357,7 @@ def test_sptensor_squeeze(sample_sptensor): ) assert np.array_equal( ttb.sptensor(np.array([[0, 0, 0]]), np.array([4]), (2, 2, 1)).squeeze().vals, - np.array([4]), + np.array([[4]]), ) # Singleton dimension with empty sptensor From 9a0aef73d52c9734c8ceb509bbdc841cc55a07b9 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:01:31 -0400 Subject: [PATCH 16/20] Clean up create problem documentation --- docs/source/create_problem.rst | 22 + docs/source/tensor_classes.rst | 1 + .../tutorial/utility_test_problem.ipynb | 461 +++--------------- docs/source/tutorials.rst | 5 + 
pyttb/create_problem.py | 163 +++++-- tests/test_create_problem.py | 37 +- 6 files changed, 256 insertions(+), 433 deletions(-) create mode 100644 docs/source/create_problem.rst diff --git a/docs/source/create_problem.rst b/docs/source/create_problem.rst new file mode 100644 index 00000000..9b6a99dc --- /dev/null +++ b/docs/source/create_problem.rst @@ -0,0 +1,22 @@ +Create Test Problems (:obj:`pyttb.create_problem`) +--------------------------------------------------- + +.. autoclass:: pyttb.create_problem.BaseProblem + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. autoclass:: pyttb.create_problem.CPProblem + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. autoclass:: pyttb.create_problem.TuckerProblem + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. autoclass:: pyttb.create_problem.MissingData + :members: + :special-members: + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. 
autofunction:: pyttb.create_problem.create_problem diff --git a/docs/source/tensor_classes.rst b/docs/source/tensor_classes.rst index 56fb9f21..4a220ed3 100644 --- a/docs/source/tensor_classes.rst +++ b/docs/source/tensor_classes.rst @@ -12,4 +12,5 @@ Tensor Classes tenmat.rst sptenmat.rst pyttb_utils.rst + create_problem.rst diff --git a/docs/source/tutorial/utility_test_problem.ipynb b/docs/source/tutorial/utility_test_problem.ipynb index 08e51035..d4fabe8f 100644 --- a/docs/source/tutorial/utility_test_problem.ipynb +++ b/docs/source/tutorial/utility_test_problem.ipynb @@ -2,12 +2,8 @@ "cells": [ { "cell_type": "markdown", - "id": "f1c6d8db", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "id": "0", + "metadata": {}, "source": [ "# Creating Test Problems\n", "```\n", @@ -19,7 +15,7 @@ }, { "cell_type": "markdown", - "id": "31511b37", + "id": "1", "metadata": {}, "source": [ "We demonstrate how to use the `create_problem` function to create test problems for decomposition algorithms. 
" @@ -27,8 +23,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "afb832c2", + "execution_count": null, + "id": "2", "metadata": {}, "outputs": [], "source": [ @@ -37,15 +33,14 @@ " CPProblem,\n", " TuckerProblem,\n", " MissingData,\n", - " DataParams,\n", " create_problem,\n", ")" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "9451a579", + "execution_count": null, + "id": "3", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "7771e8fe", + "id": "4", "metadata": {}, "source": [ "## Create a CP test problem\n", @@ -66,48 +61,23 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "e6191ae4", + "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ "# Create a problem\n", - "cp_specific_params = CPProblem(shape=(5, 4, 3), num_factors=3)\n", - "data_params = DataParams(noise=0.1)\n", + "cp_specific_params = CPProblem(shape=(5, 4, 3), num_factors=3, noise=0.1)\n", "no_missing_data = MissingData()\n", - "solution, data = create_problem(cp_specific_params, no_missing_data, data_params)" + "solution, data = create_problem(cp_specific_params, no_missing_data)" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "8745779d", + "execution_count": null, + "id": "6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ktensor of shape (5, 4, 3) with order F\n", - "weights=[0.94416002 0.50183668 0.62395295]\n", - "factor_matrices[0] =\n", - "[[-1.0856306 0.99734545 0.2829785 ]\n", - " [-1.50629471 -0.57860025 1.65143654]\n", - " [-2.42667924 -0.42891263 1.26593626]\n", - " [-0.8667404 -0.67888615 -0.09470897]\n", - " [ 1.49138963 -0.638902 -0.44398196]]\n", - "factor_matrices[1] =\n", - "[[-0.43435128 2.20593008 2.18678609]\n", - " [ 1.0040539 0.3861864 0.73736858]\n", - " [ 1.49073203 -0.93583387 1.17582904]\n", - " [-1.25388067 -0.6377515 0.9071052 ]]\n", - "factor_matrices[2] =\n", - "[[-1.4286807 -0.14006872 
-0.8617549 ]\n", - " [-0.25561937 -2.79858911 -1.7715331 ]\n", - " [-0.69987723 0.92746243 -0.17363568]]\n" - ] - } - ], + "outputs": [], "source": [ "# Display the solution\n", "print(solution)" @@ -115,36 +85,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "b0bc3232", + "execution_count": null, + "id": "7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (5, 4, 3) with order F\n", - "data[:, :, 0] =\n", - "[[-1.18990893 1.28446351 2.07235179 -1.87633271]\n", - " [-3.12652349 1.07273265 2.34701048 -3.14030325]\n", - " [-2.81968366 2.67865791 4.10636867 -4.33460199]\n", - " [-0.49910248 1.58553609 1.67667918 -1.4803083 ]\n", - " [ 1.5935628 -1.73784063 -2.7256112 2.76967403]]\n", - "data[:, :, 1] =\n", - "[[-4.02748914 -0.53027464 1.39868896 0.35255157]\n", - " [-2.24482406 -0.51914665 -2.34027329 -2.45371282]\n", - " [-2.02367801 -0.3794908 -1.16866717 -2.43337295]\n", - " [ 2.46562453 0.78956773 -0.26223999 -0.47003828]\n", - " [ 3.48686179 0.07186695 -1.21278825 0.24950518]]\n", - "data[:, :, 2] =\n", - "[[ 0.84583153 0.55670008 0.42026956 -0.99690908]\n", - " [-1.5567177 0.8349424 1.8725418 -1.14868937]\n", - " [-1.57718852 1.46198797 2.6604315 -2.05249945]\n", - " [-0.82259772 0.42556336 1.14869343 -0.65901074]\n", - " [-0.28411876 -1.17623054 -1.27449033 1.31403245]]\n" - ] - } - ], + "outputs": [], "source": [ "# Display the data\n", "print(data)" @@ -152,18 +96,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "14a85431", + "execution_count": null, + "id": "8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1\n" - ] - } - ], + "outputs": [], "source": [ "# The difference between the true solution and measured data\n", "# should match the specified noise setting\n", @@ -173,7 +109,7 @@ }, { "cell_type": "markdown", - "id": "1b7abeb5", + "id": "9", "metadata": {}, "source": [ "## Creating a Tucker test problem\n", 
@@ -182,57 +118,24 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "f7af9632", + "execution_count": null, + "id": "10", "metadata": {}, "outputs": [], "source": [ - "tucker_specific_params = TuckerProblem(shape=(5, 4, 3), num_factors=[3, 3, 2])\n", - "data_params = DataParams(noise=0.1)\n", + "tucker_specific_params = TuckerProblem(\n", + " shape=(5, 4, 3), num_factors=[3, 3, 2], noise=0.1\n", + ")\n", "no_missing_data = MissingData()\n", - "solution, data = create_problem(tucker_specific_params, no_missing_data, data_params)" + "solution, data = create_problem(tucker_specific_params, no_missing_data)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "699c9ecc", + "execution_count": null, + "id": "11", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TTensor of shape: (5, 4, 3)\n", - "\tCore is a\n", - "\t\ttensor of shape (3, 3, 2) with order F\n", - "\t\tdata[:, :, 0] =\n", - "\t\t[[ 2.29546945 0.8628987 -0.13287838]\n", - "\t\t [ 0.31529775 0.94012555 -1.24988658]\n", - "\t\t [-0.75751615 0.66752096 -1.84400643]]\n", - "\t\tdata[:, :, 1] =\n", - "\t\t[[ 0.82319976 0.06143129 -0.31048223]\n", - "\t\t [-0.71417742 1.06731682 0.3213871 ]\n", - "\t\t [ 0.33786152 -1.90931822 0.37383405]]\n", - "\tU[0] = \n", - "\t\t[[ 0.93898923 0.43781947 1.14109158]\n", - "\t\t [ 0.17145177 -1.54957884 -0.97402348]\n", - "\t\t [-1.0491106 -0.46483438 -0.49055989]\n", - "\t\t [ 1.0007457 2.14851419 1.43240926]\n", - "\t\t [-0.13335333 0.00577405 -0.66762081]]\n", - "\tU[1] = \n", - "\t\t[[-0.94061891 0.93080981 0.04634267]\n", - "\t\t [ 1.33673724 0.28026028 1.49663046]\n", - "\t\t [-0.68415163 0.335301 -1.12855526]\n", - "\t\t [-0.13372712 -0.78503925 -0.23590284]]\n", - "\tU[2] = \n", - "\t\t[[-1.41195749 -0.88776123]\n", - "\t\t [ 0.10426711 0.42249603]\n", - "\t\t [-0.20072189 -1.41672713]]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Display the solution\n", "print(solution)" @@ -240,36 +143,10 
@@ }, { "cell_type": "code", - "execution_count": 9, - "id": "48285087", + "execution_count": null, + "id": "12", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (5, 4, 3) with order F\n", - "data[:, :, 0] =\n", - "[[ 1.90571751 1.29306932 -2.66047991 0.4787608 ]\n", - " [ 3.32632534 -8.24046905 7.05868556 -0.94570443]\n", - " [-1.70172708 2.04521885 0.47297378 -1.76717467]\n", - " [-1.77933637 5.49652024 -7.81954496 2.61105222]\n", - " [-0.62849444 -2.47539421 1.61469082 0.71437041]]\n", - "data[:, :, 1] =\n", - "[[-0.90290826 0.53648692 0.06304186 0.10529605]\n", - " [-0.59241983 0.91173894 -0.68241772 0.38676663]\n", - " [ 0.40853234 -0.04163589 0.21205378 0.08396353]\n", - " [-0.53454083 0.26397327 0.43616478 -0.47223017]\n", - " [ 0.07478656 -0.04549533 0.20458064 -0.37257969]]\n", - "data[:, :, 2] =\n", - "[[ 3.01781992 -1.167676 1.59175537 -0.96841114]\n", - " [ 1.37702074 -0.87936349 0.47784026 -0.01377307]\n", - " [-1.51797541 1.40668289 -0.8199048 0.2912658 ]\n", - " [-0.00535056 -0.77270545 0.0753881 0.21781704]\n", - " [-1.98105208 0.16641742 -0.82378859 1.06506215]]\n" - ] - } - ], + "outputs": [], "source": [ "# Display the data\n", "print(data)" @@ -277,18 +154,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "9305a0be", + "execution_count": null, + "id": "13", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1\n" - ] - } - ], + "outputs": [], "source": [ "# The difference between the true solution and measured data\n", "# should match the specified noise setting\n", @@ -298,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "a3cdffab", + "id": "14", "metadata": {}, "source": [ "## Recreating the same test problem\n", @@ -307,8 +176,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "4d836930", + "execution_count": null, + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -317,49 +186,39 @@ 
"num_factors = 3\n", "seed = 123\n", "missing_params = MissingData()\n", - "data_params = DataParams()\n", "cp_specific_params = CPProblem(shape, num_factors=num_factors)" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "21c10394", + "execution_count": null, + "id": "16", "metadata": {}, "outputs": [], "source": [ "# Generate the first test problem\n", "np.random.seed(seed)\n", - "solution_1, data_1 = create_problem(cp_specific_params, missing_params, data_params)" + "solution_1, data_1 = create_problem(cp_specific_params, missing_params)" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "749f8aae", + "execution_count": null, + "id": "17", "metadata": {}, "outputs": [], "source": [ "# Generate the second test problem\n", "np.random.seed(seed)\n", - "solution_2, data_2 = create_problem(cp_specific_params, missing_params, data_params)" + "solution_2, data_2 = create_problem(cp_specific_params, missing_params)" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "6c6dd4a6", + "execution_count": null, + "id": "18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "solution_1.isequal(solution_2)=True\n", - "(data_1-data_2).norm()=0.0\n" - ] - } - ], + "outputs": [], "source": [ "# Check that the solutions are identical\n", "print(f\"{solution_1.isequal(solution_2)=}\")\n", @@ -370,7 +229,7 @@ }, { "cell_type": "markdown", - "id": "90a399d2", + "id": "19", "metadata": {}, "source": [ "## Options for creating factor matrices, core tensors, and weights\n", @@ -380,57 +239,35 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "7e20d77a", + "execution_count": null, + "id": "20", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1. 
1.]\n" - ] - } - ], + "outputs": [], "source": [ "# Example custom weight generator for CP Problems\n", "cp_specific_params = CPProblem(shape=[5, 4, 3], num_factors=2, weight_generator=np.ones)\n", - "solution, _ = create_problem(cp_specific_params, missing_params, data_params)\n", + "solution, _ = create_problem(cp_specific_params, missing_params)\n", "print(f\"{solution.weights}\")" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "4f18ec86", + "execution_count": null, + "id": "21", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (2, 2, 2) with order F\n", - "data[:, :, 0] =\n", - "[[1. 1.]\n", - " [1. 1.]]\n", - "data[:, :, 1] =\n", - "[[1. 1.]\n", - " [1. 1.]]\n" - ] - } - ], + "outputs": [], "source": [ "# Example custom core generator for Tucker\n", "tucker_specific_params = TuckerProblem(\n", " shape=[5, 4, 3], num_factors=[2, 2, 2], core_generator=ttb.tenones\n", ")\n", - "solution, _ = create_problem(tucker_specific_params, missing_params, data_params)\n", + "solution, _ = create_problem(tucker_specific_params, missing_params)\n", "print(f\"{solution.core}\")" ] }, { "cell_type": "markdown", - "id": "40db96b5", + "id": "22", "metadata": {}, "source": [ "## Create dense missing data problems\n", @@ -439,36 +276,10 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "e6ceafb2", + "execution_count": null, + "id": "23", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (5, 4, 3) with order F\n", - "data[:, :, 0] =\n", - "[[1. 1. 1. 1.]\n", - " [1. 1. 1. 1.]\n", - " [1. 0. 0. 1.]\n", - " [1. 0. 1. 1.]\n", - " [0. 0. 1. 1.]]\n", - "data[:, :, 1] =\n", - "[[1. 0. 1. 1.]\n", - " [0. 1. 1. 1.]\n", - " [0. 0. 1. 0.]\n", - " [0. 1. 0. 1.]\n", - " [0. 1. 1. 1.]]\n", - "data[:, :, 2] =\n", - "[[1. 1. 1. 1.]\n", - " [1. 0. 1. 1.]\n", - " [1. 1. 1. 0.]\n", - " [1. 1. 1. 1.]\n", - " [1. 1. 1. 
1.]]\n" - ] - } - ], + "outputs": [], "source": [ "# Specify 25% missing data\n", "missing_data_params = MissingData(missing_ratio=0.25)\n", @@ -480,47 +291,21 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "de646ec4", + "execution_count": null, + "id": "24", "metadata": {}, "outputs": [], "source": [ "# Generate problem using a newly sampled pattern\n", - "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)" + "solution, data = create_problem(cp_specific_params, missing_data_params)" ] }, { "cell_type": "code", - "execution_count": 26, - "id": "a51a3e70", + "execution_count": null, + "id": "25", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (5, 4, 3) with order F\n", - "data[:, :, 0] =\n", - "[[ 0.26328253 -0.10368023 2.55048192 -3.57426141]\n", - " [ 0.94610094 0. -0.33422528 0. ]\n", - " [-0.92754391 0.06078374 -0.58964057 1.05604786]\n", - " [ 0.09245559 0.09024844 -0.30026929 1.37588424]\n", - " [ 0. 0.28395231 1.72801315 -0.92447749]]\n", - "data[:, :, 1] =\n", - "[[ 9.52217582e+00 -0.00000000e+00 0.00000000e+00 -6.69297443e+00]\n", - " [ 1.15649571e+00 0.00000000e+00 5.55042375e-01 -1.65046604e+00]\n", - " [-4.51899793e+00 0.00000000e+00 5.78509093e-01 0.00000000e+00]\n", - " [-2.79055031e+00 0.00000000e+00 4.46173850e-01 2.02037594e+00]\n", - " [ 0.00000000e+00 -4.02815924e-01 -7.73108195e-01 8.60303664e-03]]\n", - "data[:, :, 2] =\n", - "[[ 3.79691232 -0.06051519 0.65215482 -0. ]\n", - " [ 0.88487369 -0.32951914 -0. -0.4502584 ]\n", - " [-2.0738586 -0.1541553 -0.01849825 0. ]\n", - " [-0.88031719 0. 0. 
1.15149304]\n", - " [-0.26446742 -0.16180758 0.39415731 -0.15164033]]\n" - ] - } - ], + "outputs": [], "source": [ "# Show data (including noise) with missing entries zeroed out\n", "print(data)" @@ -528,7 +313,7 @@ }, { "cell_type": "markdown", - "id": "b318629f", + "id": "26", "metadata": {}, "source": [ "## Creating sparse missing data problems\n", @@ -538,29 +323,9 @@ { "cell_type": "code", "execution_count": null, - "id": "475f352b", + "id": "27", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sparse tensor of shape (5, 4, 3) with 12 nonzeros and order F\n", - "[2, 0, 0] = 1.0\n", - "[4, 1, 2] = 1.0\n", - "[0, 2, 1] = 1.0\n", - "[3, 1, 0] = 1.0\n", - "[0, 3, 2] = 1.0\n", - "[4, 1, 0] = 1.0\n", - "[2, 0, 2] = 1.0\n", - "[1, 0, 2] = 1.0\n", - "[0, 1, 2] = 1.0\n", - "[4, 2, 0] = 1.0\n", - "[4, 3, 0] = 1.0\n", - "[4, 1, 1] = 1.0\n" - ] - } - ], + "outputs": [], "source": [ "missing_data_params = MissingData(missing_ratio=0.8, sparse_model=True)\n", "\n", @@ -570,39 +335,19 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "927d028b", + "execution_count": null, + "id": "28", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sparse tensor of shape (5, 4, 3) with 12 nonzeros and order F\n", - "[0, 0, 2] = -5.383104265170353\n", - "[1, 0, 1] = 1.3205409642301527\n", - "[1, 3, 1] = 0.37245008604597707\n", - "[2, 1, 0] = 3.4968221275551286\n", - "[2, 3, 0] = -0.60505637068868\n", - "[3, 1, 0] = 1.2090679007381293\n", - "[3, 3, 0] = 0.465905565990883\n", - "[3, 3, 1] = -0.4776597676392981\n", - "[4, 2, 0] = 1.322753952503849\n", - "[4, 2, 2] = 4.164836676033628\n", - "[4, 3, 1] = 0.04320152879052623\n", - "[4, 3, 2] = 0.5475986467539911\n" - ] - } - ], + "outputs": [], "source": [ "# Here is the data (including noise) with zeros not explicitly represented.\n", - "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)\n", + "solution, 
data = create_problem(cp_specific_params, missing_data_params)\n", "print(data)" ] }, { "cell_type": "markdown", - "id": "b0a4db7a", + "id": "29", "metadata": {}, "source": [ "## Create missing data problems with pre-specified pattern\n", @@ -611,64 +356,20 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "499efc37", + "execution_count": null, + "id": "30", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor of shape (5, 4, 3) with order F\n", - "data[:, :, 0] =\n", - "[[ 1.12259246 -0.62712395 0.37444797 0.14341225]\n", - " [ 0. -0.23923868 -0.28106573 -0. ]\n", - " [-2.19406735 -0. -1.26176736 -0.96253911]\n", - " [ 1.19096803 0.73586963 0.82194128 0.71532815]\n", - " [-0.06070134 0.18508213 0.05135651 -0.09115959]]\n", - "data[:, :, 1] =\n", - "[[ 0. -2.17818307 0.00366178 0. ]\n", - " [-0.51123889 0. 0. -0.30924106]\n", - " [-2.75480765 -0.36658613 -1.36684341 -1.02292674]\n", - " [ 0.9916353 0. 0.72938433 0.66456863]\n", - " [-0.40295989 0.38817973 -0.07536029 -0.03630603]]\n", - "data[:, :, 2] =\n", - "[[-1.17821661 1.27948531 0.16695706 -0. ]\n", - " [-0. -0.15915173 -0.17588344 0.02034108]\n", - " [-0. 0. 0. -0.33177688]\n", - " [ 0.61206739 -0.17658631 0.1972258 0. ]\n", - " [ 0. 
-0.21265941 -0.00546545 0.07131428]]\n" - ] - } - ], + "outputs": [], "source": [ "# Grab a pattern from before\n", "pattern = MissingData(missing_ratio=0.25).get_pattern([5, 4, 3])\n", "missing_data_params = MissingData(missing_pattern=pattern)\n", - "solution, data = create_problem(cp_specific_params, missing_data_params, data_params)\n", + "solution, data = create_problem(cp_specific_params, missing_data_params)\n", "print(data)" ] } ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.12" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 5 } diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index ed373b56..4b1ffbc8 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -32,6 +32,11 @@ Tucker Decompositions Working with Tensors ==================== +.. toctree:: + :maxdepth: 1 + + Creating Test Problems + Converting Between Tensors and Matrices --------------------------------------- diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index fedaf5d5..f89cde6d 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -2,7 +2,7 @@ import logging import math -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Callable, Optional, Tuple, Union, cast, overload import numpy as np @@ -27,30 +27,99 @@ def randn(shape: Tuple[int, ...]) -> np.ndarray: @dataclass class BaseProblem: - """Parameters general to all solutions.""" + """Parameters general to all solutions. + + Attributes + ---------- + shape: + Tensor shape for generated problem. + factor_generator: + Method to generate factor matrices. + symmetric: + List of modes that should be symmetric. 
+ For instance, `[(1,2), (3,4)]` specifies that + modes 1 and 2 have identical factor matrices, and modes 3 and 4 + also have identical factor matrices. + num_factors: + Number of factors. + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries. + """ - shape: Shape + shape: Shape = field(metadata={"doc": "A shape"}) factor_generator: solution_generator = randn symmetric: Optional[list[Tuple[int, int]]] = None num_factors: Union[int, list[int], None] = None + noise: float = 0.10 def __post_init__(self): self.shape = ttb.pyttb_utils.parse_shape(self.shape) + if not 0.0 <= self.noise <= 1.0: + raise ValueError(f"Noise must be in [0,1] but got {self.noise}") @dataclass class CPProblem(BaseProblem): - """Parameters specifying CP Solutions.""" + """Parameters specifying CP Solutions. + + Attributes + ---------- + shape: + Tensor shape for generated problem. + factor_generator: + Method to generate factor matrices. + symmetric: + List of modes that should be symmetric. + For instance, `[(1,2), (3,4)]` specifies that + modes 1 and 2 have identical factor matrices, and modes 3 and 4 + also have identical factor matrices. + num_factors: + Number of factors. + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries. + weight_generator: + Method to generate weights for ktensor solution. + sparse_generation: + Generate a sparse tensor that can be scaled so that the + column factors and weights are stochastic. Provide a number + of nonzeros to be inserted. A value in range [0,1) will be + interpreted as a ratio. 
+ """ + + # NOTE inherited attributes are manually copy pasted, keep aligned between problems num_factors: int = 2 weight_generator: solution_generator = np.random.random - # TODO: This is in DataParams in MATLAB, but only works for CP problems + # TODO: This is in DataParams in MATLAB, but only works for CP problems so + # feels more reasonable here sparse_generation: Optional[float] = None @dataclass class TuckerProblem(BaseProblem): - """Parameters specifying Tucker Solutions.""" + """Parameters specifying Tucker Solutions. + + Attributes + ---------- + shape: + Tensor shape for generated problem. + factor_generator: + Method to generate factor matrices. + symmetric: + List of modes that should be symmetric. + For instance, `[(1,2), (3,4)]` specifies that + modes 1 and 2 have identical factor matrices, and modes 3 and 4 + also have identical factor matrices. + num_factors: + Number of factors. + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries. + core_generator: + Method to generate the core tensor for ttensor solution. + """ # TODO post_init set to [2, 2, 2] num_factors: Optional[list[int]] = None @@ -61,26 +130,25 @@ def __post_init__(self): self.num_factors = self.num_factors or [2, 2, 2] -@dataclass -class DataParams: - """Parameters to control data quality.""" - - noise: float = 0.10 - - def __post_init__( - self, - ): - if not 0.0 <= self.noise <= 1.0: - raise ValueError(f"Noise must be in [0,1] but got {self.noise}") - - @dataclass class MissingData: - """Parameters to control missing data.""" + """Parameters to control missing data. + + Attributes + ---------- + missing_ratio: + Proportion of missing data. + missing_pattern: + An explicit tensor representing missing data locations. + sparse_model: + Whether to generate sparse rather than dense missing data pattern. + Only useful for large tensors that don't easily fit in memory and + when missing ratio > 0.8.
+ """ missing_ratio: float = 0.0 - sparse_model: bool = False missing_pattern: Optional[Union[ttb.sptensor, ttb.tensor]] = None + sparse_model: bool = False def __post_init__(self): if not 0.0 <= self.missing_ratio <= 1.0: @@ -208,7 +276,7 @@ def _create_missing_data_pattern( @overload def create_problem( - problem_params: CPProblem, missing_params: MissingData, data_params: DataParams + problem_params: CPProblem, missing_params: MissingData ) -> Tuple[ ttb.ktensor, Union[ttb.tensor, ttb.sptensor] ]: ... # pragma: no cover see coveragepy/issues/970 @@ -216,16 +284,47 @@ def create_problem( @overload def create_problem( - problem_params: TuckerProblem, missing_params: MissingData, data_params: DataParams + problem_params: TuckerProblem, + missing_params: MissingData, ) -> Tuple[ttb.ttensor, ttb.tensor]: ... # pragma: no cover see coveragepy/issues/970 def create_problem( problem_params: Union[CPProblem, TuckerProblem], missing_params: MissingData, - data_params: DataParams, ) -> Tuple[Union[ttb.ktensor, ttb.ttensor], Union[ttb.tensor, ttb.sptensor]]: - """Generate a problem and solution.""" + """Generate a problem and solution. + + Arguments + --------- + problem_params: + Parameters related to the problem to generate. + missing_params: + Parameters to control missing data in the generated data/solution. 
+ + Examples + -------- + Base example params + + >>> shape = (5, 4, 3) + + Generate a CP problem + + >>> cp_specific_params = CPProblem(shape=shape, num_factors=3, noise=0.1) + >>> no_missing_data = MissingData() + >>> solution, data = create_problem(cp_specific_params, no_missing_data) + >>> diff = (solution.full() - data).norm() / solution.full().norm() + >>> bool(np.isclose(diff, 0.1)) + True + + Generate Tucker Problem + + >>> tucker_specific_params = TuckerProblem(shape, num_factors=[3, 3, 2], noise=0.1) + >>> solution, data = create_problem(tucker_specific_params, no_missing_data) + >>> diff = (solution.full() - data).norm() / solution.full().norm() + >>> bool(np.isclose(diff, 0.1)) + True + """ if problem_params.symmetric is not None: missing_params.raise_symmetric() @@ -242,12 +341,12 @@ def create_problem( f" sparse generation {CPProblem.__name__}." ) solution = cast(ttb.ktensor, solution) - solution, data = generate_data_sparse(solution, problem_params, data_params) + solution, data = generate_data_sparse(solution, problem_params) elif missing_params.has_missing(): pattern = missing_params.get_pattern(solution.shape) - data = generate_data(solution, problem_params, data_params, pattern) + data = generate_data(solution, problem_params, pattern) else: - data = generate_data(solution, problem_params, data_params) + data = generate_data(solution, problem_params) return solution, data @@ -316,7 +415,6 @@ def generate_solution( def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], problem_params: BaseProblem, - data_params: DataParams, pattern: Optional[ttb.tensor] = None, ) -> ttb.tensor: ... # pragma: no cover see coveragepy/issues/970 @@ -325,7 +423,6 @@ def generate_data( def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], problem_params: BaseProblem, - data_params: DataParams, pattern: ttb.sptensor, ) -> ttb.sptensor: ... 
# pragma: no cover see coveragepy/issues/970 @@ -333,7 +430,6 @@ def generate_data( def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], problem_params: BaseProblem, - data_params: DataParams, pattern: Optional[Union[ttb.tensor, ttb.sptensor]] = None, ) -> Union[ttb.tensor, ttb.sptensor]: """Generate problem data.""" @@ -356,7 +452,7 @@ def generate_data( # TODO Note in MATLAB code to follow up Rdm = Rdm.symmetrize(np.array(problem_params.symmetric)) - D = Z + data_params.noise * Z.norm() * Rdm / Rdm.norm() + D = Z + problem_params.noise * Z.norm() * Rdm / Rdm.norm() # Make sure the final result is definitely symmetric if problem_params.symmetric is not None: D = D.symmetrize(np.array(problem_params.symmetric)) @@ -372,7 +468,8 @@ def prosample(nsamples: int, prob: np.ndarray) -> np.ndarray: def generate_data_sparse( - solution: ttb.ktensor, problem_params: CPProblem, data_params: DataParams + solution: ttb.ktensor, + problem_params: CPProblem, ) -> Tuple[ttb.ktensor, ttb.sptensor]: """Generate sparse CP data from a given solution.""" # Error check on solution diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index d4c202da..9b2637d0 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -5,7 +5,6 @@ from pyttb.create_problem import ( BaseProblem, CPProblem, - DataParams, MissingData, TuckerProblem, create_problem, @@ -15,13 +14,14 @@ class TestDataclasses: - def test_dataparams(self): + def test_problemparams(self): + arbitrary_shape = (2, 2, 2) with pytest.raises(ValueError): number_larger_than_one = 2.0 - DataParams(noise=number_larger_than_one) + BaseProblem(arbitrary_shape, noise=number_larger_than_one) with pytest.raises(ValueError): number_less_than_zero = -2.0 - DataParams(noise=number_less_than_zero) + BaseProblem(arbitrary_shape, noise=number_less_than_zero) def test_missingdata(self): with pytest.raises(ValueError): @@ -58,7 +58,7 @@ def test_generate_data_cp(): shape = (2, 2, 2) cp_params = 
CPProblem(shape) model = generate_solution(cp_params) - data = generate_data(model, cp_params, data_params=DataParams()) + data = generate_data(model, cp_params) assert isinstance(data, ttb.tensor) assert data.shape == model.shape @@ -79,7 +79,7 @@ def test_generate_data_tucker(): shape = (2, 2, 2) tucker_params = TuckerProblem(shape) model = generate_solution(tucker_params) - data = generate_data(model, tucker_params, data_params=DataParams()) + data = generate_data(model, tucker_params) assert isinstance(data, ttb.tensor) assert data.shape == model.shape @@ -87,24 +87,23 @@ def test_generate_data_tucker(): def test_create_problem_smoke(): shape = (2, 2, 2) cp_params = CPProblem(shape) - data_params = DataParams() missing_params = MissingData() - soln, data = create_problem(cp_params, missing_params, data_params) + soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape cp_params.symmetric = [(0, 1)] - soln, data = create_problem(cp_params, missing_params, data_params) + soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape with pytest.raises(ValueError): empty_num_factors = BaseProblem(shape) - create_problem(empty_num_factors, missing_params, data_params) + create_problem(empty_num_factors, missing_params) with pytest.raises(ValueError): inconsistent_num_factors = BaseProblem(shape, num_factors=[2, 2]) - create_problem(inconsistent_num_factors, missing_params, data_params) + create_problem(inconsistent_num_factors, missing_params) with pytest.raises(ValueError): bad_problem_type = BaseProblem(shape, num_factors=3) - create_problem(bad_problem_type, missing_params, data_params) + create_problem(bad_problem_type, missing_params) # TODO hit edge cases and symmetric @@ -114,35 +113,33 @@ def test_create_problem_smoke_sparse(): cp_params = CPProblem( shape, sparse_generation=0.99, factor_generator=np.random.random ) - data_params = DataParams() missing_params = MissingData() - soln, data = 
create_problem(cp_params, missing_params, data_params) + soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape with pytest.raises(ValueError): missing_AND_sparse_generation = MissingData(missing_ratio=0.1) - create_problem(cp_params, missing_AND_sparse_generation, data_params) + create_problem(cp_params, missing_AND_sparse_generation) # TODO hit edge cases and symmetric def test_create_problem_smoke_missing(): shape = (4, 5, 6) cp_params = CPProblem(shape, factor_generator=np.random.random) - data_params = DataParams() missing_params = MissingData(missing_ratio=0.8) - soln, data = create_problem(cp_params, missing_params, data_params) + soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape missing_params = MissingData(missing_ratio=0.8, sparse_model=True) - soln, data = create_problem(cp_params, missing_params, data_params) + soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape with pytest.raises(ValueError): bad_pattern_shape = np.ones([dim + 1 for dim in soln.shape]) missing_params = MissingData(missing_pattern=bad_pattern_shape) - create_problem(cp_params, missing_params, data_params) + create_problem(cp_params, missing_params) with pytest.raises(ValueError): bad_pattern_type = np.ones(soln.shape) missing_params = MissingData(missing_pattern=bad_pattern_type) - create_problem(cp_params, missing_params, data_params) + create_problem(cp_params, missing_params) From 2444a1ad411017a47e3a30e083a9b99db0aef2df Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 28 Jun 2025 10:54:56 -0400 Subject: [PATCH 17/20] Extend some test converage --- tests/test_create_problem.py | 61 ++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index 9b2637d0..ee5e97a5 100644 --- a/tests/test_create_problem.py +++ 
b/tests/test_create_problem.py @@ -9,6 +9,7 @@ TuckerProblem, create_problem, generate_data, + generate_data_sparse, generate_solution, ) @@ -24,6 +25,7 @@ def test_problemparams(self): BaseProblem(arbitrary_shape, noise=number_less_than_zero) def test_missingdata(self): + arbitrary_shape = (2, 2, 2) with pytest.raises(ValueError): number_larger_than_one = 2.0 MissingData(missing_ratio=number_larger_than_one) @@ -31,16 +33,36 @@ def test_missingdata(self): number_less_than_zero = -2.0 MissingData(missing_ratio=number_less_than_zero) + with pytest.raises(ValueError): + number_larger_than_zero = 1.0 + arbitrary_missing_pattern = ttb.tenones(arbitrary_shape) + MissingData( + missing_ratio=number_larger_than_zero, + missing_pattern=arbitrary_missing_pattern, + ) + missing_params = MissingData(missing_ratio=0.1) assert missing_params.has_missing() with pytest.raises(ValueError): missing_params.raise_symmetric() + + missing_params = MissingData(sparse_model=True) with pytest.raises(ValueError): missing_params.raise_symmetric() + missing_params = MissingData() assert not missing_params.has_missing() missing_params.raise_symmetric() + missing_params = MissingData() + assert missing_params.get_pattern(arbitrary_shape) is None + + def test_missingdata_logging(self, caplog): + arbitrary_shape = (2, 2, 2) + missing_params = MissingData(missing_ratio=0.1, sparse_model=True) + missing_params.get_pattern(arbitrary_shape) + assert "missing elements" in caplog.text + def test_generate_solution_cp(): # Smoke test with defaults @@ -71,6 +93,12 @@ def test_generate_solution_tucker(): assert isinstance(model, ttb.ttensor) assert model.shape == shape + # Smoke test with a tensor core generator + shape = (2, 2, 2) + tucker_params = TuckerProblem(shape, core_generator=ttb.tenrand) + model = generate_solution(tucker_params) + assert isinstance(model, ttb.ttensor) + assert model.shape == shape # TODO could test with different generators and enforce that they actually get used @@ -143,3 
+171,36 @@ def test_create_problem_smoke_missing(): bad_pattern_type = np.ones(soln.shape) missing_params = MissingData(missing_pattern=bad_pattern_type) create_problem(cp_params, missing_params) + + +def test_generate_data_sparse_value_errors(): + """Test that generate_data_sparse raises expected ValueErrors.""" + shape = (3, 3, 3) + + # Test negative weights + factor_matrices = [np.random.random((3, 2)) for _ in range(3)] + negative_weights = np.array([-1.0, 1.0]) # One negative weight + solution = ttb.ktensor(factor_matrices, negative_weights) + problem_params = CPProblem(shape, sparse_generation=0.5) + + with pytest.raises(ValueError): + generate_data_sparse(solution, problem_params) + + # Test negative factor matrices + factor_matrices = [np.random.random((3, 2)) for _ in range(3)] + factor_matrices[0][0, 0] = -1.0 # Make one element negative + positive_weights = np.array([1.0, 1.0]) + solution = ttb.ktensor(factor_matrices, positive_weights) + problem_params = CPProblem(shape, sparse_generation=0.5) + + with pytest.raises(ValueError): + generate_data_sparse(solution, problem_params) + + # Test missing sparse_generation + factor_matrices = [np.random.random((3, 2)) for _ in range(3)] + positive_weights = np.array([1.0, 1.0]) + solution = ttb.ktensor(factor_matrices, positive_weights) + problem_params = CPProblem(shape, sparse_generation=None) + + with pytest.raises(ValueError): + generate_data_sparse(solution, problem_params) From 2f10f55963886a747dffaee31b49c5118cca326b Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 28 Jun 2025 12:03:10 -0400 Subject: [PATCH 18/20] Add existing solution support and updated docs --- docs/source/create_problem.rst | 16 ++- .../tutorial/utility_test_problem.ipynb | 68 ++++++++- pyttb/create_problem.py | 130 +++++++++++++++--- tests/test_create_problem.py | 21 +++ 4 files changed, 214 insertions(+), 21 deletions(-) diff --git a/docs/source/create_problem.rst 
b/docs/source/create_problem.rst index 9b6a99dc..211bbb5d 100644 --- a/docs/source/create_problem.rst +++ b/docs/source/create_problem.rst @@ -1,6 +1,8 @@ Create Test Problems (:obj:`pyttb.create_problem`) --------------------------------------------------- +.. autofunction:: pyttb.create_problem.create_problem + .. autoclass:: pyttb.create_problem.BaseProblem :exclude-members: __dict__, __weakref__, __slots__ :show-inheritance: @@ -13,10 +15,20 @@ Create Test Problems (:obj:`pyttb.create_problem`) :exclude-members: __dict__, __weakref__, __slots__ :show-inheritance: +.. autoclass:: pyttb.create_problem.ExistingSolution + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. autoclass:: pyttb.create_problem.ExistingCPSolution + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + +.. autoclass:: pyttb.create_problem.ExistingTuckerSolution + :exclude-members: __dict__, __weakref__, __slots__ + :show-inheritance: + .. autoclass:: pyttb.create_problem.MissingData :members: :special-members: :exclude-members: __dict__, __weakref__, __slots__ :show-inheritance: - -.. autofunction:: pyttb.create_problem.create_problem diff --git a/docs/source/tutorial/utility_test_problem.ipynb b/docs/source/tutorial/utility_test_problem.ipynb index d4fabe8f..92feb77a 100644 --- a/docs/source/tutorial/utility_test_problem.ipynb +++ b/docs/source/tutorial/utility_test_problem.ipynb @@ -31,6 +31,7 @@ "import pyttb as ttb\n", "from pyttb.create_problem import (\n", " CPProblem,\n", + " ExistingCPSolution,\n", " TuckerProblem,\n", " MissingData,\n", " create_problem,\n", @@ -367,9 +368,74 @@ "solution, data = create_problem(cp_specific_params, missing_data_params)\n", "print(data)" ] + }, + { + "cell_type": "markdown", + "id": "31", + "metadata": {}, + "source": [ + "## Creating Sparse Problems (CP only)\n", + "If we assume each model parameter is the input to a Poisson process, then we can generate sparse test problems.
This requires that all the factor matrices and lambda be nonnegative. The default factor generator ('randn') won't work since it produces both positive and negative values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate factor matrices with a few large entries in each column\n", + "# This will be the basis of our solution\n", + "shape = (20, 15, 10)\n", + "num_factors = 4\n", + "A = []\n", + "for n in range(len(shape)):\n", + " A.append(np.random.rand(shape[n], num_factors))\n", + " for r in range(num_factors):\n", + " p = np.random.permutation(np.arange(shape[n]))\n", + " idx = p[1 : round(0.2 * shape[n])]\n", + " A[n][idx, r] *= 10\n", + "S = ttb.ktensor(A)\n", + "# S.normalize(sort=True);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33", + "metadata": {}, + "outputs": [], + "source": [ + "S.normalize(sort=True).weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34", + "metadata": {}, + "outputs": [], + "source": [ + "# Create sparse test problem based on the solution.\n", + "# `sparse_generation` controls how many insertions to make based on the solution.\n", + "# The weight vector of the solution is automatically rescaled to match the number of insertions.\n", + "existing_params = ExistingCPSolution(S, noise=0.0, sparse_generation=500)\n", + "print(f\"{S.weights=}\")\n", + "solution, data = create_problem(existing_params)\n", + "print(\n", + " f\"num_nozeros: {data.nnz}\\n\"\n", + " f\"total_insertions: {np.sum(data.vals)}\\n\"\n", + " f\"original weights vs rescaled: {S.weights / solution.weights}\"\n", + ")" + ] } ], - "metadata": {}, + "metadata": { + "language_info": { + "name": "python" + } + }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/pyttb/create_problem.py b/pyttb/create_problem.py index f89cde6d..e10c8bf7 100644 --- a/pyttb/create_problem.py +++ b/pyttb/create_problem.py @@ -81,11 +81,6 @@ 
class CPProblem(BaseProblem): If data is sparse noise is only added to nonzero entries. weight_generator: Method to generate weights for ktensor solution. - sparse_generation: - Generate a sparse tensor that can be scaled so that the - column factors and weights are stochastic. Provide a number - of nonzeros to be inserted. A value in range [0,1) will be - interpreted as a ratio. """ # NOTE inherited attributes are manually copy pasted, keep aligned between problems @@ -130,6 +125,71 @@ def __post_init__(self): self.num_factors = self.num_factors or [2, 2, 2] +@dataclass +class ExistingSolution: + """Parameters for using an existing tensor solution. + + Attributes + ---------- + solution: + Pre-existing tensor solution (ktensor or ttensor). + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries. + """ + + solution: Union[ttb.ktensor, ttb.ttensor] + noise: float = 0.10 + + def __post_init__(self): + if not 0.0 <= self.noise <= 1.0: + raise ValueError(f"Noise must be in [0,1] but got {self.noise}") + + @property + def symmetric(self) -> None: + """Get the symmetric modes from the solution.""" + # ExistingSolution doesn't support symmetry constraints + return None + + +@dataclass +class ExistingTuckerSolution(ExistingSolution): + """Parameters for using an existing Tucker tensor solution. + + Attributes + ---------- + solution: + Pre-existing ttensor solution. + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries. + """ + + solution: ttb.ttensor + + +@dataclass +class ExistingCPSolution(ExistingSolution): + """Parameters for using an existing CP tensor solution. + + Attributes + ---------- + solution: + Pre-existing ktensor solution. + noise: + Amount of Gaussian noise to add to solution. + If data is sparse noise is only added to nonzero entries.
+ sparse_generation: + Generate a sparse tensor that can be scaled so that the + column factors and weights are stochastic. Provide a number + of nonzeros to be inserted. A value in range [0,1) will be + interpreted as a ratio. + """ + + solution: ttb.ktensor + sparse_generation: Optional[float] = None + + @dataclass class MissingData: """Parameters to control missing data. @@ -276,7 +336,7 @@ def _create_missing_data_pattern( @overload def create_problem( - problem_params: CPProblem, missing_params: MissingData + problem_params: CPProblem, missing_params: Optional[MissingData] = None ) -> Tuple[ ttb.ktensor, Union[ttb.tensor, ttb.sptensor] ]: ... # pragma: no cover see coveragepy/issues/970 @@ -285,20 +345,29 @@ def create_problem( @overload def create_problem( problem_params: TuckerProblem, - missing_params: MissingData, + missing_params: Optional[MissingData] = None, ) -> Tuple[ttb.ttensor, ttb.tensor]: ... # pragma: no cover see coveragepy/issues/970 +@overload +def create_problem( + problem_params: ExistingSolution, + missing_params: Optional[MissingData] = None, +) -> Tuple[ + Union[ttb.ktensor, ttb.ttensor], Union[ttb.tensor, ttb.sptensor] +]: ... # pragma: no cover see coveragepy/issues/970 + + def create_problem( - problem_params: Union[CPProblem, TuckerProblem], - missing_params: MissingData, + problem_params: Union[CPProblem, TuckerProblem, ExistingSolution], + missing_params: Optional[MissingData] = None, ) -> Tuple[Union[ttb.ktensor, ttb.ttensor], Union[ttb.tensor, ttb.sptensor]]: """Generate a problem and solution. Arguments --------- problem_params: - Parameters related to the problem to generate. + Parameters related to the problem to generate, or an existing solution. missing_params: Parameters to control missing data in the generated data/solution. 
@@ -324,7 +393,19 @@ def create_problem( >>> diff = (solution.full() - data).norm() / solution.full().norm() >>> bool(np.isclose(diff, 0.1)) True + + Use existing solution + + >>> factor_matrices = [np.random.random((dim, 3)) for dim in shape] + >>> weights = np.random.random(3) + >>> existing_ktensor = ttb.ktensor(factor_matrices, weights) + >>> existing_params = ExistingSolution(existing_ktensor, noise=0.1) + >>> solution, data = create_problem(existing_params, no_missing_data) + >>> assert solution is existing_ktensor """ + if missing_params is None: + missing_params = MissingData() + if problem_params.symmetric is not None: missing_params.raise_symmetric() @@ -332,7 +413,7 @@ def create_problem( data: Union[ttb.tensor, ttb.sptensor] if ( - isinstance(problem_params, CPProblem) + isinstance(problem_params, (CPProblem, ExistingCPSolution)) and problem_params.sparse_generation is not None ): if missing_params.has_missing(): @@ -391,10 +472,18 @@ def generate_solution( ) -> ttb.ktensor: ... +@overload +def generate_solution( + problem_params: ExistingSolution, +) -> Union[ttb.ktensor, ttb.ttensor]: ... + + def generate_solution( - problem_params: Union[CPProblem, TuckerProblem], + problem_params: Union[CPProblem, TuckerProblem, ExistingSolution], ) -> Union[ttb.ktensor, ttb.ttensor]: """Generate problem solution.""" + if isinstance(problem_params, ExistingSolution): + return problem_params.solution factor_matrices = generate_solution_factors(problem_params) # Create final model if isinstance(problem_params, TuckerProblem): @@ -414,7 +503,7 @@ def generate_solution( @overload def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], - problem_params: BaseProblem, + problem_params: Union[BaseProblem, ExistingSolution], pattern: Optional[ttb.tensor] = None, ) -> ttb.tensor: ... 
# pragma: no cover see coveragepy/issues/970 @@ -422,14 +511,14 @@ def generate_data( @overload def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], - problem_params: BaseProblem, + problem_params: Union[BaseProblem, ExistingSolution], pattern: ttb.sptensor, ) -> ttb.sptensor: ... # pragma: no cover see coveragepy/issues/970 def generate_data( solution: Union[ttb.ktensor, ttb.ttensor], - problem_params: BaseProblem, + problem_params: Union[BaseProblem, ExistingSolution], pattern: Optional[Union[ttb.tensor, ttb.sptensor]] = None, ) -> Union[ttb.tensor, ttb.sptensor]: """Generate problem data.""" @@ -469,7 +558,7 @@ def prosample(nsamples: int, prob: np.ndarray) -> np.ndarray: def generate_data_sparse( solution: ttb.ktensor, - problem_params: CPProblem, + problem_params: Union[CPProblem, ExistingCPSolution], ) -> Tuple[ttb.ktensor, ttb.sptensor]: """Generate sparse CP data from a given solution.""" # Error check on solution @@ -483,7 +572,8 @@ def generate_data_sparse( raise ValueError("Cannot generate sparse data without sparse_generation set.") # Convert solution to probability tensor - P = solution.normalize(mode=0) + # NOTE: Make copy since normalize modifies in place + P = solution.copy().normalize(mode=0) eta = np.sum(P.weights) P.weights /= eta @@ -512,7 +602,7 @@ def generate_data_sparse( allsubs = np.vstack(subs) # Assemble final tensor. Note that duplicates are summed. # TODO should we have sptenones for purposes like this? 
- Z = ttb.sptensor( + Z = ttb.sptensor.from_aggregator( allsubs, np.ones( (len(allsubs), 1), @@ -522,6 +612,10 @@ def generate_data_sparse( # Rescale S so that it is proportional to the number of edges inserted solution = P + # raise ValueError( + # f"{nedges=}" + # f"{solution.weights=}" + # ) solution.weights *= nedges # TODO no noise introduced in this special case in MATLAB diff --git a/tests/test_create_problem.py b/tests/test_create_problem.py index ee5e97a5..3e02cf97 100644 --- a/tests/test_create_problem.py +++ b/tests/test_create_problem.py @@ -5,6 +5,7 @@ from pyttb.create_problem import ( BaseProblem, CPProblem, + ExistingSolution, MissingData, TuckerProblem, create_problem, @@ -63,6 +64,20 @@ def test_missingdata_logging(self, caplog): missing_params.get_pattern(arbitrary_shape) assert "missing elements" in caplog.text + def test_existing_solution(self, sample_ktensor_2way): + solution = sample_ktensor_2way + existing_solution = ExistingSolution(solution) + assert existing_solution.solution is solution + assert existing_solution.noise == 0.1 + + with pytest.raises(ValueError): + value_less_than_zero = -0.1 + ExistingSolution(solution, noise=value_less_than_zero) + + with pytest.raises(ValueError): + value_greater_than_one = 1.1 + ExistingSolution(solution, noise=value_greater_than_one) + def test_generate_solution_cp(): # Smoke test with defaults @@ -119,6 +134,12 @@ def test_create_problem_smoke(): soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape + existing_params = ExistingSolution(soln) + missing_params = MissingData() + soln, data = create_problem(existing_params, missing_params) + assert soln.full().shape == data.shape + assert soln is existing_params.solution, "Solution should be the same object" + cp_params.symmetric = [(0, 1)] soln, data = create_problem(cp_params, missing_params) assert soln.full().shape == data.shape From 9d31b15c9e216fae7af349d21fae802e200ff7c1 Mon Sep 17 00:00:00 2001 From: Nick 
Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 28 Jun 2025 12:14:32 -0400 Subject: [PATCH 19/20] Fix nbstripout --- docs/source/tutorial/utility_test_problem.ipynb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/tutorial/utility_test_problem.ipynb b/docs/source/tutorial/utility_test_problem.ipynb index 92feb77a..583f3b8e 100644 --- a/docs/source/tutorial/utility_test_problem.ipynb +++ b/docs/source/tutorial/utility_test_problem.ipynb @@ -431,11 +431,7 @@ ] } ], - "metadata": { - "language_info": { - "name": "python" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 5 } From f7ef55a3ac116af1f836eb24907557b6fcd229f6 Mon Sep 17 00:00:00 2001 From: Nick Johnson <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 28 Jun 2025 12:29:32 -0400 Subject: [PATCH 20/20] Update mypy to grab PR fixing 3.13 dataclass error: https://github.com/python/mypy/pull/18464 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 41261b93..ec0e7712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ documentation = "https://pyttb.readthedocs.io" [project.optional-dependencies] dev = [ - "mypy>=1.10,<1.14.0", + "mypy>=1.15,<1.16.0", # Also in pre-commit "nbstripout>=0.8,<0.9", "pytest>8.0",