|
26 | 26 | def gromov_wasserstein(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None,
|
27 | 27 | max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
|
28 | 28 | r"""
|
29 |
| - Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` |
| 29 | + Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`. |
30 | 30 |
|
31 |
| - The function solves the following optimization problem: |
| 31 | + The function solves the following optimization problem using Conditional Gradient: |
32 | 32 |
|
33 | 33 | .. math::
|
34 | 34 | \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
|
@@ -182,9 +182,10 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
|
182 | 182 | def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None,
|
183 | 183 | max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
|
184 | 184 | r"""
|
185 |
| - Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` |
| 185 | + Returns the Gromov-Wasserstein loss :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`. |
| 186 | + To recover the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`. |
186 | 187 |
|
187 |
| - The function solves the following optimization problem: |
| 188 | + The function solves the following optimization problem using Conditional Gradient: |
188 | 189 |
|
189 | 190 | .. math::
|
190 | 191 | \mathbf{GW} = \min_\mathbf{T} \quad \sum_{i,j,k,l}
|
@@ -308,31 +309,36 @@ def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetri
|
308 | 309 | def fused_gromov_wasserstein(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5,
|
309 | 310 | armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
|
310 | 311 | r"""
|
311 |
| - Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`) |
| 312 | + Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` |
| 313 | + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}` (see :ref:`[24] <references-fused-gromov-wasserstein>`). |
| 314 | +
|
| 315 | + The function solves the following optimization problem using Conditional Gradient: |
312 | 316 |
|
313 | 317 | .. math::
|
314 |
| - \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + |
| 318 | + \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + |
315 | 319 | \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
|
316 | 320 |
|
317 | 321 | s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
|
318 | 322 |
|
319 | 323 | \mathbf{T}^T \mathbf{1} &= \mathbf{q}
|
320 | 324 |
|
321 | 325 | \mathbf{T} &\geq 0
|
| 326 | + Where : |
322 | 327 |
|
323 |
| - where : |
324 |
| -
|
325 |
| - - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix |
326 |
| - - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1) |
327 |
| - - `L` is a loss function to account for the misfit between the similarity matrices |
| 328 | + - :math:`\mathbf{M}`: metric cost matrix between features across domains |
| 329 | + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space |
| 330 | + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space |
| 331 | + - :math:`\mathbf{p}`: distribution in the source space |
| 332 | + - :math:`\mathbf{q}`: distribution in the target space |
| 333 | + - `L`: loss function to account for the misfit between the similarity and feature matrices |
| 334 | + - :math:`\alpha`: trade-off parameter |
328 | 335 |
|
329 | 336 | .. note:: This function is backend-compatible and will work on arrays
|
330 | 337 | from all compatible backends. But the algorithm uses the C++ CPU backend
|
331 | 338 | which can lead to copy overhead on GPU arrays.
|
332 | 339 | .. note:: All computations in the conjugate gradient solver are done with
|
333 | 340 | numpy to limit memory overhead.
|
334 | 341 |
|
335 |
| - The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>` |
336 | 342 |
|
337 | 343 | Parameters
|
338 | 344 | ----------
|
@@ -465,36 +471,39 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
|
465 | 471 | def fused_gromov_wasserstein2(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5,
|
466 | 472 | armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
|
467 | 473 | r"""
|
468 |
| - Computes the FGW distance between two graphs see (see :ref:`[24] <references-fused-gromov-wasserstein2>`) |
| 474 | + Returns the Fused Gromov-Wasserstein distance between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` |
| 475 | + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}` (see :ref:`[24] <references-fused-gromov-wasserstein2>`). |
469 | 476 |
|
470 |
| - .. math:: |
471 |
| - \mathbf{GW} = \min_\mathbf{T} \quad (1 - \alpha) \langle \mathbf(T), \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} |
472 |
| - L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} |
| 477 | + The function solves the following optimization problem using Conditional Gradient: |
473 | 478 |
|
474 |
| - s.t. \ \mathbf(T)\mathbf{1} &= \mathbf{p} |
| 479 | + .. math:: |
| 480 | + \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + |
| 481 | + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} |
475 | 482 |
|
476 |
| - \mathbf(T)^T \mathbf{1} &= \mathbf{q} |
| 483 | + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} |
477 | 484 |
|
478 |
| - \mathbf(T) &\geq 0 |
| 485 | + \mathbf{T}^T \mathbf{1} &= \mathbf{q} |
479 | 486 |
|
480 |
| - where : |
| 487 | + \mathbf{T} &\geq 0 |
| 488 | + Where : |
481 | 489 |
|
482 |
| - - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix |
483 |
| - - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1) |
484 |
| - - `L` is a loss function to account for the misfit between the similarity matrices |
| 490 | + - :math:`\mathbf{M}`: metric cost matrix between features across domains |
| 491 | + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space |
| 492 | + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space |
| 493 | + - :math:`\mathbf{p}`: distribution in the source space |
| 494 | + - :math:`\mathbf{q}`: distribution in the target space |
| 495 | + - `L`: loss function to account for the misfit between the similarity and feature matrices |
| 496 | + - :math:`\alpha`: trade-off parameter |
485 | 497 |
|
486 |
| - The algorithm used for solving the problem is conditional gradient as |
487 |
| - discussed in :ref:`[24] <references-fused-gromov-wasserstein2>` |
| 498 | + Note that when using backends, this loss function is differentiable wrt the |
| 499 | + matrices (C1, C2, M) and weights (p, q) for quadratic loss using the gradients from [38]_. |
488 | 500 |
|
489 | 501 | .. note:: This function is backend-compatible and will work on arrays
|
490 | 502 | from all compatible backends. But the algorithm uses the C++ CPU backend
|
491 | 503 | which can lead to copy overhead on GPU arrays.
|
492 | 504 | .. note:: All computations in the conjugate gradient solver are done with
|
493 | 505 | numpy to limit memory overhead.
|
494 | 506 |
|
495 |
| - Note that when using backends, this loss function is differentiable wrt the |
496 |
| - matrices (C1, C2, M) and weights (p, q) for quadratic loss using the gradients from [38]_. |
497 |
| -
|
498 | 507 | Parameters
|
499 | 508 | ----------
|
500 | 509 | M : array-like, shape (ns, nt)
|
@@ -668,13 +677,13 @@ def gromov_barycenters(
|
668 | 677 | max_iter=1000, tol=1e-9, warmstartT=False, verbose=False, log=False,
|
669 | 678 | init_C=None, random_state=None, **kwargs):
|
670 | 679 | r"""
|
671 |
| - Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` |
| 680 | + Returns the Gromov-Wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` |
672 | 681 |
|
673 | 682 | The function solves the following optimization problem with block coordinate descent:
|
674 | 683 |
|
675 | 684 | .. math::
|
676 | 685 |
|
677 |
| - \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) |
| 686 | + \mathbf{C}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) |
678 | 687 |
|
679 | 688 | Where :
|
680 | 689 |
|
@@ -812,7 +821,21 @@ def fgw_barycenters(
|
812 | 821 | fixed_features=False, p=None, loss_fun='square_loss', armijo=False,
|
813 | 822 | symmetric=True, max_iter=100, tol=1e-9, warmstartT=False, verbose=False,
|
814 | 823 | log=False, init_C=None, init_X=None, random_state=None, **kwargs):
|
815 |
| - r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>` |
| 824 | + r""" |
| 825 | + Returns the Fused Gromov-Wasserstein barycenters of `S` measured networks with node features :math:`(\mathbf{C}_s, \mathbf{Y}_s, \mathbf{p}_s)_{1 \leq s \leq S}` |
| 826 | + (see eq (5) in :ref:`[24] <references-fgw-barycenters>`), estimated using Fused Gromov-Wasserstein transports from Conditional Gradient solvers. |
| 827 | +
|
| 828 | + The function solves the following optimization problem: |
| 829 | +
|
| 830 | + .. math:: |
| 831 | +
|
| 832 | + \mathbf{C}^*, \mathbf{Y}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}, \mathbf{Y}\in \mathbb{R}^{N \times d}} \quad \sum_s \lambda_s \mathrm{FGW}_{\alpha}(\mathbf{C}, \mathbf{C}_s, \mathbf{Y}, \mathbf{Y}_s, \mathbf{p}, \mathbf{p}_s) |
| 833 | +
|
| 834 | + Where : |
| 835 | +
|
| 836 | + - :math:`\mathbf{Y}_s`: feature matrix |
| 837 | + - :math:`\mathbf{C}_s`: metric cost matrix |
| 838 | + - :math:`\mathbf{p}_s`: distribution |
816 | 839 |
|
817 | 840 | Parameters
|
818 | 841 | ----------
|
|
0 commit comments