Add discrete variational autoencoder (#8)

VolodyaCO · Vladimir Vargas Calderón · thisac · web-flow · commit 452120fc5497 · 2025-08-01T16:17:26.000-07:00
* [feature] autoencoder architecture with gumbel-trick model

* autoencoder architecture

* dwave-hybrid is needed for the rbm to be imported

* encoder and decoder architectures. main training loop from scratch

* typing fixes

* spins -> discrete renaming

* kl divergence

* updated requirements for examples to work

* when encoding data into spin strings, an arbitrary number of spin strings (n_samples) per data point is now allowed

* decoder is now aware of many spin strings per datapoint

* annealing learning rate

* fixing shapes for generating images

* fixing shapes into and out of decoder when generating images

* improved docstrings

* remove mmd loss

* Deleting example but keeping it locally

* improved docstrings

* renaming file. consistently using DiscreteAutoEncoder. citing paper. docstrings added

* importing from all

* licence header

* test autoencoder

* type hint fixed

* docstrings improvement

* using functional BCE with logits definition

* docstrings improvement

* cited DVAE paper

* updated previous GRBM to new one

* Remove example requirements

* Remove unrequired arg in objective call

* Add unittests

* Rename DVAE and remove numpy from tests

* Remove duplicate files include author in filename

* Address minor PR review comments

* Apply suggestions from code review

Co-authored-by: Theodor Isacsson <theodor@isacsson.ca>

* Apply suggestions from code review

---------

Co-authored-by: Vladimir Vargas Calderón <vvargasc@dwavesys.com>
Co-authored-by: Theodor Isacsson <theodor@isacsson.ca>
Co-authored-by: kchern <kchern@dwavesys.com>
Co-authored-by: Kevin Chern <32395608+kevinchern@users.noreply.github.com>
diff --git a/dwave/plugins/torch/models/__init__.py b/dwave/plugins/torch/models/__init__.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 #
 
+from dwave.plugins.torch.models.discrete_variational_autoencoder import *
 from dwave.plugins.torch.models.boltzmann_machine import *
diff --git a/dwave/plugins/torch/models/boltzmann_machine.py b/dwave/plugins/torch/models/boltzmann_machine.py
@@ -133,15 +133,21 @@ def _setup_hidden(self):
         """Preprocess some indexes to enable vectorized computation of effective fields of hidden
         units."""
         self._connected_hidden = any(
-            a in self.hidden_nodes and b in self.hidden_nodes for a, b in self.edges)
+            a in self.hidden_nodes and b in self.hidden_nodes for a, b in self.edges
+        )
         if self._connected_hidden:
-            err_message = "Current implementation does not support intrahidden-unit connections."
+            err_message = (
+                "Current implementation does not support intrahidden-unit connections."
+            )
             raise NotImplementedError(err_message)
 
-        visible_idx = torch.tensor([self._node_to_idx[v]
-                                    for v in self._nodes if v not in self.hidden_nodes], dtype=int)
+        visible_idx = torch.tensor(
+            [self._node_to_idx[v] for v in self._nodes if v not in self.hidden_nodes],
+            dtype=int,
+        )
         hidden_idx = torch.tensor(
-            [i for i in torch.arange(self._n_nodes) if i not in visible_idx], dtype=int)
+            [i for i in torch.arange(self._n_nodes) if i not in visible_idx], dtype=int
+        )
         self.register_buffer("_visible_idx", visible_idx)
         self.register_buffer("_hidden_idx", hidden_idx)
 
@@ -317,7 +323,8 @@ def sample(
         return sample_set
 
     def sampleset_to_tensor(
-            self, sample_set: SampleSet, device: Optional[torch.device] = None) -> torch.Tensor:
+        self, sample_set: SampleSet, device: Optional[torch.device] = None
+    ) -> torch.Tensor:
         """Converts a ``dimod.SampleSet`` to a ``torch.Tensor`` using the node order of the class.
 
         Args:
@@ -342,7 +349,7 @@ def quasi_objective(
         linear_range: Optional[tuple[float, float]] = None,
         quadratic_range: Optional[tuple[float, float]] = None,
         sampler: Optional[Sampler] = None,
-        sample_kwargs: Optional[dict] = None
+        sample_kwargs: Optional[dict] = None,
     ) -> torch.Tensor:
         """A quasi-objective function with gradients equivalent to the gradients of the
         negative log likelihood.
@@ -432,20 +439,20 @@ def _compute_effective_field(self, padded: torch.Tensor) -> torch.Tensor:
         contribution = padded[:, self._flat_adj] * self._quadratic[self._flat_j_idx]
         cumulative_contribution = contribution.cumsum(1)
         # Don't forget to add the linear fields!
-        h_eff = self._linear[self.hidden_idx] + cumulative_contribution[:, self._bin_idx].diff(
-            dim=1, prepend=torch.zeros(bs, device=padded.device).unsqueeze(1)
-        )
+        h_eff = self._linear[self.hidden_idx] + cumulative_contribution[
+            :, self._bin_idx
+        ].diff(dim=1, prepend=torch.zeros(bs, device=padded.device).unsqueeze(1))
 
         return h_eff
 
     def _approximate_expectation_sampling(
-            self,
-            obs: torch.Tensor,
-            sampler: Sampler,
-            prefactor: float,
-            linear_range: Optional[tuple[float, float]] = None,
-            quadratic_range: Optional[tuple[float, float]] = None,
-            sample_kwargs: Optional[dict] = None
+        self,
+        obs: torch.Tensor,
+        sampler: Sampler,
+        prefactor: float,
+        linear_range: Optional[tuple[float, float]] = None,
+        quadratic_range: Optional[tuple[float, float]] = None,
+        sample_kwargs: Optional[dict] = None,
     ) -> torch.Tensor:
         """Approximate expectation of hidden units via sampling.
 
@@ -471,8 +478,11 @@ def _approximate_expectation_sampling(
         """
         # Create the BQM and remove visible units
         bqm = BinaryQuadraticModel.from_ising(
-            *self.to_ising(prefactor, linear_range, quadratic_range))
-        bqm.remove_variables_from([self.idx_to_node[vidx] for vidx in self.visible_idx.tolist()])
+            *self.to_ising(prefactor, linear_range, quadratic_range)
+        )
+        bqm.remove_variables_from(
+            [self.idx_to_node[vidx] for vidx in self.visible_idx.tolist()]
+        )
 
         # Compute the effective fields for hidden units
         padded = self._pad(obs)
@@ -524,8 +534,10 @@ def _compute_expectation_disconnected(self, obs: torch.Tensor) -> torch.Tensor:
             variables in the model, i.e., number of hidden and visible units.
         """
         if self._connected_hidden:
-            err_msg = ("`_compute_expectation_disconnected` is not applicable when edges exist "
-                       "between hidden units.")
+            err_msg = (
+                "`_compute_expectation_disconnected` is not applicable when edges exist "
+                "between hidden units."
+            )
             raise ValueError(err_msg)
         m = self._pad(obs)
         h_eff = self._compute_effective_field(m)
@@ -592,8 +604,12 @@ def sufficient_statistics(self, x: torch.Tensor) -> torch.Tensor:
         interactions = self.interactions(x)
         return torch.cat([x, interactions], 1)
 
-    def to_ising(self, prefactor: float, linear_range: Optional[tuple[float, float]] = None,
-                 quadratic_range: Optional[tuple[float, float]] = None) -> tuple[dict, dict]:
+    def to_ising(
+        self,
+        prefactor: float,
+        linear_range: Optional[tuple[float, float]] = None,
+        quadratic_range: Optional[tuple[float, float]] = None,
+    ) -> tuple[dict, dict]:
         """Convert the model to Ising format.
 
         Convert the model to Ising format with scaling (``prefactor``) followed by clipping (if
diff --git a/dwave/plugins/torch/models/discrete_variational_autoencoder.py b/dwave/plugins/torch/models/discrete_variational_autoencoder.py
@@ -0,0 +1,148 @@
+# Copyright 2025 D-Wave
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The use of the discrete autoencoder implementations below (including the
+# DiscreteVariationalAutoencoder) with a quantum computing system is
+# protected by the intellectual property rights of D-Wave Quantum Inc.
+# and its affiliates.
+#
+# The use of the discrete autoencoder implementations below (including the
+# DiscreteVariationalAutoencoder) with D-Wave's quantum computing
+# system will require access to D-Wave’s LeapTM quantum cloud service and
+# will be governed by the Leap Cloud Subscription Agreement available at:
+# https://cloud.dwavesys.com/leap/legal/cloud_subscription_agreement/
+#
+
+from collections.abc import Callable
+from typing import Optional
+
+import torch
+
+__all__ = ["DiscreteVariationalAutoencoder"]
+
+
+class DiscreteVariationalAutoencoder(torch.nn.Module):
+    """Discrete variational autoencoder amenable for training discrete models as priors.
+    See https://iopscience.iop.org/article/10.1088/2632-2153/aba220
+
+    Such discrete models include spin-variable models amenable for the QPU. This
+    architecture is a modification of the standard autoencoder architecture, where
+    the encoder outputs a latent representation of the data, and the decoder
+    reconstructs the data from the latent representation. In our case, there is an
+    additional step where the latent representation is mapped to a discrete
+    representation, which is then passed to the decoder.
+
+    Args:
+        encoder (torch.nn.Module): The encoder must output latents that are later on
+            passed to ``latent_to_discrete``. An encoder has signature (x) -> l. x has
+            shape (batch_size, f1, f2, ...) and l has shape (batch_size, l1, l2, ...).
+        decoder (torch.nn.Module): Decodes discrete tensors into data tensors. A decoder
+            has signature (d) -> x'. d has shape (batch_size, n, d1, d2, ...) and x' has
+            shape (batch_size, f'1, f'2, ...); if x' is the reconstructed data then
+            fi=f'i, but x' might be another representation of the data (e.g. in a
+            text-to-image model, x is a sequence of tokens, and x' is an image). Note
+            that the decoder input is of shape (batch_size, n, d1, d2, ...), where n is
+            a number of discrete representations to be created from a single latent
+            representation of a single initial data point.
+        latent_to_discrete (Callable[[torch.Tensor, int], torch.Tensor] | None): A
+            stochastic and differentiable function that maps the output of the encoder
+            to a discrete representation (a function is deterministic by definition;
+            here "stochastic" means the function implicitly takes additional noise
+            variables as input). Importantly, since the function is stochastic, it
+            allows for the creation of multiple discrete representations from the latent
+            representation of a single data point. Thus, the signature of this function
+            is (l, n) -> d, where l is the output of the encoder and has shape
+            (batch_size, l1, l2, ...), n is the number of discrete representations per
+            data point, and d has shape (batch_size, n, d1, d2, ...), which will be the
+            input to the decoder. If None, a Gumbel-softmax-based map that outputs spin
+            (-1/+1) variables is used. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        encoder: torch.nn.Module,
+        decoder: torch.nn.Module,
+        latent_to_discrete: Optional[Callable[[torch.Tensor, int], torch.Tensor]] = None,
+    ):
+        super().__init__()
+        self._encoder = encoder
+        self._decoder = decoder
+        if latent_to_discrete is None:
+
+            def latent_to_discrete(
+                logits: torch.Tensor, n_samples: int
+            ) -> torch.Tensor:
+                # Logits is of shape (batch_size, n_discrete), we assume these logits
+                # refer to the probability of each discrete variable being 1. To use the
+                # gumbel softmax function we need to reshape the logits to (batch_size,
+                # n_discrete, 1), and then stack the logits to a zeros tensor of the
+                # same shape. This is done to ensure that the gumbel softmax function
+                # works correctly.
+
+                logits = logits.unsqueeze(-1)
+                logits = torch.cat((logits, torch.zeros_like(logits)), dim=-1)
+                # We now create a new leading dimension and repeat the logits n_samples
+                # times:
+                logits = logits.unsqueeze(1).repeat(1, n_samples, 1, 1)
+                one_hots = torch.nn.functional.gumbel_softmax(
+                    logits, tau=1 / 7, hard=True
+                )
+                # The constant 1/7 is used because it was used in
+                # https://iopscience.iop.org/article/10.1088/2632-2153/aba220
+
+                # one_hots is of shape (batch_size, n_samples, n_discrete, 2). Take the
+                # first element of the last dimension (the encoder-logit channel) and
+                # map it from {0, 1} to spins {-1, +1} for compatibility with QPU models.
+                return one_hots[..., 0] * 2 - 1
+
+        self._latent_to_discrete = latent_to_discrete
+
+    @property
+    def encoder(self):
+        """Encoder network that maps input data to latents (see ``latent_to_discrete``)."""
+        return self._encoder
+
+    @property
+    def decoder(self):
+        """Decoder network that maps discrete representations to (reconstructed) data."""
+        return self._decoder
+
+    @property
+    def latent_to_discrete(self):
+        """Function that maps the output of the encoder to a discrete representation."""
+        return self._latent_to_discrete
+
+    def forward(
+        self, x: torch.Tensor, n_samples: int = 1
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Encodes ``x``, discretizes the resulting latents and decodes the discretes.
+
+        Args:
+            x (torch.Tensor): Input data of shape (batch_size, ...).
+            n_samples (int, optional): Since the ``latent_to_discrete`` map is, in
+                general, stochastic (see :class:`DiscreteVariationalAutoencoder` for more on this),
+                several different discrete samples can be obtained by applying this map
+                to the same encoded data point. This argument specifies how many such
+                samples are obtained. Defaults to 1.
+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: In this order: the latents
+            (the encoder output, e.g. logits) of shape (batch_size, ...), the discrete
+            representation(s) of the encoded data of shape (batch_size, n_samples, ...),
+            and the decoder output (reconstructed data) of shape (batch_size, n_samples, ...).
+        """
+        latents = self.encoder(x)
+        discretes = self.latent_to_discrete(latents, n_samples)
+        xhat = self.decoder(discretes)
+        return latents, discretes, xhat
diff --git a/dwave/plugins/torch/models/losses/__init__.py b/dwave/plugins/torch/models/losses/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2025 D-Wave
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dwave.plugins.torch.models.losses.kl_divergence import *
diff --git a/dwave/plugins/torch/models/losses/kl_divergence.py b/dwave/plugins/torch/models/losses/kl_divergence.py
@@ -0,0 +1,80 @@
+# Copyright 2025 D-Wave
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from dimod import Sampler
+
+from dwave.plugins.torch.models.boltzmann_machine import GraphRestrictedBoltzmannMachine
+
+__all__ = ["pseudo_kl_divergence_loss"]
+
+
+def pseudo_kl_divergence_loss(
+    spins: torch.Tensor,
+    logits: torch.Tensor,
+    boltzmann_machine: GraphRestrictedBoltzmannMachine,
+    sampler: Sampler,
+    sample_kwargs: dict,
+    prefactor: Optional[float] = None,
+    linear_range: Optional[tuple[float, float]] = None,
+    quadratic_range: Optional[tuple[float, float]] = None,
+):
+    """A pseudo Kullback-Leibler divergence loss function for a discrete autoencoder with a
+    Boltzmann machine prior.
+
+    This is not the true KL divergence, but the gradient of this function is the same as
+    the KL divergence gradient. See https://arxiv.org/abs/1609.02200 for more details.
+
+    Args:
+        spins (torch.Tensor): A tensor of spins of shape (batch_size, n_spins) or shape
+            (batch_size, n_samples, n_spins) obtained from a stochastic function that
+            maps the output of the encoder (logit representation) to a spin
+            representation.
+        logits (torch.Tensor): A tensor of logits of shape (batch_size, n_spins). These
+            logits are the raw output of the encoder.
+        boltzmann_machine (GraphRestrictedBoltzmannMachine): An instance of a Boltzmann
+            machine.
+        sampler (Sampler): A sampler used for generating samples from the Boltzmann machine.
+        sample_kwargs (dict): Additional keyword arguments for the ``sampler.sample``
+            method; forwarded to ``boltzmann_machine.sample`` as ``sample_params``.
+        prefactor (float, optional): A scaling applied to the Hamiltonian weights
+            (linear and quadratic weights). When None, a prefactor of 1.0 is used, i.e.
+            no scaling is applied. Defaults to None.
+        linear_range (tuple[float, float], optional): Linear weights are clipped to
+            ``linear_range`` prior to sampling. This clipping occurs after the
+            ``prefactor`` scaling has been applied. When None, no clipping is applied.
+            Defaults to None.
+        quadratic_range (tuple[float, float], optional): Quadratic weights are clipped
+            to ``quadratic_range`` prior to sampling. This clipping occurs after the
+            ``prefactor`` scaling has been applied. When None, no clipping is applied.
+            Defaults to None.
+
+    Returns:
+        torch.Tensor: The pseudo KL divergence loss (quasi-objective minus entropy term).
+    """
+    samples = boltzmann_machine.sample(
+        sampler=sampler,
+        device=spins.device,
+        prefactor=prefactor if prefactor is not None else 1.0,
+        linear_range=linear_range,
+        quadratic_range=quadratic_range,
+        sample_params=sample_kwargs,
+    )
+    probabilities = torch.sigmoid(logits)
+    entropy = torch.nn.functional.binary_cross_entropy_with_logits(logits, probabilities)
+    cross_entropy = boltzmann_machine.quasi_objective(spins, samples)
+    pseudo_kl_divergence = cross_entropy - entropy
+    return pseudo_kl_divergence
diff --git a/dwave/plugins/torch/utils.py b/dwave/plugins/torch/utils.py
@@ -22,7 +22,8 @@
 
 
 def sampleset_to_tensor(
-        ordered_vars: list, sample_set: SampleSet, device: torch.device = None) -> torch.Tensor:
+    ordered_vars: list, sample_set: SampleSet, device: Optional[torch.device] = None
+) -> torch.Tensor:
     """Converts a ``dimod.SampleSet`` to a ``torch.Tensor``.
 
     Args:
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "networkx",
     "dimod",
     "dwave-system",
+    "dwave-hybrid",
 ]
 
 [project.readme]
diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -1,2 +1,3 @@
 coverage
 codecov
+parameterized
diff --git a/tests/test_dvae_winci2020.py b/tests/test_dvae_winci2020.py

Original file line number	Diff line number	Diff line change
`@@ -13,4 +13,5 @@`
`13`	`13`	`# limitations under the License.`
`14`	`14`	`#`
`15`	`15`
	`16`	`+from dwave.plugins.torch.models.discrete_variational_autoencoder import *`
`16`	`17`	`from dwave.plugins.torch.models.boltzmann_machine import *`
Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ dependencies = [`
`31`	`31`	`"networkx",`
`32`	`32`	`"dimod",`
`33`	`33`	`"dwave-system",`
	`34`	`+ "dwave-hybrid",`
`34`	`35`	`]`
`35`	`36`
`36`	`37`	`[project.readme]`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`coverage`
`2`	`2`	`codecov`
	`3`	`+parameterized`