
Commit d059bad

feat(optimizers): add AdaBelief
1 parent 1a28cb7 commit d059bad
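
For context (not part of the commit, notation mine): AdaBelief differs from Adam only in its second-moment estimate, which accumulates the squared deviation of the gradient from its exponential moving average (the "belief") instead of the squared gradient itself. A LaTeX sketch of the update rule the added class implements, where η is the learning rate (the code additionally floors the denominator at 1e-16 and sanitizes non-finite updates):

\begin{aligned}
m_t &= \beta_1\, m_{t-1} + (1-\beta_1)\, g_t \\
s_t &= \beta_2\, s_{t-1} + (1-\beta_2)\,(g_t - m_t)^2 \\
\hat m_t &= \frac{m_t}{1-\beta_1^{\,t}}, \qquad \hat s_t = \frac{s_t}{1-\beta_2^{\,t}} \\
\theta_t &= \theta_{t-1} - \eta\, \frac{\hat m_t}{\sqrt{\hat s_t + \epsilon}}
\end{aligned}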

File tree: 1 file changed, +112 −0

neuralnetlib/optimizers.py

Lines changed: 112 additions & 0 deletions
@@ -228,3 +228,115 @@ def __str__(self):
        return (f"{self.__class__.__name__}(learning_rate={self.learning_rate}, "
                f"beta_1={self.beta_1}, beta_2={self.beta_2}, epsilon={self.epsilon}, "
                f"clip_norm={self.clip_norm}, clip_value={self.clip_value})")


class AdaBelief(Optimizer):
    def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999,
                 epsilon: float = 1e-16, clip_norm: float = None, clip_value: float = None) -> None:
        super().__init__(learning_rate)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.clip_norm = clip_norm
        self.clip_value = clip_value
        self.t = 0

        # Per-layer first (m) and second (s) moment buffers for weights and biases.
        self.m_w, self.s_w = {}, {}
        self.m_b, self.s_b = {}, {}

        self._min_denom = 1e-16
        self._max_exp = np.log(np.finfo(np.float64).max)

    def _clip_gradients(self, grad: np.ndarray) -> np.ndarray:
        if grad is None:
            return None

        if self.clip_norm is not None:
            grad_norm = np.linalg.norm(grad)
            if grad_norm > self.clip_norm:
                grad = grad * (self.clip_norm / (grad_norm + self._min_denom))

        if self.clip_value is not None:
            grad = np.clip(grad, -self.clip_value, self.clip_value)

        return grad

    def _compute_moments(self, param: np.ndarray, grad: np.ndarray, m: np.ndarray, s: np.ndarray) -> tuple:
        grad = self._clip_gradients(grad)

        m = self.beta_1 * m + (1 - self.beta_1) * grad

        # AdaBelief: the second moment tracks the deviation of the gradient
        # from its EMA rather than the raw squared gradient (as in Adam).
        grad_residual = grad - m

        s = self.beta_2 * s + (1 - self.beta_2) * np.square(grad_residual)

        beta1_t = self.beta_1 ** self.t
        beta2_t = self.beta_2 ** self.t

        m_hat = m / (1 - beta1_t)
        s_hat = s / (1 - beta2_t)

        denom = np.sqrt(s_hat + self.epsilon)
        update = self.learning_rate * m_hat / np.maximum(denom, self._min_denom)

        update = np.nan_to_num(update, nan=0.0, posinf=0.0, neginf=0.0)
        param -= update

        return param, m, s

    def update(self, layer_index: int, weights: np.ndarray, weights_grad: np.ndarray, bias: np.ndarray,
               bias_grad: np.ndarray) -> None:
        if layer_index not in self.m_w:
            self.m_w[layer_index] = np.zeros_like(weights)
            self.s_w[layer_index] = np.zeros_like(weights)
            self.m_b[layer_index] = np.zeros_like(bias)
            self.s_b[layer_index] = np.zeros_like(bias)

        self.t += 1

        weights, self.m_w[layer_index], self.s_w[layer_index] = self._compute_moments(
            weights, weights_grad, self.m_w[layer_index], self.s_w[layer_index]
        )

        bias, self.m_b[layer_index], self.s_b[layer_index] = self._compute_moments(
            bias, bias_grad, self.m_b[layer_index], self.s_b[layer_index]
        )

    def get_config(self) -> dict:
        return {
            "name": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "beta_1": self.beta_1,
            "beta_2": self.beta_2,
            "epsilon": self.epsilon,
            "clip_norm": self.clip_norm,
            "clip_value": self.clip_value,
            "t": self.t,
            "m_w": dict_with_ndarray_to_dict_with_list(self.m_w),
            "s_w": dict_with_ndarray_to_dict_with_list(self.s_w),
            "m_b": dict_with_ndarray_to_dict_with_list(self.m_b),
            "s_b": dict_with_ndarray_to_dict_with_list(self.s_b)
        }

    @staticmethod
    def from_config(config: dict):
        adabelief = AdaBelief(
            learning_rate=config['learning_rate'],
            beta_1=config['beta_1'],
            beta_2=config['beta_2'],
            epsilon=config['epsilon'],
            clip_norm=config.get('clip_norm'),
            clip_value=config.get('clip_value')
        )
        adabelief.t = config['t']
        adabelief.m_w = dict_with_list_to_dict_with_ndarray(config['m_w'])
        adabelief.s_w = dict_with_list_to_dict_with_ndarray(config['s_w'])
        adabelief.m_b = dict_with_list_to_dict_with_ndarray(config['m_b'])
        adabelief.s_b = dict_with_list_to_dict_with_ndarray(config['s_b'])
        return adabelief

    def __str__(self):
        """Return a string representation of the optimizer."""
        return (f"{self.__class__.__name__}(learning_rate={self.learning_rate}, "
                f"beta_1={self.beta_1}, beta_2={self.beta_2}, epsilon={self.epsilon}, "
                f"clip_norm={self.clip_norm}, clip_value={self.clip_value})")
