@@ -228,3 +228,115 @@ def __str__(self):
         return (f"{self.__class__.__name__}(learning_rate={self.learning_rate}, "
                 f"beta_1={self.beta_1}, beta_2={self.beta_2}, epsilon={self.epsilon}, "
                 f"clip_norm={self.clip_norm}, clip_value={self.clip_value})")
+
+
+class AdaBelief(Optimizer):
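+    """AdaBelief optimizer (Zhuang et al., 2020).
+
+    Adam-like optimizer that scales each step by the "belief" in the current
+    gradient: the second moment tracks the variance of the prediction error
+    (grad - m) rather than the raw squared gradient used by Adam.
+    """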
+    def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999,
+                 epsilon: float = 1e-16, clip_norm: float = None, clip_value: float = None) -> None:
+        super().__init__(learning_rate)
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+        self.clip_norm = clip_norm
+        self.clip_value = clip_value
+        self.t = 0
+
+        self.m_w, self.s_w = {}, {}
+        self.m_b, self.s_b = {}, {}
+
+        self._min_denom = 1e-16
+        self._max_exp = np.log(np.finfo(np.float64).max)
+
+    def _clip_gradients(self, grad: np.ndarray) -> np.ndarray:
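+        """Optionally rescale `grad` to at most `clip_norm` (L2), then clip element-wise to `clip_value`."""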
+        if grad is None:
+            return None
+
+        if self.clip_norm is not None:
+            grad_norm = np.linalg.norm(grad)
+            if grad_norm > self.clip_norm:
+                grad = grad * (self.clip_norm / (grad_norm + self._min_denom))
+
+        if self.clip_value is not None:
+            grad = np.clip(grad, -self.clip_value, self.clip_value)
+
+        return grad
+
+    def _compute_moments(self, param: np.ndarray, grad: np.ndarray, m: np.ndarray, s: np.ndarray) -> tuple:
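+        """Apply one AdaBelief step to `param`; returns the updated param and moment estimates."""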
+        grad = self._clip_gradients(grad)
+
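+        # First moment: exponential moving average of the gradient (as in Adam)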
+        m = self.beta_1 * m + (1 - self.beta_1) * grad
+
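+        # "Belief": deviation of the observed gradient from its EMA prediction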
+        grad_residual = grad - m
+
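+        # Second moment tracks the variance of that deviation, not the raw squared gradient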
+        s = self.beta_2 * s + (1 - self.beta_2) * np.square(grad_residual)
+
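+        # Bias-corrected moment estimates, as in Adam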
+        beta1_t = self.beta_1 ** self.t
+        beta2_t = self.beta_2 ** self.t
+
+        m_hat = m / (1 - beta1_t)
+        s_hat = s / (1 - beta2_t)
+
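+        # Adam-style step with the denominator floored to avoid division blow-ups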
+        denom = np.sqrt(s_hat + self.epsilon)
+        update = self.learning_rate * m_hat / np.maximum(denom, self._min_denom)
+
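+        # Replace any NaN/Inf produced by extreme gradients with a zero update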
+        update = np.nan_to_num(update, nan=0.0, posinf=0.0, neginf=0.0)
+        param -= update
+
+        return param, m, s
+
+    def update(self, layer_index: int, weights: np.ndarray, weights_grad: np.ndarray, bias: np.ndarray,
+               bias_grad: np.ndarray) -> None:
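+        """Update one layer's weights and bias in place, lazily initializing its moment state."""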
+        if layer_index not in self.m_w:
+            self.m_w[layer_index] = np.zeros_like(weights)
+            self.s_w[layer_index] = np.zeros_like(weights)
+            self.m_b[layer_index] = np.zeros_like(bias)
+            self.s_b[layer_index] = np.zeros_like(bias)
+
+        self.t += 1
+
+        weights, self.m_w[layer_index], self.s_w[layer_index] = self._compute_moments(
+            weights, weights_grad, self.m_w[layer_index], self.s_w[layer_index]
+        )
+
+        bias, self.m_b[layer_index], self.s_b[layer_index] = self._compute_moments(
+            bias, bias_grad, self.m_b[layer_index], self.s_b[layer_index]
+        )
+
+    def get_config(self) -> dict:
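+        """Serialize hyperparameters and per-layer moment state to a JSON-friendly dict."""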
+        return {
+            "name": self.__class__.__name__,
+            "learning_rate": self.learning_rate,
+            "beta_1": self.beta_1,
+            "beta_2": self.beta_2,
+            "epsilon": self.epsilon,
+            "clip_norm": self.clip_norm,
+            "clip_value": self.clip_value,
+            "t": self.t,
+            "m_w": dict_with_ndarray_to_dict_with_list(self.m_w),
+            "s_w": dict_with_ndarray_to_dict_with_list(self.s_w),
+            "m_b": dict_with_ndarray_to_dict_with_list(self.m_b),
+            "s_b": dict_with_ndarray_to_dict_with_list(self.s_b)
+        }
+
+    @staticmethod
+    def from_config(config: dict):
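+        """Rebuild an AdaBelief instance (including step count and moments) from a `get_config()` dict."""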
+        adabelief = AdaBelief(
+            learning_rate=config['learning_rate'],
+            beta_1=config['beta_1'],
+            beta_2=config['beta_2'],
+            epsilon=config['epsilon'],
+            clip_norm=config.get('clip_norm'),
+            clip_value=config.get('clip_value')
+        )
+        adabelief.t = config['t']
+        adabelief.m_w = dict_with_list_to_dict_with_ndarray(config['m_w'])
+        adabelief.s_w = dict_with_list_to_dict_with_ndarray(config['s_w'])
+        adabelief.m_b = dict_with_list_to_dict_with_ndarray(config['m_b'])
+        adabelief.s_b = dict_with_list_to_dict_with_ndarray(config['s_b'])
+        return adabelief
+
+    def __str__(self):
339+ """Retourne une représentation string de l'optimiseur."""
+        return (f"{self.__class__.__name__}(learning_rate={self.learning_rate}, "
+                f"beta_1={self.beta_1}, beta_2={self.beta_2}, epsilon={self.epsilon}, "
+                f"clip_norm={self.clip_norm}, clip_value={self.clip_value})")