@@ -34,6 +34,11 @@ def deepspeed_config():
     }


+@pytest.fixture
+def deepspeed_zero_config(deepspeed_config):
+    return {**deepspeed_config, 'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2}}
+
+
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
 def test_deepspeed_plugin_string(tmpdir):
     """
@@ -165,9 +170,6 @@ def test_invalid_deepspeed_defaults_no_precision(tmpdir):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
 def test_warn_deepspeed_override_backward(tmpdir):
     """
     Test to ensure that if the backward hook in the LightningModule is overridden, we throw a warning.
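
For context, the decorator removed here gated the test behind an environment variable, so it was skipped in ordinary pytest runs. A standalone sketch of that guard pattern, with a placeholder test body:

```python
import os

import pytest

# The removed guard: skip unless PL_RUNNING_SPECIAL_TESTS is set to '1',
# i.e. the test only ran under the dedicated special-tests launcher.
@pytest.mark.skipif(
    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_runs_only_in_special_suite():
    ...  # placeholder body
```

With the guard gone, these single-GPU tests presumably run as part of the regular GPU suite.
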
@@ -191,9 +193,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
 def test_deepspeed_run_configure_optimizers(tmpdir):
     """
     Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
@@ -223,10 +222,7 @@ def on_train_start(self) -> None:

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
-def test_deepspeed_config(tmpdir, deepspeed_config):
+def test_deepspeed_config(tmpdir, deepspeed_zero_config):
     """
     Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
     and saves the model weights to load correctly.
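
test_deepspeed_config now receives the ZeRO-enabled fixture instead of the base config. A hedged sketch of the wiring this implies, mirroring the plugin usage added later in this diff; the inline dict stands in for the fixture:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Assumed minimal stand-in for the deepspeed_zero_config fixture:
deepspeed_zero_config = {
    'zero_allow_untested_optimizer': True,
    'zero_optimization': {'stage': 2},
}

# Constructing with gpus=1 requires a CUDA device; DeepSpeed must be installed.
trainer = Trainer(
    plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
    precision=16,
    gpus=1,
)
```
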
@@ -255,6 +251,58 @@ def on_train_start(self) -> None:
     _assert_save_model_is_equal(model, tmpdir, trainer)


+@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
+def test_deepspeed_custom_precision_params(tmpdir):
+    """
+    Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.
+    """
+
+    class TestModel(BoringModel):
+
+        def on_train_start(self) -> None:
+            assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
+            assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
+            assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
+            assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
+            assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
+            raise SystemExit()
+
+    model = TestModel()
+    trainer = Trainer(
+        plugins=[
+            DeepSpeedPlugin(
+                loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
+            )
+        ],
+        precision=16,
+        gpus=1
+    )
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
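
The test above passes the FP16 knobs as DeepSpeedPlugin kwargs, and its on_train_start assertions imply they land under the generated config's fp16 section. A sketch of that section's expected shape; the enabled key is an assumption and is not asserted by the test:

```python
# Expected shape of the generated config's fp16 section, per the assertions
# above; 'enabled' is assumed to accompany precision=16 and is not asserted.
expected_fp16 = {
    'enabled': True,  # assumption
    'loss_scale': 10,
    'initial_scale_power': 10,
    'loss_scale_window': 10,
    'hysteresis': 10,
    'min_loss_scale': 10,
}
```
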
+@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
+def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
+    """
+    Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.
+    """
+
+    deepspeed_zero_config['zero_optimization']['cpu_offload'] = False
+
+    class TestModel(BoringModel):
+
+        def on_train_start(self) -> None:
+            assert self.trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False
+            raise SystemExit()
+
+    model = TestModel()
+    trainer = Trainer(plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], precision=16, gpus=1)
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
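
One design note on test_deepspeed_assert_config_zero_offload_disabled: it mutates the dict returned by the deepspeed_zero_config fixture in place. That is safe because pytest fixtures are function-scoped by default, so every test gets a fresh dict. A small sketch of why the scope matters:

```python
import pytest


@pytest.fixture  # function-scoped by default: a fresh dict per test
def cfg():
    return {'zero_optimization': {'stage': 2}}


def test_mutates_its_own_copy(cfg):
    cfg['zero_optimization']['cpu_offload'] = False  # local to this test
    assert cfg['zero_optimization']['cpu_offload'] is False


def test_sees_a_fresh_instance(cfg):
    assert 'cpu_offload' not in cfg['zero_optimization']
```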