 from nnsight import LanguageModel
 from dictionary_learning.cache import ActivationCache
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
+import numpy as np

 @pytest.fixture
 def temp_dir():
@@ -267,36 +267,148 @@ def test_activation_cache_with_normalizer(temp_dir):
         cache.std, computed_std, atol=1e-5, rtol=1e-5
     ), "Cached std doesn't match computed std"

-    # Test normalizer functionality
-    normalizer = cache.normalizer
+    print(f"✓ Successfully tested ActivationCache with {len(cache)} activations")
+    print(f"✓ Mean shape: {cache.mean.shape}, Std shape: {cache.std.shape}")
+

-    # Test normalization of a sample activation
-    sample_activation = cached_activations[0]
-    normalized = normalizer(sample_activation)
+def test_sequence_ranges_no_bos_token(temp_dir):
+    """Test that sequence ranges are stored when model has no BOS token."""
+    # Set flag to handle meta tensors properly
+    if hasattr(th.fx, 'experimental'):
+        th.fx.experimental._config.meta_nonzero_assume_all_nonzero = True

-    # Verify normalization: (x - mean) / std (with small epsilon for numerical stability)
-    expected_normalized = (sample_activation - cache.mean) / (cache.std + 1e-8)
-    assert th.allclose(
-        normalized, expected_normalized, atol=1e-6
-    ), "Normalizer doesn't work correctly"
+    # Skip test if CUDA not available
+    if not th.cuda.is_available():
+        pytest.skip("CUDA not available, skipping test")
+
+    # Test strings of different lengths
+    test_strings = [
+        "Hello world",
+        "This is a longer sentence with more tokens",
+        "Short",
+        "Medium length text here",
+    ]

-    # Test batch normalization
-    batch_normalized = normalizer(cached_activations[:5])
-    expected_batch_normalized = (cached_activations[:5] - cache.mean) / (
-        cache.std + 1e-8
+    # Load GPT-2 model and modify tokenizer to simulate no BOS token
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelForCausalLM.from_pretrained(
+        "gpt2", device_map="auto", torch_dtype=th.float32
     )
-    assert th.allclose(
-        batch_normalized, expected_batch_normalized, atol=1e-6
-    ), "Batch normalization doesn't work correctly"
+    model = LanguageModel(model, torch_dtype=th.float32, tokenizer=tokenizer)
+    model.tokenizer.pad_token = model.tokenizer.eos_token
+
+    # Simulate model without BOS token
+    original_bos_token_id = model.tokenizer.bos_token_id
+    model.tokenizer.bos_token_id = None
+
+    tokens = model.tokenizer(test_strings, add_special_tokens=True, return_tensors="pt", padding=True, truncation=True)
+    lengths = tokens["attention_mask"].sum(dim=1).tolist()
+    ranges = np.cumsum([0] + lengths)
+    try:
+        # Get a transformer block
+        target_layer = model.transformer.h[6]
+        submodule_name = "transformer_h_6"
+
+        # Parameters for activation collection
+        batch_size = 2
+        context_len = 32
+        d_model = 768
+
+        # Collect activations with sequence start tracking
+        ActivationCache.collect(
+            data=test_strings,
+            submodules=(target_layer,),
+            submodule_names=(submodule_name,),
+            model=model,
+            store_dir=temp_dir,
+            batch_size=batch_size,
+            context_len=context_len,
+            shard_size=1000,
+            d_model=d_model,
+            io="out",
+            store_tokens=True,
+            shuffle_shards=False,  # Required for sequence ranges
+        )

-    # Test that normalization preserves shape
-    assert (
-        normalized.shape == sample_activation.shape
-    ), "Normalization changed tensor shape"
-    assert (
-        batch_normalized.shape == cached_activations[:5].shape
-    ), "Batch normalization changed tensor shape"
+        # Load the cached activations
+        cache = ActivationCache(temp_dir, submodule_name + "_out")
+
+        # Verify sequence ranges were stored
+        sequence_ranges = cache.sequence_ranges
+        assert sequence_ranges is not None, "sequence ranges should be stored for model without BOS token"
+
+        # Should have one boundary per input string plus one marking the end of the last sequence
+        assert len(sequence_ranges) == len(test_strings) + 1, f"Expected {len(test_strings) + 1} sequence ranges, got {len(sequence_ranges)}"
+
+        # First sequence should start at position 0
+        assert sequence_ranges[0].item() == 0, "First sequence should start at position 0"
+
+        # sequence ranges should be the same as the ranges computed from the tokens
+        assert np.allclose(sequence_ranges, ranges), "sequence ranges should be the same as the ranges computed from the tokens"
+
+        # sequence ranges should be in ascending order
+        for i in range(1, len(sequence_ranges)):
+            assert sequence_ranges[i] > sequence_ranges[i - 1], f"sequence ranges should be ascending: {sequence_ranges}"
+
+        # Verify sequence ranges align with token boundaries
+        tokens = cache.tokens
+        total_tokens = len(tokens)
+
+        # All sequence ranges should be valid indices
+        for start_idx in sequence_ranges:
+            assert 0 <= start_idx <= total_tokens, f"Invalid sequence start index: {start_idx}"
+
+    finally:
+        # Restore original BOS token
+        model.tokenizer.bos_token_id = original_bos_token_id
+
+
+def test_sequence_ranges_with_bos_token(temp_dir):
+    """Test that sequence ranges are NOT stored when model has BOS token."""
+    # Set flag to handle meta tensors properly
+    if hasattr(th.fx, 'experimental'):
+        th.fx.experimental._config.meta_nonzero_assume_all_nonzero = True

-    print(f"✓ Successfully tested ActivationCache with {len(cache)} activations")
-    print(f"✓ Mean shape: {cache.mean.shape}, Std shape: {cache.std.shape}")
-    print(f"✓ Normalizer tests passed")
+    # Skip test if CUDA not available
+    if not th.cuda.is_available():
+        pytest.skip("CUDA not available, skipping test")
+
+    test_strings = ["Hello world", "Another test sentence"]
+
+    # Load GPT-2 model with BOS token
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelForCausalLM.from_pretrained(
+        "gpt2", device_map="auto", torch_dtype=th.float32
+    )
+    model = LanguageModel(model, torch_dtype=th.float32, tokenizer=tokenizer)
+    model.tokenizer.pad_token = model.tokenizer.eos_token
+
+    # Ensure model has BOS token (set it explicitly)
+    model.tokenizer.bos_token_id = model.tokenizer.eos_token_id
+
+    # Get a transformer block
+    target_layer = model.transformer.h[6]
+    submodule_name = "transformer_h_6"
+
+    # Collect activations
+    ActivationCache.collect(
+        data=test_strings,
+        submodules=(target_layer,),
+        submodule_names=(submodule_name,),
+        model=model,
+        store_dir=temp_dir,
+        batch_size=2,
+        context_len=32,
+        shard_size=1000,
+        d_model=768,
+        io="out",
+        store_tokens=True,
+        shuffle_shards=False,
+    )
+
+    # Load the cached activations
+    cache = ActivationCache(temp_dir, submodule_name + "_out")
+
+    # Verify sequence ranges were NOT stored
+    sequence_ranges = cache.sequence_ranges
+    assert sequence_ranges is None, "sequence ranges should not be stored for model with BOS token"
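For reference, a minimal sketch (not part of the diff above) of how the stored boundaries these tests assert on could be consumed downstream. It assumes only what the tests exercise: cache.tokens is a flat stream of token ids, and cache.sequence_ranges holds len(data) + 1 ascending offsets into it, starting at 0. The store_dir value is a placeholder for wherever ActivationCache.collect wrote its shards.

# Sketch only: slice the flat token stream back into the original input sequences.
from dictionary_learning.cache import ActivationCache

store_dir = "path/to/activation/store"  # placeholder: directory passed to ActivationCache.collect
cache = ActivationCache(store_dir, "transformer_h_6_out")

ranges = cache.sequence_ranges  # len(data) + 1 ascending offsets, first entry is 0
tokens = cache.tokens           # flat stream of token ids across all sequences

# Each consecutive pair of offsets delimits one original input sequence.
per_sequence_tokens = [
    tokens[int(ranges[i]) : int(ranges[i + 1])] for i in range(len(ranges) - 1)
]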