@@ -4,31 +4,23 @@
 import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     SequenceClassifierOutputWithPast,
 )
-
-try:
-    from transformers.modeling_attn_mask_utils import (
-        _prepare_4d_causal_attention_mask,
-        _prepare_4d_causal_attention_mask_for_sdpa,
-    )
-    from transformers.models.qwen2.modeling_qwen2 import (
-        Qwen2Attention,
-        Qwen2ForCausalLM,
-        Qwen2ForSequenceClassification,
-        Qwen2Model,
-        apply_rotary_pos_emb,
-        repeat_kv,
-    )
-except ImportError:
-    Qwen2Model = "Qwen2Model"
-    Qwen2ForCausalLM = "Qwen2ForCausalLM"
-    Qwen2Attention = "Qwen2Attention"
-    Qwen2ForSequenceClassification = "Qwen2ForSequenceClassification"
-
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Attention,
+    Qwen2ForCausalLM,
+    Qwen2ForSequenceClassification,
+    Qwen2Model,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
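
The hunk above drops the try/except import guard and imports the 4D causal-mask helpers and the Qwen2 classes unconditionally. As a rough illustration of what those helpers do, here is a minimal sketch (not part of the patch) of how _prepare_4d_causal_attention_mask and its SDPA variant expand a 2D padding mask into the 4D mask consumed by the attention forward pass; it assumes a transformers release that still exports these private helpers, and the tensor shapes are made up for the example.

    # Minimal sketch, not part of the patch: expanding a 2D padding mask
    # into the 4D causal mask expected by the eager / SDPA attention paths.
    import torch
    from transformers.modeling_attn_mask_utils import (
        _prepare_4d_causal_attention_mask,
        _prepare_4d_causal_attention_mask_for_sdpa,
    )

    batch_size, seq_len, hidden = 2, 8, 16
    inputs_embeds = torch.randn(batch_size, seq_len, hidden)
    # 2D padding mask: 1 = keep, 0 = padding (last two tokens of row 1 are padding).
    attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
    attention_mask[1, -2:] = 0

    # Eager path: returns a (batch, 1, seq_len, seq_len) float mask with the
    # dtype minimum at masked positions.
    mask_4d = _prepare_4d_causal_attention_mask(
        attention_mask, (batch_size, seq_len), inputs_embeds, past_key_values_length=0
    )

    # SDPA path: may return None when torch.scaled_dot_product_attention can
    # handle the causal masking on its own.
    mask_sdpa = _prepare_4d_causal_attention_mask_for_sdpa(
        attention_mask, (batch_size, seq_len), inputs_embeds, past_key_values_length=0
    )
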
@@ -434,7 +426,6 @@ def qwen2_for_sequence_classification_forward( |
     logits = self.score(hidden_states)
 
     if self.config.pad_token_id is None and batch_size != 1:
-        print(self.config.pad_token_id)
         raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
     if self.config.pad_token_id is None:
         sequence_lengths = -1
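
The second hunk only removes a stray debug print from the sequence-classification forward. For context, the surrounding branch mirrors the upstream Qwen2ForSequenceClassification pooling logic: without a pad token only batch size 1 is accepted and the final position is pooled; with a pad token, the last non-padding position of each row is used. The sketch below is a standalone approximation of that index computation, not the patched function itself.

    import torch

    def last_token_indices(input_ids, pad_token_id):
        # Standalone approximation of the pooling-index logic used by the
        # sequence-classification head; not the patched forward itself.
        batch_size, seq_len = input_ids.shape
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if pad_token_id is None:
            return -1  # pool the final position
        # argmax over the boolean mask gives the first pad position; subtract 1
        # for the last real token, and wrap with % seq_len for rows with no padding.
        first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
        return (first_pad - 1) % seq_len

    # Row 0 has two trailing pads (pad id 0), row 1 is full length.
    ids = torch.tensor([[5, 6, 7, 0, 0], [1, 2, 3, 4, 5]])
    print(last_token_indices(ids, pad_token_id=0))  # tensor([2, 4])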