feat(gpt&web0): add more logs&flags

fumiama · fumiama · commit b3d2953dd458 · 2025-12-03T15:07:48.000+08:00
diff --git a/ChatTTS/model/gpt.py b/ChatTTS/model/gpt.py
@@ -297,22 +297,25 @@ def _prepare_generation_outputs(
         hiddens: List[torch.Tensor],
         infer_text: bool,
     ) -> GenerationOutputs:
-        inputs_ids = [
-            inputs_ids[idx].narrow(0, start_idx, i) for idx, i in enumerate(end_idx)
+        end_idx_int = end_idx.int()
+
+        inputs_ids_lst = [
+            inputs_ids[idx].narrow(0, start_idx, int(i)) for idx, i in enumerate(end_idx_int)
         ]
         if infer_text:
-            inputs_ids = [i.narrow(1, 0, 1).squeeze_(1) for i in inputs_ids]
+            inputs_ids_lst = [i.narrow(1, 0, 1).squeeze_(1) for i in inputs_ids_lst]
 
+        hiddens_lst = []
         if len(hiddens) > 0:
-            hiddens = torch.stack(hiddens, 1)
-            hiddens = [
-                hiddens[idx].narrow(0, 0, i) for idx, i in enumerate(end_idx.int())
+            hiddens_lst = torch.stack(hiddens, 1)
+            hiddens_lst = [
+                hiddens_lst[idx].narrow(0, 0, int(i)) for idx, i in enumerate(end_idx_int)
             ]
 
         return self.GenerationOutputs(
-            ids=inputs_ids,
+            ids=inputs_ids_lst,
             attentions=attentions,
-            hiddens=hiddens,
+            hiddens=hiddens_lst,
         )
 
     @torch.no_grad()
@@ -338,6 +341,8 @@ def generate(
         manual_seed: Optional[int] = None,
         context=Context(),
     ):
+        
+        self.logger.debug("start generate")
 
         attentions: List[Optional[Tuple[torch.FloatTensor, ...]]] = []
         hiddens = []
@@ -348,6 +353,8 @@ def generate(
         )
         finish = torch.zeros(inputs_ids.shape[0], device=inputs_ids.device).bool()
 
+        self.logger.debug(f"set start_idx: {start_idx}, end_idx and finish with all zeros, len {inputs_ids.shape[0]}")
+
         old_temperature = temperature
 
         temperature = (
@@ -357,6 +364,8 @@ def generate(
             .view(-1, 1)
         )
 
+        self.logger.debug(f"expand temperature from shape {old_temperature.shape} to {temperature.shape}")
+
         attention_mask_cache = torch.ones(
             (
                 inputs_ids.shape[0],
@@ -365,10 +374,12 @@ def generate(
             dtype=torch.bool,
             device=inputs_ids.device,
         )
+        self.logger.debug(f"init attention_mask_cache with shape {attention_mask_cache.shape}")
         if attention_mask is not None:
             attention_mask_cache.narrow(1, 0, attention_mask.shape[1]).copy_(
                 attention_mask
             )
+            self.logger.debug(f"copy attention_mask with shape {attention_mask.shape}")
 
         progress = inputs_ids.size(1)
         # pre-allocate inputs_ids
@@ -380,6 +391,7 @@ def generate(
             device=inputs_ids.device,
         )
         inputs_ids_buf.narrow(1, 0, progress).copy_(inputs_ids)
+        self.logger.debug(f"expand inputs_ids buf from shape {inputs_ids.shape} to {inputs_ids_buf.shape}")
         del inputs_ids
         inputs_ids = inputs_ids_buf.narrow(1, 0, progress)
 
@@ -396,28 +408,36 @@ def generate(
 
         for i in range(max_new_token):
 
+            self.logger.debug("start _prepare_generation_inputs")
             model_input = self._prepare_generation_inputs(
                 inputs_ids,
                 past_key_values,
                 attention_mask_cache.narrow(1, 0, inputs_ids.shape[1]),
             )
+            self.logger.debug("finis _prepare_generation_inputs")
 
             if i > 0:
                 del emb
                 inputs_ids_emb = model_input.input_ids.to(self.device_gpt)
                 if infer_text:
+                    self.logger.debug("start emb_text")
                     emb: torch.Tensor = self.emb_text(inputs_ids_emb[:, :, 0])
+                    self.logger.debug("finis emb_text")
                 else:
+                    self.logger.debug("start code_emb")
                     code_emb = [
-                        self.emb_code[i](inputs_ids_emb[:, :, i])
+                        self.emb_code[i](inputs_ids_emb[:, :, i]).to(self.device)
                         for i in range(self.num_vq)
                     ]
                     emb = torch.stack(code_emb, 3).sum(3)
+                    self.logger.debug("finis code_emb")
                 del inputs_ids_emb, model_input.input_ids
             model_input.inputs_embeds = emb
 
+            self.logger.debug(f"move model_input to device_gpt: {str(self.device_gpt)}")
             model_input.to(self.device_gpt, self.gpt.dtype)
 
+            self.logger.debug("start gpt...")
             outputs: BaseModelOutputWithPast = self.gpt(
                 attention_mask=model_input.attention_mask,
                 position_ids=model_input.position_ids,
@@ -427,6 +447,7 @@ def generate(
                 output_attentions=return_attn,
                 cache_position=model_input.cache_position,
             )
+            self.logger.debug("finis gpt")
             del_all(model_input)
             attentions.append(outputs.attentions)
             hidden_states = outputs.last_hidden_state.to(
@@ -439,8 +460,11 @@ def generate(
 
             with P.cached():
                 if infer_text:
+                    self.logger.debug("start head_text")
                     logits: torch.Tensor = self.head_text(hidden_states)
+                    self.logger.debug("finis head_text")
                 else:
+                    self.logger.debug("start head_code")
                     # logits = torch.stack([self.head_code[i](hidden_states) for i in range(self.num_vq)], 3)
                     logits = torch.empty(
                         hidden_states.size(0),
@@ -454,9 +478,11 @@ def generate(
                         x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
                         logits[..., num_vq_iter] = x
                         del x
+                    self.logger.debug("finis head_code")
 
             del hidden_states
 
+            self.logger.debug("start logits")
             # logits = logits[:, -1].float()
             logits = logits.narrow(1, -1, 1).squeeze_(1).float()
 
@@ -500,6 +526,9 @@ def generate(
 
             del logits
 
+            self.logger.debug("finis logits")
+
+            self.logger.debug("start seed")
             if manual_seed is None:
                 idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
             else:
@@ -511,6 +540,10 @@ def generate(
 
             del scores
 
+            self.logger.debug("finis seed")
+
+            self.logger.debug("start finish")
+
             if not infer_text:
                 # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
                 idx_next = idx_next.view(-1, self.num_vq)
@@ -526,6 +559,8 @@ def generate(
                     idx_next.unsqueeze_(-1).expand(-1, -1, self.num_vq),
                 )
 
+            self.logger.debug("finis finish")
+
             if i == 0 and finish.any():
                 self.logger.warning(
                     "unexpected end at index %s",
@@ -570,6 +605,8 @@ def generate(
                         yield result
                     del inputs_ids
                 return
+            
+            self.logger.debug("start output")
 
             del idx_next
             progress += 1
@@ -591,6 +628,8 @@ def generate(
                     )
             del not_finished
 
+            self.logger.debug("finis output")
+
             if finish.all() or context.get():
                 break
 
diff --git a/ChatTTS/utils/gpu.py b/ChatTTS/utils/gpu.py
@@ -46,9 +46,17 @@ def select_device(min_memory=2047, experimental=False):
             logger.get_logger().info("found Apple GPU, but use CPU.")
             device = torch.device("cpu")
     elif importlib.util.find_spec("torch_directml") is not None:
-        import torch_directml
-
-        device = torch_directml.device(torch_directml.default_device())
+        """
+        Currently DML is under development and may produce wrong results,
+        so only enable it for experimental use.
+        """
+        if experimental:
+            logger.get_logger().warning("experimental: using DML.")
+            import torch_directml
+            device = torch_directml.device(torch_directml.default_device())
+        else:
+            logger.get_logger().info("found DML, but use CPU.")
+            device = torch.device("cpu")
     else:
         logger.get_logger().warning("no GPU or NPU found, use CPU instead")
         device = torch.device("cpu")
diff --git a/examples/web/funcs.py b/examples/web/funcs.py
@@ -25,6 +25,9 @@
 has_interrupted = False
 is_in_generate = False
 
+enable_cache=True
+experimental=False
+
 seed_min = 1
 seed_max = 4294967295
 
@@ -61,14 +64,21 @@ def on_audio_seed_change(audio_seed_input):
         rand_spk = chat.sample_random_speaker()
     return rand_spk
 
+def set_params(en_cache, exp):
+    global enable_cache, experimental
+
+    enable_cache = en_cache
+    experimental = exp
+
+def load_chat(cust_path: Optional[str], coef: Optional[str]) -> bool:
+    global enable_cache, experimental
 
-def load_chat(cust_path: Optional[str], coef: Optional[str], enable_cache=True) -> bool:
     if cust_path == None:
-        ret = chat.load(coef=coef, enable_cache=enable_cache)
+        ret = chat.load(coef=coef, enable_cache=enable_cache, experimental=experimental)
     else:
         logger.info("local model path: %s", cust_path)
         ret = chat.load(
-            "custom", custom_path=cust_path, coef=coef, enable_cache=enable_cache
+            "custom", custom_path=cust_path, coef=coef, enable_cache=enable_cache, experimental=experimental
         )
         global custom_path
         custom_path = cust_path
diff --git a/examples/web/webui.py b/examples/web/webui.py
@@ -264,11 +264,14 @@ def make_audio(autoplay, stream):
     parser.add_argument(
         "--disable_cache", action="store_true", help="enable model cache"
     )
+    parser.add_argument(
+        "--experimental", action="store_true", help="enable experimental features (e.g. DML device)"
+    )
     args = parser.parse_args()
-
+    set_params(not args.disable_cache, args.experimental)
     logger.info("loading ChatTTS model...")
 
-    if load_chat(args.custom_path, args.coef, not args.disable_cache):
+    if load_chat(args.custom_path, args.coef):
         logger.info("Models loaded successfully.")
     else:
         logger.error("Models load failed.")