From 7a2bec3a15983c0b0fac6499ef2025be6febb759 Mon Sep 17 00:00:00 2001 From: Eric B Date: Thu, 17 Jul 2025 15:58:27 +0200 Subject: [PATCH 1/5] Fix vocab size for Bark generation. --- src/transformers/generation/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index e9b28eb102dd..623af284f28e 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -3988,6 +3988,8 @@ def _beam_search( vocab_size = self.config.audio_vocab_size elif self.__class__.__name__ == "ImageGPTForCausalImageModeling": vocab_size = self.get_output_embeddings().out_features + elif self.__class__.__name__ == "BarkSemanticModel": + vocab_size = self.config.output_vocab_size else: vocab_size = self.config.get_text_config().vocab_size decoder_prompt_len = cur_len From 2ca84490cb6e8e8a0e103d8f8b80e4634735ba4d Mon Sep 17 00:00:00 2001 From: Eric B Date: Fri, 18 Jul 2025 13:15:43 +0200 Subject: [PATCH 2/5] Fix Bark processor tests. --- .../models/bark/processing_bark.py | 76 ++++++++++++++----- tests/models/bark/test_processor_bark.py | 9 +++ 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index e2b47dca6acb..9bca1d026a51 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -52,9 +52,9 @@ class BarkProcessor(ProcessorMixin): attributes = ["tokenizer"] preset_shape = { - "semantic_prompt": 1, - "coarse_prompt": 2, - "fine_prompt": 2, + "semantic_prompt": 1, # 1D array of shape (X,) + "coarse_prompt": 2, # 2D array of shape (2,X) + "fine_prompt": 2, # 2D array of shape (8,X) } def __init__(self, tokenizer, speaker_embeddings=None): @@ -114,6 +114,9 @@ def from_pretrained( else: speaker_embeddings = None + if speaker_embeddings is not None: + if "repo_or_path" in speaker_embeddings: + speaker_embeddings["repo_or_path"] = pretrained_processor_name_or_path tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs) return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings) @@ -153,22 +156,21 @@ def save_pretrained( embeddings_dict["repo_or_path"] = save_directory - for prompt_key in self.speaker_embeddings: - if prompt_key != "repo_or_path": - voice_preset = self._load_voice_preset(prompt_key) + for prompt_key in self.available_voice_presets: + voice_preset = self._load_voice_preset(prompt_key) - tmp_dict = {} - for key in self.speaker_embeddings[prompt_key]: - np.save( - os.path.join( - embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}" - ), - voice_preset[key], - allow_pickle=False, - ) - tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy") + tmp_dict = {} + for key in self.speaker_embeddings[prompt_key]: + np.save( + os.path.join( + embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}" + ), + voice_preset[key], + allow_pickle=False, + ) + tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy") - embeddings_dict[prompt_key] = tmp_dict + embeddings_dict[prompt_key] = tmp_dict with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w") as fp: json.dump(embeddings_dict, fp) @@ -222,6 +224,43 @@ def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None): if len(voice_preset[key].shape) != self.preset_shape[key]: raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") + @property + def available_voice_presets(self) -> list: + """ + Returns a list of available voice presets. + + Returns: + `list[str]`: A list of voice preset names. + """ + if self.speaker_embeddings is None: + return [] + + voice_presets = list(self.speaker_embeddings.keys()) + if "repo_or_path" in voice_presets: + voice_presets.remove("repo_or_path") + return voice_presets + + def _verify_speaker_embeddings(self, remove_unavailable: bool = True): + # check which actually downloaded properly / are available + unavailable_keys = [] + if self.speaker_embeddings is not None: + for voice_preset in self.available_voice_presets: + try: + voice_preset_dict = self._load_voice_preset(voice_preset) + self._validate_voice_preset_dict(voice_preset_dict) + except Exception: + unavailable_keys.append(voice_preset) + + if unavailable_keys: + logger.warning( + f"The following {len(unavailable_keys)} speaker embeddings are not available: {unavailable_keys} " + "If you would like to use them, please check the paths or try downloading them again." + ) + + if remove_unavailable: + for voice_preset in unavailable_keys: + del self.speaker_embeddings[voice_preset] + def __call__( self, text=None, @@ -247,7 +286,8 @@ def __call__( voice_preset (`str`, `dict[np.ndarray]`): The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or - it can be a valid file name of a local `.npz` single voice preset. + it can be a valid file name of a local `.npz` single voice preset containing the keys + `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/tests/models/bark/test_processor_bark.py b/tests/models/bark/test_processor_bark.py index 15b0871d8144..457ec4ff6390 100644 --- a/tests/models/bark/test_processor_bark.py +++ b/tests/models/bark/test_processor_bark.py @@ -55,6 +55,15 @@ def test_save_load_pretrained_additional_features(self): pretrained_processor_name_or_path=self.checkpoint, speaker_embeddings_dict_path=self.speaker_embeddings_dict_path, ) + """ + TODO (ebezzam) not all speaker embedding are properly downloaded. + My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs) + https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89 + https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L188 + + So for testing purposes, we will remove the unavailable speaker embeddings before saving. + """ + processor._verify_speaker_embeddings(remove_unavailable=True) processor.save_pretrained( self.tmpdirname, speaker_embeddings_dict_path=self.speaker_embeddings_dict_path, From d973bb410d171f158e933b11e73eafb3f3c73133 Mon Sep 17 00:00:00 2001 From: Eric B Date: Fri, 18 Jul 2025 13:40:16 +0200 Subject: [PATCH 3/5] Fix style. --- tests/models/bark/test_processor_bark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/bark/test_processor_bark.py b/tests/models/bark/test_processor_bark.py index 457ec4ff6390..1f149cc90dcf 100644 --- a/tests/models/bark/test_processor_bark.py +++ b/tests/models/bark/test_processor_bark.py @@ -60,7 +60,6 @@ def test_save_load_pretrained_additional_features(self): My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs) https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89 https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L188 - So for testing purposes, we will remove the unavailable speaker embeddings before saving. """ processor._verify_speaker_embeddings(remove_unavailable=True) From 96c67d9ebc33a6f17ee1e2006f2e11acf176066d Mon Sep 17 00:00:00 2001 From: Eric B Date: Fri, 18 Jul 2025 18:33:06 +0200 Subject: [PATCH 4/5] Address comments. --- src/transformers/models/bark/processing_bark.py | 6 ++++-- tests/models/bark/test_processor_bark.py | 13 ++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 9bca1d026a51..453c9acd0abd 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -247,9 +247,11 @@ def _verify_speaker_embeddings(self, remove_unavailable: bool = True): for voice_preset in self.available_voice_presets: try: voice_preset_dict = self._load_voice_preset(voice_preset) - self._validate_voice_preset_dict(voice_preset_dict) - except Exception: + except ValueError: + # error from `_load_voice_preset` of path not existing unavailable_keys.append(voice_preset) + continue + self._validate_voice_preset_dict(voice_preset_dict) if unavailable_keys: logger.warning( diff --git a/tests/models/bark/test_processor_bark.py b/tests/models/bark/test_processor_bark.py index 1f149cc90dcf..73451135b708 100644 --- a/tests/models/bark/test_processor_bark.py +++ b/tests/models/bark/test_processor_bark.py @@ -55,13 +55,12 @@ def test_save_load_pretrained_additional_features(self): pretrained_processor_name_or_path=self.checkpoint, speaker_embeddings_dict_path=self.speaker_embeddings_dict_path, ) - """ - TODO (ebezzam) not all speaker embedding are properly downloaded. - My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs) - https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89 - https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L188 - So for testing purposes, we will remove the unavailable speaker embeddings before saving. - """ + + # TODO (ebezzam) not all speaker embedding are properly downloaded. + # My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs) + # https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89 + # https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L188 + # So for testing purposes, we will remove the unavailable speaker embeddings before saving. processor._verify_speaker_embeddings(remove_unavailable=True) processor.save_pretrained( self.tmpdirname, From e3e3542c800af719c25e919e11d303bbb0829230 Mon Sep 17 00:00:00 2001 From: Eric B Date: Mon, 21 Jul 2025 10:07:23 +0200 Subject: [PATCH 5/5] Fix formatting. --- tests/models/bark/test_processor_bark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/bark/test_processor_bark.py b/tests/models/bark/test_processor_bark.py index 73451135b708..39d0550af4df 100644 --- a/tests/models/bark/test_processor_bark.py +++ b/tests/models/bark/test_processor_bark.py @@ -55,7 +55,7 @@ def test_save_load_pretrained_additional_features(self): pretrained_processor_name_or_path=self.checkpoint, speaker_embeddings_dict_path=self.speaker_embeddings_dict_path, ) - + # TODO (ebezzam) not all speaker embedding are properly downloaded. # My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs) # https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89