From b6fc11891d45ba0ae13a7410ccf84154bb43b37c Mon Sep 17 00:00:00 2001 From: kento2247 Date: Sun, 21 Dec 2025 20:17:18 +0900 Subject: [PATCH 1/3] add * into wheel --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6f96a8f7..5c3f091b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ ] [tool.setuptools.packages.find] +where = ["."] include = ["sam_audio*"] [tool.ruff] From 32d7cb418b7287295e44eedcdca88e662806a523 Mon Sep 17 00:00:00 2001 From: kento2247 Date: Sun, 21 Dec 2025 20:48:57 +0900 Subject: [PATCH 2/3] up --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5c3f091b..ce6b753f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ readme = "README.md" license = { file="LICENSE" } -requires-python = ">=3.10" +requires-python = ">=3.11" dependencies = [ "dacvae@git+https://github.com/facebookresearch/dacvae.git", "audiobox_aesthetics", @@ -23,9 +23,9 @@ dependencies = [ "numpy", "perception-models@git+https://github.com/facebookresearch/perception_models@unpin-deps", "pydub", - "torch", + "torch==2.6.0", "torchaudio", - "torchcodec", + "torchcodec==0.2", "torchdiffeq", "torchvision", "transformers>=4.54.0", From 9fc461b483c431d05fde404571f65b058cc86926 Mon Sep 17 00:00:00 2001 From: kento2247 Date: Tue, 6 Jan 2026 17:53:25 +0900 Subject: [PATCH 3/3] fix vram usage --- sam_audio/model/model.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/sam_audio/model/model.py b/sam_audio/model/model.py index 1bf5c586..28d8e8d5 100644 --- a/sam_audio/model/model.py +++ b/sam_audio/model/model.py @@ -79,8 +79,9 @@ class SAMAudio(BaseModel): def __init__(self, cfg: SAMAudioConfig): super().__init__() self.audio_codec = DACVAE(cfg.audio_codec) - self.text_encoder = T5TextEncoder(cfg.text_encoder) - self.vision_encoder = PerceptionEncoder(cfg.vision_encoder) + self.text_encoder = T5TextEncoder(cfg.text_encoder) # base + # self.vision_encoder = PerceptionEncoder(cfg.vision_encoder) + self.vision_encoder_dim = cfg.vision_encoder.dim # 1024 self.transformer = DiT(cfg.transformer) self.proj = torch.nn.Linear(cfg.in_channels, cfg.transformer.dim) self.align_masked_video = AlignModalities( @@ -91,15 +92,17 @@ def __init__(self, cfg: SAMAudioConfig): ) self.memory_proj = torch.nn.Linear(cfg.text_encoder.dim, cfg.transformer.dim) self.timestep_emb = SinusoidalEmbedding(cfg.transformer.dim) - self.visual_ranker = create_ranker(cfg.visual_ranker) - self.text_ranker = create_ranker(cfg.text_ranker) - if cfg.span_predictor is not None: - self.span_predictor = PEAudioFrame.from_config( - cfg.span_predictor, pretrained=True - ) - self.span_predictor_transform = PEAudioFrameTransform.from_config( - cfg.span_predictor - ) + self.visual_ranker = None # create_ranker(cfg.visual_ranker) + self.text_ranker = None # create_ranker(cfg.text_ranker) + self.span_predictor = None + self.span_predictor_transform = None + # if cfg.span_predictor is not None: + # self.span_predictor = PEAudioFrame.from_config( + # cfg.span_predictor, pretrained=True + # ) + # self.span_predictor_transform = PEAudioFrameTransform.from_config( + # cfg.span_predictor + # ) @property def sample_rate(self): @@ -186,9 +189,9 @@ def _get_audio_features(self, audios: torch.Tensor): def _get_video_features(self, video, audio_features): B, T, _ = audio_features.shape if video is None: - return audio_features.new_zeros(B, self.vision_encoder.dim, T) + return audio_features.new_zeros(B, self.vision_encoder_dim, T) else: - return self.vision_encoder(video).transpose(1, 2) + raise Exception("Video not supported") def _repeat_for_reranking(self, tensor, candidates): if candidates > 1: @@ -354,7 +357,7 @@ def load_state_dict(self, state_dict, strict=True): ) missing_keys = [x for x in missing_keys if not re.search(skip_regex, x)] if len(missing_keys) > 0 or len(unexpected_keys) > 0: - raise RuntimeError( + print( f"Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}" )