Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions vibevoice/modular/modeling_vibevoice_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,14 @@ def encode_speech(
else:
semantic_tokens = self.model.semantic_tokenizer.encode(speech_tensors.unsqueeze(1)).mean
semantic_features = self.model.semantic_connector(semantic_tokens)

# Ensure acoustic and semantic features have matching temporal dimensions.
# Different tokenizer architectures can produce slightly different
# frame counts, causing tensor size mismatches on concatenation.
min_len = min(acoustic_features.shape[1], semantic_features.shape[1])
if acoustic_features.shape[1] != semantic_features.shape[1]:
acoustic_features = acoustic_features[:, :min_len]
semantic_features = semantic_features[:, :min_len]
else:
# Long audio: streaming processing
# print(f"Using streaming processing for long audio: {total_samples/sample_rate:.1f}s "
Expand Down Expand Up @@ -329,6 +337,15 @@ def _iter_segments(total_length: int, segment_length: int):
# Concatenate all semantic means
semantic_tokens = torch.cat(semantic_mean_segments, dim=1).contiguous()
semantic_features = self.model.semantic_connector(semantic_tokens)

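# Ensure acoustic and semantic features have matching temporal dimensions.
# Different segment boundaries can cause slight frame-count mismatches
# between the two tokenizers (e.g. a size of 228 vs. a size of 223 along dim 1).
# NOTE(review): semantic_features is reassigned from semantic_tokens right
# after the truncation below, which discards the truncated semantic_features
# and repeats the earlier semantic_connector call — confirm this is intended.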
min_len = min(acoustic_features.shape[1], semantic_features.shape[1])
if acoustic_features.shape[1] != semantic_features.shape[1]:
acoustic_features = acoustic_features[:, :min_len]
semantic_features = semantic_features[:, :min_len]
semantic_features = self.model.semantic_connector(semantic_tokens)

# Combine acoustic and semantic features
if speech_masks is not None:
Expand Down