LatentSync Issue Analysis
Face not detected
- stable-paw-comfyui adds an `error_code` field: when the server hits "face not detected" it sends a callback to the backend, the backend adds the same field, and the frontend can catch it and ask the user to upload a better-quality video. This only treats the symptom, though; the user experience is still not great.
- The real fix has to come from the source: when a frame contains no detectable face, skip that frame instead of raising an exception. This part touches the PyTorch stack. The change is in `ComfyUI-LatentSyncWrapper/latentsync/utils/image_processor.py`:
```python
    def detect_facial_landmarks(self, image: np.ndarray):
        height, width, _ = image.shape
        results = self.face_mesh.process(image)
        if not results.multi_face_landmarks:  # Face not detected
            print("Skipping frame: No face detected")
            return None  # Return None instead of raising an error
        face_landmarks = results.multi_face_landmarks[0]  # Only use the first face in the image
        landmark_coordinates = [
            (int(landmark.x * width), int(landmark.y * height)) for landmark in face_landmarks.landmark
        ]  # x means width, y means height
        return landmark_coordinates

    def affine_transform(self, image: torch.Tensor):
        # Convert image to numpy array if necessary
        if isinstance(image, torch.Tensor):
            image = rearrange(image, "c h w -> h w c").numpy()

        # Detect facial landmarks
        if self.fa is None:
            landmark_coordinates = self.detect_facial_landmarks(image)
            if landmark_coordinates is None:  # No face detected
                return None, None, None  # Skip this frame
            lm68 = mediapipe_lm478_to_face_alignment_lm68(landmark_coordinates)
        else:
            detected_faces = self.fa.get_landmarks(image)
            if detected_faces is None:  # No face detected
                return None, None, None  # Skip this frame
            lm68 = detected_faces[0]

        # Perform affine transformation
        points = self.smoother.smooth(lm68)
        lmk3_ = np.zeros((3, 2))
        lmk3_[0] = points[17:22].mean(0)
        lmk3_[1] = points[22:27].mean(0)
        lmk3_[2] = points[27:36].mean(0)
        face, affine_matrix = self.restorer.align_warp_face(
            image.copy(), lmks3=lmk3_, smooth=True, border_mode="constant"
        )
        box = [0, 0, face.shape[1], face.shape[0]]  # x1, y1, x2, y2
        face = cv2.resize(face, (self.resolution, self.resolution), interpolation=cv2.INTER_CUBIC)
        face = rearrange(torch.from_numpy(face), "h w c -> c h w")
        return face, box, affine_matrix

    def preprocess_fixed_mask_image(self, image: torch.Tensor, affine_transform=False):
        if affine_transform:
            result = self.affine_transform(image)
            if result[0] is None:  # No face detected
                return None  # Skip this frame
            image, _, _ = result
        else:
            image = self.resize(image)
        pixel_values = self.normalize(image / 255.0)
        masked_pixel_values = pixel_values * self.mask_image
        return pixel_values, masked_pixel_values, self.mask_image[0:1]

    def prepare_masks_and_masked_images(self, images: Union[torch.Tensor, np.ndarray], affine_transform=False):
        if isinstance(images, np.ndarray):
            images = torch.from_numpy(images)
        if images.shape[3] == 3:
            images = rearrange(images, "b h w c -> b c h w")

        pixel_values_list, masked_pixel_values_list, masks_list = [], [], []
        for image in images:
            if self.mask == "fix_mask":
                result = self.preprocess_fixed_mask_image(image, affine_transform=affine_transform)
            else:
                result = self.preprocess_one_masked_image(image)
            if result is not None:  # Skip frames where no face is detected
                pixel_values, masked_pixel_values, mask = result
                pixel_values_list.append(pixel_values)
                masked_pixel_values_list.append(masked_pixel_values)
                masks_list.append(mask)

        if not pixel_values_list:  # If no valid frames were processed
            return None, None, None
        return torch.stack(pixel_values_list), torch.stack(masked_pixel_values_list), torch.stack(masks_list)


if __name__ == "__main__":
    image_processor = ImageProcessor(512, mask="fix_mask")
    video = cv2.VideoCapture("/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/original/val/RD_Radio57_000.mp4")
    while True:
        ret, frame = video.read()
        if not ret:
            break
        frame = rearrange(torch.Tensor(frame).type(torch.uint8), "h w c -> c h w")
        result = image_processor.affine_transform(frame)
        if result[0] is not None:  # Only process frames where a face is detected
            face, _, _ = result
            face = (rearrange(face, "c h w -> h w c").detach().cpu().numpy()).astype(np.uint8)
            cv2.imwrite("face.jpg", face)
            break
```
Model download issues
- Because the service has to stay stable, we need to guard against online models going missing, slow image pulls, and upstream model updates, so every model the service touches must be pulled from our Amazon S3 storage.
- In practice, the service auto-downloads two models:
  - the VAE from Hugging Face
  - the LatentSync checkpoint from ByteDance
The VAE download is triggered by this line in the source: `vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)`. The fix is to download the model from S3 to local disk and point that call at the local copy.
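A minimal sketch of that change, assuming the VAE weights synced from S3 land in a local directory (the path below is illustrative):

```python
import torch
from diffusers import AutoencoderKL

# Load the sd-vae-ft-mse weights from a local copy synced from S3
# instead of letting diffusers reach out to the Hugging Face hub.
LOCAL_VAE_DIR = "/opt/models/sd-vae-ft-mse"  # illustrative path
vae = AutoencoderKL.from_pretrained(LOCAL_VAE_DIR, torch_dtype=torch.float16)
```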
For the LatentSync checkpoint, the solution is to put it straight into the cache folder: this step is baked into the image build, and the same logic also runs after the service starts:
```python
def download_from_s3(bucket, key, save_path):
```
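Only the signature appears above; a minimal sketch of such a helper, assuming boto3 (the bucket, key, and cache path are illustrative, not the production values):

```python
import os
import boto3

def download_from_s3(bucket, key, save_path):
    """Fetch one object from S3 to a local path, creating parent directories."""
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    boto3.client("s3").download_file(bucket, key, save_path)

# Illustrative usage: put the LatentSync checkpoint where the wrapper expects it.
download_from_s3("my-model-bucket", "latentsync/latentsync_unet.pt",
                 "/root/.cache/latentsync/latentsync_unet.pt")
```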
Lip-sync mismatch
This isn't a model problem. In the ComfyUI workflow, the output video frame rate was set to 24, while LatentSync computes the number of output frames as the input audio duration × 25 fps. Writing those frames out at 24 fps therefore makes the audio and video fall out of sync.
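To make the drift concrete, a quick back-of-the-envelope check (the 10 s figure is just an example):

```python
# Illustrative numbers: 10 s of input audio, LatentSync's 25 fps render rate.
audio_seconds = 10.0
generated_frames = audio_seconds * 25      # 250 frames are generated

playback_seconds = generated_frames / 24   # written out at 24 fps -> ~10.42 s of video
drift = playback_seconds - audio_seconds   # ~0.42 s of drift over a 10 s clip
print(generated_frames, playback_seconds, drift)
```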
Fixing it only required changes in the Java backend: updating the workflow parsing (workflowparse) and the content metadata (contentmetadata) was enough.
Backend timeout issue
- The endpoint's timeout was set to only 10 minutes.
- Added the following logic to `LatentSyncAudio2VideoCreationHandler.java`:
```java
public int estimateGenerationDuration(Content content) {
    StablePawLatentSyncAudio2VideoContentMetadata contentMetadata = fetchMetadata(content);
    if (contentMetadata != null) {
        return estimateGenerationDurationFromMetadata(contentMetadata);
    }
    return 30;
}

protected int estimateGenerationDurationFromMetadata(StablePawLatentSyncAudio2VideoContentMetadata contentMetadata) {
    return (int) (contentMetadata.getDuration() / 2000);
}
```
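Assuming `getDuration()` returns the audio length in milliseconds, a 60-second clip yields an estimate of 30, the same value used as the fallback when no metadata is available.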
High RAM usage on every request
While running the service, GPU utilization was actually low; it was CPU and RAM usage that were very high.
Reading the source shows this happens in two stages:
- Whisper audio-feature extraction loads everything about the model, not just its parameters. This stage is tolerable: for a 30 s audio clip, RAM tops out around 40 GB, not enough to crash the worker.
- After inference, all the video frames have to be assembled into the output video, so memory usage explodes as the audio gets longer.
On top of that, each finished run leaves behind a large cache from the previous task, so the worker needs to be refreshed after every task.
The solution:
- The frontend and backend jointly limit the audio length and video resolution a user can submit.
- stable-paw-comfyui on the server side adds a worker-refresh step (see the sketch below).
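One possible shape for that refresh step, assuming the deployed ComfyUI build exposes the `/free` management route (the route, flags, and host below are assumptions to verify against the version in use):

```python
import requests

def refresh_comfyui_worker(host="http://127.0.0.1:8188"):
    """Ask ComfyUI to unload cached models and free memory once a task finishes."""
    resp = requests.post(f"{host}/free",
                         json={"unload_models": True, "free_memory": True},
                         timeout=10)
    resp.raise_for_status()
```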