LatentSync Issue Analysis
Face not detected
- stable-paw-comfyui adds an `error_code` field: when the server hits "face not detected" it sends a callback to the backend, the backend adds the same field, and the frontend can catch it and ask the user to upload a better-quality video. This only treats the symptom, though; the user experience is still not great.
- The real fix has to come from the source: when a frame contains no detectable face, skip that frame instead of raising an exception. This part touches the PyTorch stack. The change is in `ComfyUI-LatentSyncWrapper/latentsync/utils/image_processor.py`:
```python
    def detect_facial_landmarks(self, image: np.ndarray):
        height, width, _ = image.shape
        results = self.face_mesh.process(image)
        if not results.multi_face_landmarks:  # Face not detected
            print("Skipping frame: No face detected")
            return None  # Return None instead of raising an error
        face_landmarks = results.multi_face_landmarks[0]  # Only use the first face in the image
        landmark_coordinates = [
            (int(landmark.x * width), int(landmark.y * height)) for landmark in face_landmarks.landmark
        ]  # x means width, y means height
        return landmark_coordinates

    def affine_transform(self, image: torch.Tensor):
        # Convert image to numpy array if necessary
        if isinstance(image, torch.Tensor):
            image = rearrange(image, "c h w -> h w c").numpy()

        # Detect facial landmarks
        if self.fa is None:
            landmark_coordinates = self.detect_facial_landmarks(image)
            if landmark_coordinates is None:  # No face detected
                return None, None, None  # Skip this frame
            lm68 = mediapipe_lm478_to_face_alignment_lm68(landmark_coordinates)
        else:
            detected_faces = self.fa.get_landmarks(image)
            if detected_faces is None:  # No face detected
                return None, None, None  # Skip this frame
            lm68 = detected_faces[0]

        # Perform affine transformation
        points = self.smoother.smooth(lm68)
        lmk3_ = np.zeros((3, 2))
        lmk3_[0] = points[17:22].mean(0)
        lmk3_[1] = points[22:27].mean(0)
        lmk3_[2] = points[27:36].mean(0)
        face, affine_matrix = self.restorer.align_warp_face(
            image.copy(), lmks3=lmk3_, smooth=True, border_mode="constant"
        )
        box = [0, 0, face.shape[1], face.shape[0]]  # x1, y1, x2, y2
        face = cv2.resize(face, (self.resolution, self.resolution), interpolation=cv2.INTER_CUBIC)
        face = rearrange(torch.from_numpy(face), "h w c -> c h w")
        return face, box, affine_matrix

    def preprocess_fixed_mask_image(self, image: torch.Tensor, affine_transform=False):
        if affine_transform:
            result = self.affine_transform(image)
            if result[0] is None:  # No face detected
                return None  # Skip this frame
            image, _, _ = result
        else:
            image = self.resize(image)
        pixel_values = self.normalize(image / 255.0)
        masked_pixel_values = pixel_values * self.mask_image
        return pixel_values, masked_pixel_values, self.mask_image[0:1]

    def prepare_masks_and_masked_images(self, images: Union[torch.Tensor, np.ndarray], affine_transform=False):
        if isinstance(images, np.ndarray):
            images = torch.from_numpy(images)
        if images.shape[3] == 3:
            images = rearrange(images, "b h w c -> b c h w")

        pixel_values_list, masked_pixel_values_list, masks_list = [], [], []
        for image in images:
            if self.mask == "fix_mask":
                result = self.preprocess_fixed_mask_image(image, affine_transform=affine_transform)
            else:
                result = self.preprocess_one_masked_image(image)
            if result is not None:  # Skip frames where no face is detected
                pixel_values, masked_pixel_values, mask = result
                pixel_values_list.append(pixel_values)
                masked_pixel_values_list.append(masked_pixel_values)
                masks_list.append(mask)

        if not pixel_values_list:  # If no valid frames were processed
            return None, None, None
        return torch.stack(pixel_values_list), torch.stack(masked_pixel_values_list), torch.stack(masks_list)


if __name__ == "__main__":
    image_processor = ImageProcessor(512, mask="fix_mask")
    video = cv2.VideoCapture("/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/original/val/RD_Radio57_000.mp4")
    while True:
        ret, frame = video.read()
        if not ret:
            break
        frame = rearrange(torch.Tensor(frame).type(torch.uint8), "h w c -> c h w")
        result = image_processor.affine_transform(frame)
        if result[0] is not None:  # Only process frames where a face is detected
            face, _, _ = result
            face = (rearrange(face, "c h w -> h w c").detach().cpu().numpy()).astype(np.uint8)
            cv2.imwrite("face.jpg", face)
            break
```
Model download issues
- Because the service has to stay stable, we need to guard against online models going missing, slow image pulls, and upstream model updates, so every model the service touches must be pulled from our Amazon S3 storage.
- In practice, the service auto-downloads two models:
  - the VAE from Hugging Face
  - the LatentSync checkpoint from ByteDance
The VAE download is triggered by this line in the source: `vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)`. The fix is to download the model from S3 to local disk and point that call at the local copy.
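A minimal sketch of that change, assuming the VAE weights synced from S3 land in a local directory (the path below is illustrative):

```python
import torch
from diffusers import AutoencoderKL

# Load the sd-vae-ft-mse weights from a local copy synced from S3
# instead of letting diffusers reach out to the Hugging Face hub.
LOCAL_VAE_DIR = "/opt/models/sd-vae-ft-mse"  # illustrative path
vae = AutoencoderKL.from_pretrained(LOCAL_VAE_DIR, torch_dtype=torch.float16)
```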
For the LatentSync checkpoint, the solution is to put it straight into the cache folder: this step is baked into the image build, and the same logic also runs after the service starts:
```python
def download_from_s3(bucket, key, save_path):
```
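Only the signature appears above; a minimal sketch of such a helper, assuming boto3 (the bucket, key, and cache path are illustrative, not the production values):

```python
import os
import boto3

def download_from_s3(bucket, key, save_path):
    """Fetch one object from S3 to a local path, creating parent directories."""
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    boto3.client("s3").download_file(bucket, key, save_path)

# Illustrative usage: put the LatentSync checkpoint where the wrapper expects it.
download_from_s3("my-model-bucket", "latentsync/latentsync_unet.pt",
                 "/root/.cache/latentsync/latentsync_unet.pt")
```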
Lip-sync mismatch
This isn't a model problem. In the ComfyUI workflow, the output video frame rate was set to 24, while LatentSync computes the number of output frames as the input audio duration × 25 fps. Writing those frames out at 24 fps therefore makes the audio and video fall out of sync.
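To make the drift concrete, a quick back-of-the-envelope check (the 10 s figure is just an example):

```python
# Illustrative numbers: 10 s of input audio, LatentSync's 25 fps render rate.
audio_seconds = 10.0
generated_frames = audio_seconds * 25      # 250 frames are generated

playback_seconds = generated_frames / 24   # written out at 24 fps -> ~10.42 s of video
drift = playback_seconds - audio_seconds   # ~0.42 s of drift over a 10 s clip
print(generated_frames, playback_seconds, drift)
```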
Fixing it only required changes in the Java backend: updating the workflow parsing (workflowparse) and the content metadata (contentmetadata) was enough.
Backend timeout issue
- The endpoint's timeout was set to only 10 minutes.
- Added the following logic to `LatentSyncAudio2VideoCreationHandler.java`:
```java
public int estimateGenerationDuration(Content content) {
    StablePawLatentSyncAudio2VideoContentMetadata contentMetadata = fetchMetadata(content);
    if (contentMetadata != null) {
        return estimateGenerationDurationFromMetadata(contentMetadata);
    }
    return 30;
}

protected int estimateGenerationDurationFromMetadata(StablePawLatentSyncAudio2VideoContentMetadata contentMetadata) {
    return (int) (contentMetadata.getDuration() / 2000);
}
```
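Assuming `getDuration()` returns the audio length in milliseconds, a 60-second clip yields an estimate of 30, the same value used as the fallback when no metadata is available.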
High RAM usage on every request
While running the service, GPU utilization was actually low; it was CPU and RAM usage that were very high.
Reading the source shows this happens in two stages:
- Whisper audio-feature extraction loads everything about the model, not just its parameters. This stage is tolerable: for a 30 s audio clip, RAM tops out around 40 GB, not enough to crash the worker.
- After inference, all the video frames have to be assembled into the output video, so memory usage explodes as the audio gets longer.
On top of that, each finished run leaves behind a large cache from the previous task, so the worker needs to be refreshed after every task.
The solution:
- The frontend and backend jointly limit the audio length and video resolution a user can submit.
- stable-paw-comfyui on the server side adds a worker-refresh step (see the sketch below).
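One possible shape for that refresh step, assuming the deployed ComfyUI build exposes the `/free` management route (the route, flags, and host below are assumptions to verify against the version in use):

```python
import requests

def refresh_comfyui_worker(host="http://127.0.0.1:8188"):
    """Ask ComfyUI to unload cached models and free memory once a task finishes."""
    resp = requests.post(f"{host}/free",
                         json={"unload_models": True, "free_memory": True},
                         timeout=10)
    resp.raise_for_status()
```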