diff --git a/.gitignore b/.gitignore
index 801935a6..506d5d35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,7 +157,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file. For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 examples/results/*
 gfpgan/*
@@ -166,4 +166,7 @@ assets/*
 results/*
 Dockerfile
 start_docker.sh
-start.sh
\ No newline at end of file
+start.sh
+
+# Mac
+.DS_Store
diff --git a/README.md b/README.md
index af83e996..7724c7cf 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,7 @@
 
 
-             [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)       [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)       [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)
-
+     [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)   [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)   [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)   [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker)
 
 
 
 Wenxuan Zhang *,1,2
@@ -121,9 +120,10 @@ Tutorials from communities: [中文windows教程](https://www.bilibili.com/video
 ### Windows ([中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)):
 
 1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH".
-2. Install [git](https://git-scm.com/download/win).
-3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows).
+2. Install [git](https://git-scm.com/download/win) manually (or via [scoop](https://scoop.sh/): `scoop install git`).
+3. Install `ffmpeg`, following [these instructions](https://www.wikihow.com/Install-FFmpeg-on-Windows) (or via [scoop](https://scoop.sh/): `scoop install ffmpeg`).
 4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
-5. Run `start.bat` from Windows Explorer as normal, non-administrator, user, a gradio WebUI demo will be started.
+5. Download the `checkpoint` and `gfpgan` folders [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
+6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a Gradio WebUI demo will start.
 
 ### Macbook:
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..05bcbd58
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,35 @@
+build:
+  gpu: true
+  cuda: "11.3"
+  python_version: "3.8"
+  system_packages:
+    - "ffmpeg"
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+  python_packages:
+    - "torch==1.12.1"
+    - "torchvision==0.13.1"
+    - "torchaudio==0.12.1"
+    - "joblib==1.1.0"
+    - "scikit-image==0.19.3"
+    - "basicsr==1.4.2"
+    - "facexlib==0.3.0"
+    - "resampy==0.3.1"
+    - "pydub==0.25.1"
+    - "scipy==1.10.1"
+    - "kornia==0.6.8"
+    - "face_alignment==1.3.5"
+    - "imageio==2.19.3"
+    - "imageio-ffmpeg==0.4.7"
+    - "librosa==0.9.2"
+    - "tqdm==4.65.0"
+    - "yacs==0.1.8"
+    - "gfpgan==1.3.8"
+    - "dlib-bin==19.24.1"
+    - "av==10.0.0"
+    - "trimesh==3.9.20"
+  run:
+    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
+    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"
+
+predict: "predict.py:Predictor"
diff --git a/docs/FAQ.md b/docs/FAQ.md
index fe758809..6451a226 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -26,3 +26,27 @@ Make sure you have downloaded the checkpoints and gfpgan as [here](https://githu
 **Q: RuntimeError: unexpected EOF, expected 237192 more bytes. The file might be corrupted.**
 
 The files are not automatically downloaded. Please update the code and download the gfpgan folders as [here](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
+
+**Q: CUDA out of memory error**
+
+Please refer to https://stackoverflow.com/questions/73747731/runtimeerror-cuda-out-of-memory-how-setting-max-split-size-mb
+
+```
+# windows
+set PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+python inference.py ...
+
+# linux
+export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+python inference.py ...
+```
+
+**Q: Error while decoding stream #0:0: Invalid data found when processing input [mp3float @ 0000015037628c00] Header missing**
+
+Our method only supports `.wav` and `.mp3` files as input; please make sure the audio you feed in is in one of these formats.
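+
+If your audio is in another format, you can convert it to `.wav` first with `ffmpeg` (a minimal sketch; `input.m4a` is a placeholder for your own file):
+
+```
+ffmpeg -i input.m4a output.wav
+```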
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..1bfcd28e
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,189 @@
+"""run bash scripts/download_models.sh first to prepare the weights file"""
+import os
+import shutil
+from argparse import Namespace
+from src.utils.preprocess import CropAndExtract
+from src.test_audio2coeff import Audio2Coeff
+from src.facerender.animate import AnimateFromCoeff
+from src.generate_batch import get_data
+from src.generate_facerender_batch import get_facerender_data
+from src.utils.init_path import init_path
+from cog import BasePredictor, Input, Path
+
+checkpoints = "checkpoints"
+
+
+class Predictor(BasePredictor):
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+        device = "cuda"
+
+        sadtalker_paths = init_path(checkpoints, os.path.join("src", "config"))
+
+        # init models
+        self.preprocess_model = CropAndExtract(sadtalker_paths, device)
+
+        self.audio_to_coeff = Audio2Coeff(
+            sadtalker_paths,
+            device,
+        )
+
+        # one animator instance is kept per preprocess mode
+        self.animate_from_coeff = {
+            "full": AnimateFromCoeff(
+                sadtalker_paths,
+                device,
+            ),
+            "others": AnimateFromCoeff(
+                sadtalker_paths,
+                device,
+            ),
+        }
+
+    def predict(
+        self,
+        source_image: Path = Input(
+            description="Upload the source image, it can be video.mp4 or picture.png",
+        ),
+        driven_audio: Path = Input(
+            description="Upload the driven audio, accepts .wav and .mp3 files",
+        ),
+        enhancer: str = Input(
+            description="Choose a face enhancer",
+            choices=["gfpgan", "RestoreFormer"],
+            default="gfpgan",
+        ),
+        preprocess: str = Input(
+            description="how to preprocess the images",
+            choices=["crop", "resize", "full"],
+            default="full",
+        ),
+        ref_eyeblink: Path = Input(
+            description="path to reference video providing eye blinking",
+            default=None,
+        ),
+        ref_pose: Path = Input(
+            description="path to reference video providing pose",
+            default=None,
+        ),
+        still: bool = Input(
+            description="can crop back to the original videos for the full body animation when preprocess is full",
+            default=True,
+        ),
+    ) -> Path:
+        """Run a single prediction on the model"""
+
+        animate_from_coeff = (
+            self.animate_from_coeff["full"]
+            if preprocess == "full"
+            else self.animate_from_coeff["others"]
+        )
+
+        args = load_default()
+        args.pic_path = str(source_image)
+        args.audio_path = str(driven_audio)
+        device = "cuda"
+        args.still = still
+        args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink)
+        args.ref_pose = None if ref_pose is None else str(ref_pose)
+
+        # crop image and extract 3dmm from image
+        results_dir = "results"
+        if os.path.exists(results_dir):
+            shutil.rmtree(results_dir)
+        os.makedirs(results_dir)
+        first_frame_dir = os.path.join(results_dir, "first_frame_dir")
+        os.makedirs(first_frame_dir)
+
+        print("3DMM Extraction for source image")
+        first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
+            args.pic_path, first_frame_dir, preprocess, source_image_flag=True
+        )
+        if first_coeff_path is None:
+            print("Can't get the coeffs of the input")
+            return
+
+        if ref_eyeblink is not None:
+            ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
+            ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname)
+            os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
+            print("3DMM Extraction for the reference video providing eye blinking")
+            ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate(
+                ref_eyeblink, ref_eyeblink_frame_dir
+            )
+        else:
+            ref_eyeblink_coeff_path = None
+
+        if ref_pose is not None:
+            if ref_pose == ref_eyeblink:
+                ref_pose_coeff_path = ref_eyeblink_coeff_path
+            else:
+                ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+                ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname)
+                os.makedirs(ref_pose_frame_dir, exist_ok=True)
+                print("3DMM Extraction for the reference video providing pose")
+                ref_pose_coeff_path, _, _ = self.preprocess_model.generate(
+                    ref_pose, ref_pose_frame_dir
+                )
+        else:
+            ref_pose_coeff_path = None
+
+        # audio2coeff
+        batch = get_data(
+            first_coeff_path,
+            args.audio_path,
+            device,
+            ref_eyeblink_coeff_path,
+            still=still,
+        )
+        coeff_path = self.audio_to_coeff.generate(
+            batch, results_dir, args.pose_style, ref_pose_coeff_path
+        )
+        # coeff2video
+        print("coeff2video")
+        data = get_facerender_data(
+            coeff_path,
+            crop_pic_path,
+            first_coeff_path,
+            args.audio_path,
+            args.batch_size,
+            args.input_yaw,
+            args.input_pitch,
+            args.input_roll,
+            expression_scale=args.expression_scale,
+            still_mode=still,
+            preprocess=preprocess,
+        )
+        animate_from_coeff.generate(
+            data, results_dir, args.pic_path, crop_info,
+            enhancer=enhancer, background_enhancer=args.background_enhancer,
+            preprocess=preprocess)
+
+        output = "/tmp/out.mp4"
+        mp4_path = os.path.join(results_dir, [f for f in os.listdir(results_dir) if "enhanced.mp4" in f][0])
+        shutil.copy(mp4_path, output)
+
+        return Path(output)
+
+
+def load_default():
+    return Namespace(
+        pose_style=0,
+        batch_size=2,
+        expression_scale=1.0,
+        input_yaw=None,
+        input_pitch=None,
+        input_roll=None,
+        background_enhancer=None,
+        face3dvis=False,
+        net_recon="resnet50",
+        init_path=None,
+        use_last_fc=False,
+        bfm_folder="./src/config/",
+        bfm_model="BFM_model_front.mat",
+        focal=1015.0,
+        center=112.0,
+        camera_d=10.0,
+        z_near=5.0,
+        z_far=15.0,
+    )
diff --git a/src/facerender/animate.py b/src/facerender/animate.py
index 563d87fe..85583157 100644
--- a/src/facerender/animate.py
+++ b/src/facerender/animate.py
@@ -206,7 +206,8 @@ def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, backgr
         audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0]
         new_audio_path = os.path.join(video_save_dir, audio_name+'.wav')
         start_time = 0
-        sound = AudioSegment.from_mp3(audio_path)
+        # cog will not keep the .mp3 filename, so let pydub detect the format from the file itself
+        sound = AudioSegment.from_file(audio_path)
         frames = frame_num
         end_time = start_time + frames*1/25*1000
         word1=sound.set_frame_rate(16000)
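
With `cog.yaml` and `predict.py` in place, the predictor can be exercised locally via the Cog CLI. A minimal sketch (assuming [Cog](https://github.com/replicate/cog) is installed; the example asset paths below are from the repository's `examples/` folder and can be swapped for your own files):

```
cog predict \
  -i source_image=@examples/source_image/art_0.png \
  -i driven_audio=@examples/driven_audio/bus_chinese.wav \
  -i preprocess=full \
  -i still=true
```

Cog builds the CUDA 11.3 / Python 3.8 image described in `cog.yaml`, runs `Predictor.setup()` once, and returns the enhanced video written to `/tmp/out.mp4`.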