itechmusic committed on
Commit ·
2d271c3
1
Parent(s): 652057e
Update README.md
Browse files- README.md +46 -13
- assets/BBOX_SHIFT.md +4 -4
- assets/demo/man/man.png +3 -0
- assets/demo/musk/musk.png +3 -0
- assets/demo/sit/sit.jpeg +3 -0
- scripts/inference.py +2 -2
README.md
CHANGED
|
@@ -11,7 +11,7 @@ Chao Zhan,
|
|
| 11 |
Wenjiang Zhou
|
| 12 |
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
|
| 13 |
|
| 14 |
-
**[github](https://github.com/TMElyralab/MuseTalk)** **[huggingface](https://huggingface.co/TMElyralab/MuseTalk)** **Project (coming soon)** **Technical report (coming soon)**
|
| 15 |
|
| 16 |
We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
|
| 17 |
|
|
@@ -37,18 +37,51 @@ MuseTalk was trained in latent spaces, where the images were encoded by a freeze
|
|
| 37 |
<table class="center">
|
| 38 |
<tr style="font-weight: bolder;text-align:center;">
|
| 39 |
<td width="33%">Image</td>
|
| 40 |
-
<td width="33%">MuseV
|
| 41 |
-
<td width="33%"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
</tr>
|
| 43 |
<tr>
|
| 44 |
<td>
|
| 45 |
<img src=assets/demo/yongen/yongen.jpeg width="95%">
|
| 46 |
</td>
|
| 47 |
<td >
|
| 48 |
-
<video src=assets/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
</td>
|
| 50 |
<td >
|
| 51 |
-
<video src=
|
| 52 |
</td>
|
| 53 |
</tr>
|
| 54 |
<tr>
|
|
@@ -56,10 +89,10 @@ MuseTalk was trained in latent spaces, where the images were encoded by a freeze
|
|
| 56 |
<img src=assets/demo/monalisa/monalisa.png width="95%">
|
| 57 |
</td>
|
| 58 |
<td >
|
| 59 |
-
<video src=assets/
|
| 60 |
</td>
|
| 61 |
<td >
|
| 62 |
-
<video src=assets/
|
| 63 |
</td>
|
| 64 |
</tr>
|
| 65 |
<tr>
|
|
@@ -67,10 +100,10 @@ MuseTalk was trained in latent spaces, where the images were encoded by a freeze
|
|
| 67 |
<img src=assets/demo/sun1/sun.png width="95%">
|
| 68 |
</td>
|
| 69 |
<td >
|
| 70 |
-
<video src=assets/
|
| 71 |
</td>
|
| 72 |
<td >
|
| 73 |
-
<video src=assets/
|
| 74 |
</td>
|
| 75 |
</tr>
|
| 76 |
<tr>
|
|
@@ -78,10 +111,10 @@ MuseTalk was trained in latent spaces, where the images were encoded by a freeze
|
|
| 78 |
<img src=assets/demo/sun2/sun.png width="95%">
|
| 79 |
</td>
|
| 80 |
<td >
|
| 81 |
-
<video src=assets/
|
| 82 |
</td>
|
| 83 |
<td >
|
| 84 |
-
<video src=assets/
|
| 85 |
</td>
|
| 86 |
</tr>
|
| 87 |
</table >
|
|
@@ -96,7 +129,7 @@ MuseTalk was trained in latent spaces, where the images were encoded by a freeze
|
|
| 96 |
</tr>
|
| 97 |
<tr>
|
| 98 |
<td>
|
| 99 |
-
<video src=assets/
|
| 100 |
</td>
|
| 101 |
<td>
|
| 102 |
<a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
|
|
@@ -204,7 +237,7 @@ python -m scripts.inference --inference_config configs/inference/test.yaml --bbo
|
|
| 204 |
|
| 205 |
#### Combining MuseV and MuseTalk
|
| 206 |
|
| 207 |
-
|
| 208 |
|
| 209 |
# Note
|
| 210 |
|
|
|
|
| 11 |
Wenjiang Zhou
|
| 12 |
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
|
| 13 |
|
| 14 |
+
**[github](https://github.com/TMElyralab/MuseTalk)** **[huggingface](https://huggingface.co/TMElyralab/MuseTalk)** **Project (coming soon)** **Technical report (coming soon)**
|
| 15 |
|
| 16 |
We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
|
| 17 |
|
|
|
|
| 37 |
<table class="center">
|
| 38 |
<tr style="font-weight: bolder;text-align:center;">
|
| 39 |
<td width="33%">Image</td>
|
| 40 |
+
<td width="33%">MuseV</td>
|
| 41 |
+
<td width="33%">+MuseTalk</td>
|
| 42 |
+
</tr>
|
| 43 |
+
<tr>
|
| 44 |
+
<td>
|
| 45 |
+
<img src=assets/demo/musk/musk.png width="95%">
|
| 46 |
+
</td>
|
| 47 |
+
<td >
|
| 48 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4a4bb2d1-9d14-4ca9-85c8-7f19c39f712e controls preload></video>
|
| 49 |
+
</td>
|
| 50 |
+
<td >
|
| 51 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/b2a879c2-e23a-4d39-911d-51f0343218e4 controls preload></video>
|
| 52 |
+
</td>
|
| 53 |
</tr>
|
| 54 |
<tr>
|
| 55 |
<td>
|
| 56 |
<img src=assets/demo/yongen/yongen.jpeg width="95%">
|
| 57 |
</td>
|
| 58 |
<td >
|
| 59 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
|
| 60 |
+
</td>
|
| 61 |
+
<td >
|
| 62 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
|
| 63 |
+
</td>
|
| 64 |
+
</tr>
|
| 65 |
+
<tr>
|
| 66 |
+
<td>
|
| 67 |
+
<img src=assets/demo/man/man.png width="95%">
|
| 68 |
+
</td>
|
| 69 |
+
<td >
|
| 70 |
+
<video src= controls preload></video>
|
| 71 |
+
</td>
|
| 72 |
+
<td >
|
| 73 |
+
<video src= controls preload></video>
|
| 74 |
+
</td>
|
| 75 |
+
</tr>
|
| 76 |
+
<tr>
|
| 77 |
+
<td>
|
| 78 |
+
<img src=assets/demo/sit/sit.jpeg width="95%">
|
| 79 |
+
</td>
|
| 80 |
+
<td >
|
| 81 |
+
<video src= controls preload></video>
|
| 82 |
</td>
|
| 83 |
<td >
|
| 84 |
+
<video src= controls preload></video>
|
| 85 |
</td>
|
| 86 |
</tr>
|
| 87 |
<tr>
|
|
|
|
| 89 |
<img src=assets/demo/monalisa/monalisa.png width="95%">
|
| 90 |
</td>
|
| 91 |
<td >
|
| 92 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
|
| 93 |
</td>
|
| 94 |
<td >
|
| 95 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
|
| 96 |
</td>
|
| 97 |
</tr>
|
| 98 |
<tr>
|
|
|
|
| 100 |
<img src=assets/demo/sun1/sun.png width="95%">
|
| 101 |
</td>
|
| 102 |
<td >
|
| 103 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
|
| 104 |
</td>
|
| 105 |
<td >
|
| 106 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
|
| 107 |
</td>
|
| 108 |
</tr>
|
| 109 |
<tr>
|
|
|
|
| 111 |
<img src=assets/demo/sun2/sun.png width="95%">
|
| 112 |
</td>
|
| 113 |
<td >
|
| 114 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
|
| 115 |
</td>
|
| 116 |
<td >
|
| 117 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
|
| 118 |
</td>
|
| 119 |
</tr>
|
| 120 |
</table >
|
|
|
|
| 129 |
</tr>
|
| 130 |
<tr>
|
| 131 |
<td>
|
| 132 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4d7c5fa1-3550-4d52-8ed2-52f158150f24 controls preload></video>
|
| 133 |
</td>
|
| 134 |
<td>
|
| 135 |
<a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
|
|
|
|
| 237 |
|
| 238 |
#### Combining MuseV and MuseTalk
|
| 239 |
|
| 240 |
+
As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
|
| 241 |
|
| 242 |
# Note
|
| 243 |
|
assets/BBOX_SHIFT.md
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
## Why is there a "bbox_shift" parameter?
|
| 2 |
When processing training data, we utilize the combination of face detection results (bbox) and facial landmarks to determine the region of the head segmentation box. Specifically, we use the upper bound of the bbox as the upper boundary of the segmentation box, the maximum y value of the facial landmarks coordinates as the lower boundary of the segmentation box, and the minimum and maximum x values of the landmarks coordinates as the left and right boundaries of the segmentation box. By processing the dataset in this way, we can ensure the integrity of the face.
|
| 3 |
|
| 4 |
-
However, we have observed that the masked ratio on the face varies across different images due to the varying face shapes of subjects. Furthermore, we found that the upper-bound of the mask mainly lies close to the
|
| 5 |
|
| 6 |
-
During the inference process, we
|
| 7 |
|
| 8 |

|
| 9 |
|
| 10 |
Fig.1. Facial landmarks
|
| 11 |
### Step 0.
|
| 12 |
-
Running with the default configuration to obtain the adjustable value range
|
| 13 |
```
|
| 14 |
python -m scripts.inference --inference_config configs/inference/test.yaml
|
| 15 |
```
|
|
@@ -19,7 +19,7 @@ Total frame:「838」 Manually adjust range : [ -9~9 ] , the current value: 0
|
|
| 19 |
*************************************************************************************************************************************
|
| 20 |
```
|
| 21 |
### Step 1.
|
| 22 |
-
|
| 23 |
```
|
| 24 |
python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift xx # where xx is in [-9, 9].
|
| 25 |
```
|
|
|
|
| 1 |
## Why is there a "bbox_shift" parameter?
|
| 2 |
When processing training data, we utilize the combination of face detection results (bbox) and facial landmarks to determine the region of the head segmentation box. Specifically, we use the upper bound of the bbox as the upper boundary of the segmentation box, the maximum y value of the facial landmarks coordinates as the lower boundary of the segmentation box, and the minimum and maximum x values of the landmarks coordinates as the left and right boundaries of the segmentation box. By processing the dataset in this way, we can ensure the integrity of the face.
|
| 3 |
|
| 4 |
+
However, we have observed that the masked ratio on the face varies across different images due to the varying face shapes of subjects. Furthermore, we found that the upper-bound of the mask mainly lies close to the landmark28, landmark29 and landmark30 landmark points (as shown in Fig.1), which correspond to proportions of 15%, 63%, and 22% in the dataset, respectively.
|
| 5 |
|
| 6 |
+
During the inference process, we discover that as the upper-bound of the mask gets closer to the mouth (near landmark30), the audio features contribute more to lip movements. Conversely, as the upper-bound of the mask moves away from the mouth (near landmark28), the audio features contribute more to generating details of facial appearance. Hence, we define this characteristic as a parameter that can adjust the contribution of audio features to generating lip movements, which users can modify according to their specific needs in practical scenarios.
|
| 7 |
|
| 8 |

|
| 9 |
|
| 10 |
Fig.1. Facial landmarks
|
| 11 |
### Step 0.
|
| 12 |
+
Running with the default configuration to obtain the adjustable value range.
|
| 13 |
```
|
| 14 |
python -m scripts.inference --inference_config configs/inference/test.yaml
|
| 15 |
```
|
|
|
|
| 19 |
*************************************************************************************************************************************
|
| 20 |
```
|
| 21 |
### Step 1.
|
| 22 |
+
Re-run the script within the above range.
|
| 23 |
```
|
| 24 |
python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift xx # where xx is in [-9, 9].
|
| 25 |
```
|
assets/demo/man/man.png
ADDED
|
Git LFS Details
|
assets/demo/musk/musk.png
ADDED
|
Git LFS Details
|
assets/demo/sit/sit.jpeg
ADDED
|
Git LFS Details
|
scripts/inference.py
CHANGED
|
@@ -30,8 +30,8 @@ def main(args):
|
|
| 30 |
input_basename = os.path.basename(video_path).split('.')[0]
|
| 31 |
audio_basename = os.path.basename(audio_path).split('.')[0]
|
| 32 |
output_basename = f"{input_basename}_{audio_basename}"
|
| 33 |
-
crop_coord_save_path = os.path.join(args.result_dir, input_basename+".pkl") # only related to video input
|
| 34 |
result_img_save_path = os.path.join(args.result_dir, output_basename) # related to video & audio inputs
|
|
|
|
| 35 |
os.makedirs(result_img_save_path,exist_ok =True)
|
| 36 |
|
| 37 |
if args.output_vid_name=="":
|
|
@@ -122,7 +122,7 @@ def main(args):
|
|
| 122 |
os.system(cmd_combine_audio)
|
| 123 |
|
| 124 |
os.system("rm temp.mp4")
|
| 125 |
-
os.system(f"rm -
|
| 126 |
print(f"result is save to {output_vid_name}")
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|
|
|
|
| 30 |
input_basename = os.path.basename(video_path).split('.')[0]
|
| 31 |
audio_basename = os.path.basename(audio_path).split('.')[0]
|
| 32 |
output_basename = f"{input_basename}_{audio_basename}"
|
|
|
|
| 33 |
result_img_save_path = os.path.join(args.result_dir, output_basename) # related to video & audio inputs
|
| 34 |
+
crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
|
| 35 |
os.makedirs(result_img_save_path,exist_ok =True)
|
| 36 |
|
| 37 |
if args.output_vid_name=="":
|
|
|
|
| 122 |
os.system(cmd_combine_audio)
|
| 123 |
|
| 124 |
os.system("rm temp.mp4")
|
| 125 |
+
os.system(f"rm -rf {result_img_save_path}")
|
| 126 |
print(f"result is save to {output_vid_name}")
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|