Delete MakeItTalk/animated.py

MakeItTalk/animated.py: DELETED (+0 -277)
@@ -1,277 +0,0 @@
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import torch

# this ensures that the current macOS version is at least 12.3+
print(torch.backends.mps.is_available())
# this ensures that the current PyTorch installation was built with MPS support
print(torch.backends.mps.is_built())
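# Note (not in the original script): a defensive device pick, assuming later
# cells want GPU acceleration only when it exists; falls back to CPU on
# machines without MPS (Intel Macs or macOS < 12.3).
device = 'mps' if torch.backends.mps.is_available() else 'cpu'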

# %%
import ipywidgets as widgets
import glob
import matplotlib.pyplot as plt
from IPython.display import display  # explicit import so this also runs outside a live notebook
print("Choose the image name to animate: (saved in folder 'MakeItTalk/examples/')")
img_list = glob.glob1('examples', '*.jpg')
img_list.sort()
img_list = [item.split('.')[0] for item in img_list]
default_head_name = widgets.Dropdown(options=img_list, value='marlene_v2')
def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))
        plt.axis('off')
        plt.show()
default_head_name.observe(on_change)
display(default_head_name)
plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))
plt.axis('off')
plt.show()

# %%
#@markdown # Animation Controllers
#@markdown Amplify the lip motion in the horizontal direction
AMP_LIP_SHAPE_X = 2 #@param {type:"slider", min:0.5, max:5.0, step:0.1}

#@markdown Amplify the lip motion in the vertical direction
AMP_LIP_SHAPE_Y = 2 #@param {type:"slider", min:0.5, max:5.0, step:0.1}

#@markdown Amplify the head pose motion (usually smaller than 1.0; set it to 0.0 for a static head pose)
AMP_HEAD_POSE_MOTION = 0.35 #@param {type:"slider", min:0.0, max:1.0, step:0.05}

#@markdown Add naive eye blink
ADD_NAIVE_EYE = True #@param ["False", "True"] {type:"raw"}

#@markdown If your image has an open mouth, set this to True, else False
CLOSE_INPUT_FACE_MOUTH = True #@param ["False", "True"] {type:"raw"}


#@markdown # Landmark Adjustment

#@markdown Adjust upper lip thickness (positive value means thicker)
UPPER_LIP_ADJUST = -1 #@param {type:"slider", min:-3.0, max:3.0, step:1.0}

#@markdown Adjust lower lip thickness (positive value means thicker)
LOWER_LIP_ADJUST = -1 #@param {type:"slider", min:-3.0, max:3.0, step:1.0}

#@markdown Adjust static lip width (as a multiplication factor)
LIP_WIDTH_ADJUST = 1.0 #@param {type:"slider", min:0.8, max:1.2, step:0.01}
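# Note (not in the original script): the #@param / #@markdown annotations above
# are Colab form-widget markers; outside Colab they are ordinary comments and
# the plain assignments take effect unchanged.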

# %%
import sys
sys.path.append("thirdparty/AdaptiveWingLoss")
import os, glob
import numpy as np
import cv2
import argparse
from src.approaches.train_image_translation import Image_translation_block
import torch
import pickle
import face_alignment
from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor
import shutil
import time
import util.utils as util
from scipy.signal import savgol_filter
from src.approaches.train_audio2landmark import Audio2landmark_model

# %%
sys.stdout = open(os.devnull, 'a')  # silence stdout; progress messages below go to stderr

parser = argparse.ArgumentParser()
parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name.value))
parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')
parser.add_argument('--load_AUTOVC_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_autovc.pth')
parser.add_argument('--load_a2l_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth')
parser.add_argument('--load_a2l_C_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')
parser.add_argument('--load_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c
parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)
parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)
parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)
parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',
parser.add_argument('--add_audio_in', default=False, action='store_true')
parser.add_argument('--comb_fan_awing', default=False, action='store_true')
parser.add_argument('--output_folder', type=str, default='examples')
parser.add_argument('--test_end2end', default=True, action='store_true')
parser.add_argument('--dump_dir', type=str, default='', help='')
parser.add_argument('--pos_dim', default=7, type=int)
parser.add_argument('--use_prior_net', default=True, action='store_true')
parser.add_argument('--transformer_d_model', default=32, type=int)
parser.add_argument('--transformer_N', default=2, type=int)
parser.add_argument('--transformer_heads', default=2, type=int)
parser.add_argument('--spk_emb_enc_size', default=16, type=int)
parser.add_argument('--init_content_encoder', type=str, default='')
parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')
parser.add_argument('--write', default=False, action='store_true')
parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')
parser.add_argument('--emb_coef', default=3.0, type=float)
parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)
parser.add_argument('--use_11spk_only', default=False, action='store_true')
parser.add_argument('-f')  # swallow the -f <connection-file> flag that Jupyter passes to the kernel
opt_parser = parser.parse_args()

# %%
img = cv2.imread('MakeItTalk/examples/' + opt_parser.jpg)
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # OpenCV loads BGR; convert so matplotlib shows true colors

# %%
predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='mps', flip_input=True)
shapes = predictor.get_landmarks(img)
if (not shapes or len(shapes) != 1):
    print('Cannot detect face landmarks. Exit.')
    exit(-1)
shape_3d = shapes[0]
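# Note (not in the original script): with LandmarksType._3D, get_landmarks()
# returns a list with one (68, 3) array per detected face, so shape_3d is a
# (68, 3) array in the standard 68-point iBUG ordering (indices 48-67 cover
# the mouth, which the adjustments below rely on).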

# %%
if(opt_parser.close_input_face_mouth):
    util.close_input_face_mouth(shape_3d)
shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * LIP_WIDTH_ADJUST + np.mean(shape_3d[48:, 0])  # scale lip width about its center
shape_3d[49:54, 1] -= UPPER_LIP_ADJUST  # adjust upper lip thickness
shape_3d[55:60, 1] += LOWER_LIP_ADJUST  # adjust lower lip thickness
shape_3d[[37, 38, 43, 44], 1] -= 2.  # larger eyes: raise the upper eyelids
shape_3d[[40, 41, 46, 47], 1] += 2.  # larger eyes: lower the lower eyelids
shape_3d, scale, shift = util.norm_input_face(shape_3d)

print("Loaded Image...", file=sys.stderr)

# %%
au_data = []
au_emb = []
ains = glob.glob1('examples', '*.wav')
ains = [item for item in ains if item != 'tmp.wav']
ains.sort()
for ain in ains:
    os.system('ffmpeg -y -loglevel error -i MakeItTalk/examples/{} -ar 16000 MakeItTalk/examples/tmp.wav'.format(ain))
    shutil.copyfile('MakeItTalk/examples/tmp.wav', 'MakeItTalk/examples/{}'.format(ain))

    # au embedding
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    me, ae = get_spk_emb('MakeItTalk/examples/{}'.format(ain))
    au_emb.append(me.reshape(-1))

    print('Processing audio file', ain)
    c = AutoVC_mel_Convertor('examples')

    au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('examples', ain),
                                                     autovc_model_path=opt_parser.load_AUTOVC_name)
    au_data += au_data_i
if(os.path.isfile('MakeItTalk/examples/tmp.wav')):
    os.remove('MakeItTalk/examples/tmp.wav')

print("Loaded audio...", file=sys.stderr)
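# Note (assumption, not in the original script): os.system() swallows ffmpeg
# failures, so if ffmpeg is missing the copy above silently keeps the original
# sample rate. A stricter variant would check the exit status, e.g.:
#     if os.system('ffmpeg -y -loglevel error -i in.wav -ar 16000 out.wav') != 0:
#         raise RuntimeError('ffmpeg resampling failed')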

# %%
# landmark fake placeholder
fl_data = []
rot_tran, rot_quat, anchor_t_shape = [], [], []
for au, info in au_data:
    au_length = au.shape[0]
    fl = np.zeros(shape=(au_length, 68 * 3))
    fl_data.append((fl, info))
    rot_tran.append(np.zeros(shape=(au_length, 3, 4)))
    rot_quat.append(np.zeros(shape=(au_length, 4)))
    anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))

if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_fl.pickle'))
if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))
if(os.path.exists(os.path.join('examples', 'dump', 'random_val_au.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_au.pickle'))
if (os.path.exists(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))):
    os.remove(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))

with open(os.path.join('examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:
    pickle.dump(fl_data, fp)
with open(os.path.join('examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:
    pickle.dump(au_data, fp)
with open(os.path.join('examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:
    gaze = {'rot_trans': rot_tran, 'rot_quat': rot_quat, 'anchor_t_shape': anchor_t_shape}
    pickle.dump(gaze, fp)
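# Note (assumption, not in the original script): the pickle dumps above expect
# the folder examples/dump to exist; a guard such as
#     os.makedirs(os.path.join('examples', 'dump'), exist_ok=True)
# placed before them avoids a FileNotFoundError on a fresh checkout.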

# %%
model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)
if(len(opt_parser.reuse_train_emb_list) == 0):
    model.test(au_emb=au_emb)
else:
    model.test(au_emb=None)

print("Audio->Landmark...", file=sys.stderr)

# %%
fls = glob.glob1('examples', 'pred_fls_*.txt')
fls.sort()

for i in range(0, len(fls)):
    fl = np.loadtxt(os.path.join('examples', fls[i])).reshape((-1, 68, 3))
    print(fls[i])
    fl[:, :, 0:2] = -fl[:, :, 0:2]
    fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift

    if (ADD_NAIVE_EYE):
        fl = util.add_naive_eye(fl)

    # additional smoothing
    fl = fl.reshape((-1, 204))
    fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)
    fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], 5, 3, axis=0)
    fl = fl.reshape((-1, 68, 3))

    ''' STEP 6: Image2image translation '''
    model = Image_translation_block(opt_parser, single_test=True)
    with torch.no_grad():
        model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])
        print('finish image2image gen')
    os.remove(os.path.join('examples', fls[i]))

    print("{} / {}: Landmark->Face...".format(i + 1, len(fls)), file=sys.stderr)
print("Done!", file=sys.stderr)
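# Note (not in the original script): savgol_filter requires window_length (15
# and 5 above) to be no larger than the number of frames, so very short audio
# clips would raise a ValueError in the smoothing step.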

# %% [markdown]
# # Generated video from image and sound clip

# %%
from IPython.display import Video

Video("MakeItTalk/examples/marlenes_v1.mp4")


# %%

# %%
from IPython.display import HTML
from base64 import b64encode

for ain in ains:
    OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(
        opt_parser.jpg.split('.')[0],
        ain.split('.')[0]
    )
    mp4 = open('MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), 'rb').read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    print('Display animation: MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), file=sys.stderr)
    display(HTML("""
    <video width=600 controls>
      <source src="%s" type="video/mp4">
    </video>
    """ % data_url))
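# Note (not in the original script): base64-encoding the mp4 into a data URL
# keeps the output self-contained but inflates the payload by roughly a third,
# so long clips noticeably bloat the saved notebook.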