44
55import torch
66import numpy
7- import time , pdb , argparse , subprocess , os
7+ import time , pdb , argparse , subprocess , os , math , glob
88import cv2
99import python_speech_features
1010
1111from scipy import signal
1212from scipy .io import wavfile
1313from SyncNetModel import *
14+ from shutil import rmtree
1415
1516
1617# ==================== Get OFFSET ====================
@@ -41,21 +42,33 @@ def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
4142 def evaluate (self , opt , videofile ):
4243
4344 self .__S__ .eval ();
45+
46+ # ========== ==========
47+ # Convert files
48+ # ========== ==========
49+
50+ if os .path .exists (os .path .join (opt .tmp_dir ,opt .reference )):
51+ rmtree (os .path .join (opt .tmp_dir ,opt .reference ))
52+
53+ os .makedirs (os .path .join (opt .tmp_dir ,opt .reference ))
54+
55+ command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile ,os .path .join (opt .tmp_dir ,opt .reference ,'%06d.jpg' )))
56+ output = subprocess .call (command , shell = True , stdout = None )
57+
58+ command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile ,os .path .join (opt .tmp_dir ,opt .reference ,'audio.wav' )))
59+ output = subprocess .call (command , shell = True , stdout = None )
4460
4561 # ========== ==========
4662 # Load video
4763 # ========== ==========
48- cap = cv2 .VideoCapture (videofile )
4964
50- frame_num = 1 ;
5165 images = []
52- while frame_num :
53- frame_num += 1
54- ret , image = cap .read ()
55- if ret == 0 :
56- break
66+
67+ flist = glob .glob (os .path .join (opt .tmp_dir ,opt .reference ,'*.jpg' ))
68+ flist .sort ()
5769
58- images .append (image )
70+ for fname in flist :
71+ images .append (cv2 .imread (fname ))
5972
6073 im = numpy .stack (images ,axis = 3 )
6174 im = numpy .expand_dims (im ,axis = 0 )
@@ -67,12 +80,7 @@ def evaluate(self, opt, videofile):
6780 # Load audio
6881 # ========== ==========
6982
70- audiotmp = os .path .join (opt .tmp_dir ,'audio.wav' )
71-
72- command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile ,audiotmp ))
73- output = subprocess .call (command , shell = True , stdout = None )
74-
75- sample_rate , audio = wavfile .read (audiotmp )
83+ sample_rate , audio = wavfile .read (os .path .join (opt .tmp_dir ,opt .reference ,'audio.wav' ))
7684 mfcc = zip (* python_speech_features .mfcc (audio ,sample_rate ))
7785 mfcc = numpy .stack ([numpy .array (i ) for i in mfcc ])
7886
@@ -83,15 +91,16 @@ def evaluate(self, opt, videofile):
8391 # Check audio and video input length
8492 # ========== ==========
8593
86- if (float (len (audio ))/ 16000 ) < (float (len (images ))/ 25 ) :
87- print (" *** WARNING: The audio (%.4fs) is shorter than the video (%.4fs). Type 'cont' to continue. *** " % (float (len (audio ))/ 16000 ,float (len (images ))/ 25 ))
88- pdb .set_trace ()
94+ if (float (len (audio ))/ 16000 ) != (float (len (images ))/ 25 ) :
95+ print ("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different." % (float (len (audio ))/ 16000 ,float (len (images ))/ 25 ))
96+
97+ min_length = min (len (images ),math .floor (len (audio )/ 640 ))
8998
9099 # ========== ==========
91100 # Generate video and audio feats
92101 # ========== ==========
93102
94- lastframe = len ( images ) - 5
103+ lastframe = min_length - 5
95104 im_feat = []
96105 cc_feat = []
97106
0 commit comments