
Commit 6efbb1c

new detector combined
1 parent dae7abe · commit 6efbb1c

13 files changed

Lines changed: 532 additions & 254 deletions

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -41,5 +41,5 @@ Thumbs.db
 #########################
 data/
 protos/
-utils/__init__.py
-
+utils/
+*.pth

LICENSE.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+Copyright (c) 2016-present Joon Son Chung.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

README.md

Lines changed: 4 additions & 17 deletions
@@ -4,28 +4,15 @@ This repository contains the demo for the audio-to-video synchronisation network
 1. Removing temporal lags between the audio and visual streams in a video;
 2. Determining who is speaking amongst multiple faces in a video.
 
-The model can be used for research purposes under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>. Please cite the paper below if you make use of the software.
+Please cite the paper below if you make use of the software.
 
-## Prerequisites
-The following packages are required to run the SyncNet demo:
+## Dependencies
 ```
-python (2.7.12)
-pytorch (0.4.0)
-numpy (1.14.3)
-scipy (1.0.1)
-opencv-python (3.4.0) - via opencv-contrib-python
-python_speech_features (0.6)
-cuda (8.0)
-ffmpeg (3.4.2)
+pip install -r requirements.txt
 ```
 
-In addition to above, these are required to run the full pipeline:
-```
-tensorflow (1.2, 1.4)
-pyscenedetect (0.5)
-```
+In addition, `ffmpeg` is required.
 
-The demo has been tested with the package versions shown above, but may also work on other versions.
 
 ## Demo
 
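Note (not part of this commit): a quick way to sanity-check the new dependency setup before running the demo. The module names below are inferred from the removed prerequisites list above and may not match `requirements.txt` exactly.

```
import importlib
import shutil

# ffmpeg must be on the PATH, per the new README.
assert shutil.which('ffmpeg'), 'ffmpeg not found on PATH'

# Module names inferred from the old prerequisites list; adjust to requirements.txt.
for module in ['torch', 'numpy', 'scipy', 'cv2', 'python_speech_features']:
    importlib.import_module(module)
print('all dependencies importable')
```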

SyncNetModel.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
 import torch
 import torch.nn as nn
 

detectors/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Face detector
+
+This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.

detectors/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .s3fd import S3FD

detectors/s3fd/__init__.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import time
+import numpy as np
+import cv2
+import torch
+from torchvision import transforms
+from .nets import S3FDNet
+from .box_utils import nms_
+
+PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth'
+img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
+
+
+class S3FD():
+
+    def __init__(self, device='cuda'):
+
+        tstamp = time.time()
+        self.device = device
+
+        print('[S3FD] loading with', self.device)
+        self.net = S3FDNet(device=self.device).to(self.device)
+        state_dict = torch.load(PATH_WEIGHT, map_location=self.device)
+        self.net.load_state_dict(state_dict)
+        self.net.eval()
+        print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
+
+    def detect_faces(self, image, conf_th=0.8, scales=[1]):
+
+        w, h = image.shape[1], image.shape[0]
+
+        bboxes = np.empty(shape=(0, 5))
+
+        with torch.no_grad():
+            for s in scales:
+                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
+
+                scaled_img = np.swapaxes(scaled_img, 1, 2)
+                scaled_img = np.swapaxes(scaled_img, 1, 0)
+                scaled_img = scaled_img[[2, 1, 0], :, :]
+                scaled_img = scaled_img.astype('float32')
+                scaled_img -= img_mean
+                scaled_img = scaled_img[[2, 1, 0], :, :]
+                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
+                y = self.net(x)
+
+                detections = y.data
+                scale = torch.Tensor([w, h, w, h])
+
+                for i in range(detections.size(1)):
+                    j = 0
+                    while detections[0, i, j, 0] > conf_th:
+                        score = detections[0, i, j, 0]
+                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
+                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
+                        bboxes = np.vstack((bboxes, bbox))
+                        j += 1
+
+        keep = nms_(bboxes, 0.1)
+        bboxes = bboxes[keep]
+
+        return bboxes
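Note (not part of this commit): a minimal sketch of how the new detector class might be driven. `sample.jpg` is a hypothetical input; the sketch assumes it is run from the repository root (so `PATH_WEIGHT` resolves) and that the pretrained weights have been downloaded.

```
import cv2
from detectors import S3FD

detector = S3FD(device='cuda')  # 'cpu' also works, just slower

# Hypothetical input: any BGR image loaded by OpenCV.
image = cv2.imread('sample.jpg')

# Each row of the result is (x1, y1, x2, y2, confidence) for one detected face.
bboxes = detector.detect_faces(image, conf_th=0.8, scales=[1])
for x1, y1, x2, y2, conf in bboxes:
    print('face (%.0f, %.0f)-(%.0f, %.0f), score %.2f' % (x1, y1, x2, y2, conf))
```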

detectors/s3fd/box_utils.py

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
+import numpy as np
+from itertools import product as product
+import torch
+from torch.autograd import Function
+
+
+def nms_(dets, thresh):
+    """
+    Courtesy of Ross Girshick
+    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
+    """
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(int(i))
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+
+    return np.array(keep).astype(np.int)
+
+
+def decode(loc, priors, variances):
+    """Decode locations from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions
+    """
+
+    boxes = torch.cat((
+        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
+        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
+    boxes[:, :2] -= boxes[:, 2:] / 2
+    boxes[:, 2:] += boxes[:, :2]
+    return boxes
+
+
+def nms(boxes, scores, overlap=0.5, top_k=200):
+    """Apply non-maximum suppression at test time to avoid detecting too many
+    overlapping bounding boxes for a given object.
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class predscores for the img, Shape:[num_priors].
+        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
+        top_k: (int) The Maximum number of box preds to consider.
+    Return:
+        The indices of the kept boxes with respect to num_priors.
+    """
+
+    keep = scores.new(scores.size(0)).zero_().long()
+    if boxes.numel() == 0:
+        return keep, 0
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+    area = torch.mul(x2 - x1, y2 - y1)
+    v, idx = scores.sort(0)  # sort in ascending order
+    # I = I[v >= 0.01]
+    idx = idx[-top_k:]  # indices of the top-k largest vals
+    xx1 = boxes.new()
+    yy1 = boxes.new()
+    xx2 = boxes.new()
+    yy2 = boxes.new()
+    w = boxes.new()
+    h = boxes.new()
+
+    # keep = torch.Tensor()
+    count = 0
+    while idx.numel() > 0:
+        i = idx[-1]  # index of current largest val
+        # keep.append(i)
+        keep[count] = i
+        count += 1
+        if idx.size(0) == 1:
+            break
+        idx = idx[:-1]  # remove kept element from view
+        # load bboxes of next highest vals
+        torch.index_select(x1, 0, idx, out=xx1)
+        torch.index_select(y1, 0, idx, out=yy1)
+        torch.index_select(x2, 0, idx, out=xx2)
+        torch.index_select(y2, 0, idx, out=yy2)
+        # store element-wise max with next highest score
+        xx1 = torch.clamp(xx1, min=x1[i])
+        yy1 = torch.clamp(yy1, min=y1[i])
+        xx2 = torch.clamp(xx2, max=x2[i])
+        yy2 = torch.clamp(yy2, max=y2[i])
+        w.resize_as_(xx2)
+        h.resize_as_(yy2)
+        w = xx2 - xx1
+        h = yy2 - yy1
+        # check sizes of xx1 and xx2.. after each iteration
+        w = torch.clamp(w, min=0.0)
+        h = torch.clamp(h, min=0.0)
+        inter = w * h
+        # IoU = i / (area(a) + area(b) - i)
+        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
+        union = (rem_areas - inter) + area[i]
+        IoU = inter / union  # store result in iou
+        # keep only elements with an IoU <= overlap
+        idx = idx[IoU.le(overlap)]
+    return keep, count
+
+
+class Detect(object):
+
+    def __init__(self, num_classes=2,
+                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
+                 variance=[0.1, 0.2], nms_top_k=5000):
+
+        self.num_classes = num_classes
+        self.top_k = top_k
+        self.nms_thresh = nms_thresh
+        self.conf_thresh = conf_thresh
+        self.variance = variance
+        self.nms_top_k = nms_top_k
+
+    def forward(self, loc_data, conf_data, prior_data):
+
+        num = loc_data.size(0)
+        num_priors = prior_data.size(0)
+
+        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
+        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
+        batch_priors = batch_priors.contiguous().view(-1, 4)
+
+        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
+        decoded_boxes = decoded_boxes.view(num, num_priors, 4)
+
+        output = torch.zeros(num, self.num_classes, self.top_k, 5)
+
+        for i in range(num):
+            boxes = decoded_boxes[i].clone()
+            conf_scores = conf_preds[i].clone()
+
+            for cl in range(1, self.num_classes):
+                c_mask = conf_scores[cl].gt(self.conf_thresh)
+                scores = conf_scores[cl][c_mask]
+
+                if scores.dim() == 0:
+                    continue
+                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
+                boxes_ = boxes[l_mask].view(-1, 4)
+                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
+                count = count if count < self.top_k else self.top_k
+
+                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
+
+        return output
+
+
+class PriorBox(object):
+
+    def __init__(self, input_size, feature_maps,
+                 variance=[0.1, 0.2],
+                 min_sizes=[16, 32, 64, 128, 256, 512],
+                 steps=[4, 8, 16, 32, 64, 128],
+                 clip=False):
+
+        super(PriorBox, self).__init__()
+
+        self.imh = input_size[0]
+        self.imw = input_size[1]
+        self.feature_maps = feature_maps
+
+        self.variance = variance
+        self.min_sizes = min_sizes
+        self.steps = steps
+        self.clip = clip
+
+    def forward(self):
+        mean = []
+        for k, fmap in enumerate(self.feature_maps):
+            feath = fmap[0]
+            featw = fmap[1]
+            for i, j in product(range(feath), range(featw)):
+                f_kw = self.imw / self.steps[k]
+                f_kh = self.imh / self.steps[k]
+
+                cx = (j + 0.5) / f_kw
+                cy = (i + 0.5) / f_kh
+
+                s_kw = self.min_sizes[k] / self.imw
+                s_kh = self.min_sizes[k] / self.imh
+
+                mean += [cx, cy, s_kw, s_kh]
+
+        output = torch.FloatTensor(mean).view(-1, 4)
+
+        if self.clip:
+            output.clamp_(max=1, min=0)
+
+        return output
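Note (not part of this commit): a toy check of the greedy `nms_` above, with made-up boxes. It keeps the highest-scoring box, drops the near-duplicate whose IoU with it (roughly 0.82) exceeds the 0.1 threshold, and keeps the distant box. `astype(np.int)` assumes a numpy version contemporary with this commit; newer releases removed the `np.int` alias.

```
import numpy as np
from detectors.s3fd.box_utils import nms_

# Made-up detections as (x1, y1, x2, y2, score); the first two overlap heavily.
dets = np.array([[ 10.,  10.,  50.,  50., 0.9],
                 [ 12.,  12.,  52.,  52., 0.8],
                 [100., 100., 140., 140., 0.7]])

keep = nms_(dets, 0.1)
print(keep)  # [0 2] -- the lower-scored near-duplicate is suppressed
```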
