
Commit 6efbb1c

new detector combined
1 parent dae7abe · commit 6efbb1c

13 files changed

Lines changed: 532 additions & 254 deletions

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -41,5 +41,5 @@ Thumbs.db
 #########################
 data/
 protos/
-utils/__init__.py
-
+utils/
+*.pth

LICENSE.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+Copyright (c) 2016-present Joon Son Chung.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

README.md

Lines changed: 4 additions & 17 deletions
@@ -4,28 +4,15 @@ This repository contains the demo for the audio-to-video synchronisation network
 1. Removing temporal lags between the audio and visual streams in a video;
 2. Determining who is speaking amongst multiple faces in a video.
 
-The model can be used for research purposes under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>. Please cite the paper below if you make use of the software.
+Please cite the paper below if you make use of the software.
 
-## Prerequisites
-The following packages are required to run the SyncNet demo:
+## Dependencies
 ```
-python (2.7.12)
-pytorch (0.4.0)
-numpy (1.14.3)
-scipy (1.0.1)
-opencv-python (3.4.0) - via opencv-contrib-python
-python_speech_features (0.6)
-cuda (8.0)
-ffmpeg (3.4.2)
+pip install -r requirements.txt
 ```
 
-In addition to above, these are required to run the full pipeline:
-```
-tensorflow (1.2, 1.4)
-pyscenedetect (0.5)
-```
+In addition, `ffmpeg` is required.
 
-The demo has been tested with the package versions shown above, but may also work on other versions.
 
 ## Demo
 
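Note (not part of this commit): a quick way to sanity-check the new dependency setup before running the demo. The module names below are inferred from the removed prerequisites list above and may not match `requirements.txt` exactly.

```
import importlib
import shutil

# ffmpeg must be on the PATH, per the new README.
assert shutil.which('ffmpeg'), 'ffmpeg not found on PATH'

# Module names inferred from the old prerequisites list; adjust to requirements.txt.
for module in ['torch', 'numpy', 'scipy', 'cv2', 'python_speech_features']:
    importlib.import_module(module)
print('all dependencies importable')
```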

SyncNetModel.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
 import torch
 import torch.nn as nn
 

detectors/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Face detector
+
+This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.

detectors/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .s3fd import S3FD

detectors/s3fd/__init__.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import time
+import numpy as np
+import cv2
+import torch
+from torchvision import transforms
+from .nets import S3FDNet
+from .box_utils import nms_
+
+PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth'
+img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
+
+
+class S3FD():
+
+    def __init__(self, device='cuda'):
+
+        tstamp = time.time()
+        self.device = device
+
+        print('[S3FD] loading with', self.device)
+        self.net = S3FDNet(device=self.device).to(self.device)
+        state_dict = torch.load(PATH_WEIGHT, map_location=self.device)
+        self.net.load_state_dict(state_dict)
+        self.net.eval()
+        print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
+
+    def detect_faces(self, image, conf_th=0.8, scales=[1]):
+
+        w, h = image.shape[1], image.shape[0]
+
+        bboxes = np.empty(shape=(0, 5))
+
+        with torch.no_grad():
+            for s in scales:
+                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
+
+                scaled_img = np.swapaxes(scaled_img, 1, 2)
+                scaled_img = np.swapaxes(scaled_img, 1, 0)
+                scaled_img = scaled_img[[2, 1, 0], :, :]
+                scaled_img = scaled_img.astype('float32')
+                scaled_img -= img_mean
+                scaled_img = scaled_img[[2, 1, 0], :, :]
+                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
+                y = self.net(x)
+
+                detections = y.data
+                scale = torch.Tensor([w, h, w, h])
+
+                for i in range(detections.size(1)):
+                    j = 0
+                    while detections[0, i, j, 0] > conf_th:
+                        score = detections[0, i, j, 0]
+                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
+                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
+                        bboxes = np.vstack((bboxes, bbox))
+                        j += 1
+
+        keep = nms_(bboxes, 0.1)
+        bboxes = bboxes[keep]
+
+        return bboxes
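Note (not part of this commit): a minimal sketch of how the new detector class might be driven. `sample.jpg` is a hypothetical input; the sketch assumes it is run from the repository root (so `PATH_WEIGHT` resolves) and that the pretrained weights have been downloaded.

```
import cv2
from detectors import S3FD

detector = S3FD(device='cuda')  # 'cpu' also works, just slower

# Hypothetical input: any BGR image loaded by OpenCV.
image = cv2.imread('sample.jpg')

# Each row of the result is (x1, y1, x2, y2, confidence) for one detected face.
bboxes = detector.detect_faces(image, conf_th=0.8, scales=[1])
for x1, y1, x2, y2, conf in bboxes:
    print('face (%.0f, %.0f)-(%.0f, %.0f), score %.2f' % (x1, y1, x2, y2, conf))
```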

detectors/s3fd/box_utils.py

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
+import numpy as np
+from itertools import product as product
+import torch
+from torch.autograd import Function
+
+
+def nms_(dets, thresh):
+    """
+    Courtesy of Ross Girshick
+    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
+    """
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(int(i))
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+
+    return np.array(keep).astype(np.int)
+
+
+def decode(loc, priors, variances):
+    """Decode locations from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions
+    """
+
+    boxes = torch.cat((
+        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
+        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
+    boxes[:, :2] -= boxes[:, 2:] / 2
+    boxes[:, 2:] += boxes[:, :2]
+    return boxes
+
+
+def nms(boxes, scores, overlap=0.5, top_k=200):
+    """Apply non-maximum suppression at test time to avoid detecting too many
+    overlapping bounding boxes for a given object.
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class predscores for the img, Shape:[num_priors].
+        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
+        top_k: (int) The Maximum number of box preds to consider.
+    Return:
+        The indices of the kept boxes with respect to num_priors.
+    """
+
+    keep = scores.new(scores.size(0)).zero_().long()
+    if boxes.numel() == 0:
+        return keep, 0
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+    area = torch.mul(x2 - x1, y2 - y1)
+    v, idx = scores.sort(0)  # sort in ascending order
+    # I = I[v >= 0.01]
+    idx = idx[-top_k:]  # indices of the top-k largest vals
+    xx1 = boxes.new()
+    yy1 = boxes.new()
+    xx2 = boxes.new()
+    yy2 = boxes.new()
+    w = boxes.new()
+    h = boxes.new()
+
+    # keep = torch.Tensor()
+    count = 0
+    while idx.numel() > 0:
+        i = idx[-1]  # index of current largest val
+        # keep.append(i)
+        keep[count] = i
+        count += 1
+        if idx.size(0) == 1:
+            break
+        idx = idx[:-1]  # remove kept element from view
+        # load bboxes of next highest vals
+        torch.index_select(x1, 0, idx, out=xx1)
+        torch.index_select(y1, 0, idx, out=yy1)
+        torch.index_select(x2, 0, idx, out=xx2)
+        torch.index_select(y2, 0, idx, out=yy2)
+        # store element-wise max with next highest score
+        xx1 = torch.clamp(xx1, min=x1[i])
+        yy1 = torch.clamp(yy1, min=y1[i])
+        xx2 = torch.clamp(xx2, max=x2[i])
+        yy2 = torch.clamp(yy2, max=y2[i])
+        w.resize_as_(xx2)
+        h.resize_as_(yy2)
+        w = xx2 - xx1
+        h = yy2 - yy1
+        # check sizes of xx1 and xx2.. after each iteration
+        w = torch.clamp(w, min=0.0)
+        h = torch.clamp(h, min=0.0)
+        inter = w * h
+        # IoU = i / (area(a) + area(b) - i)
+        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
+        union = (rem_areas - inter) + area[i]
+        IoU = inter / union  # store result in iou
+        # keep only elements with an IoU <= overlap
+        idx = idx[IoU.le(overlap)]
+    return keep, count
+
+
+class Detect(object):
+
+    def __init__(self, num_classes=2,
+                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
+                 variance=[0.1, 0.2], nms_top_k=5000):
+
+        self.num_classes = num_classes
+        self.top_k = top_k
+        self.nms_thresh = nms_thresh
+        self.conf_thresh = conf_thresh
+        self.variance = variance
+        self.nms_top_k = nms_top_k
+
+    def forward(self, loc_data, conf_data, prior_data):
+
+        num = loc_data.size(0)
+        num_priors = prior_data.size(0)
+
+        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
+        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
+        batch_priors = batch_priors.contiguous().view(-1, 4)
+
+        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
+        decoded_boxes = decoded_boxes.view(num, num_priors, 4)
+
+        output = torch.zeros(num, self.num_classes, self.top_k, 5)
+
+        for i in range(num):
+            boxes = decoded_boxes[i].clone()
+            conf_scores = conf_preds[i].clone()
+
+            for cl in range(1, self.num_classes):
+                c_mask = conf_scores[cl].gt(self.conf_thresh)
+                scores = conf_scores[cl][c_mask]
+
+                if scores.dim() == 0:
+                    continue
+                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
+                boxes_ = boxes[l_mask].view(-1, 4)
+                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
+                count = count if count < self.top_k else self.top_k
+
+                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
+
+        return output
+
+
+class PriorBox(object):
+
+    def __init__(self, input_size, feature_maps,
+                 variance=[0.1, 0.2],
+                 min_sizes=[16, 32, 64, 128, 256, 512],
+                 steps=[4, 8, 16, 32, 64, 128],
+                 clip=False):
+
+        super(PriorBox, self).__init__()
+
+        self.imh = input_size[0]
+        self.imw = input_size[1]
+        self.feature_maps = feature_maps
+
+        self.variance = variance
+        self.min_sizes = min_sizes
+        self.steps = steps
+        self.clip = clip
+
+    def forward(self):
+        mean = []
+        for k, fmap in enumerate(self.feature_maps):
+            feath = fmap[0]
+            featw = fmap[1]
+            for i, j in product(range(feath), range(featw)):
+                f_kw = self.imw / self.steps[k]
+                f_kh = self.imh / self.steps[k]
+
+                cx = (j + 0.5) / f_kw
+                cy = (i + 0.5) / f_kh
+
+                s_kw = self.min_sizes[k] / self.imw
+                s_kh = self.min_sizes[k] / self.imh
+
+                mean += [cx, cy, s_kw, s_kh]
+
+        output = torch.FloatTensor(mean).view(-1, 4)
+
+        if self.clip:
+            output.clamp_(max=1, min=0)
+
+        return output
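Note (not part of this commit): a toy check of the greedy `nms_` above, with made-up boxes. It keeps the highest-scoring box, drops the near-duplicate whose IoU with it (roughly 0.82) exceeds the 0.1 threshold, and keeps the distant box. `astype(np.int)` assumes a numpy version contemporary with this commit; newer releases removed the `np.int` alias.

```
import numpy as np
from detectors.s3fd.box_utils import nms_

# Made-up detections as (x1, y1, x2, y2, score); the first two overlap heavily.
dets = np.array([[ 10.,  10.,  50.,  50., 0.9],
                 [ 12.,  12.,  52.,  52., 0.8],
                 [100., 100., 140., 140., 0.7]])

keep = nms_(dets, 0.1)
print(keep)  # [0 2] -- the lower-scored near-duplicate is suppressed
```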
