1. 3D Convolutional Neural Networks
Compared with a 2D convolutional neural network, a 3D convolutional neural network makes far better use of the temporal information in video, so it is mainly applied to video analysis and action recognition. A 3D CNN simply treats time as a third dimension of the input.
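To make that input layout concrete, here is a minimal sketch of a single 3D convolution in PyTorch (PyTorch is also assumed later for the ONNX export; the layer sizes below are illustrative and are not the 3D ResNet-34 used in this article):

import torch
import torch.nn as nn

# A video clip is a 5D tensor: (batch, channels, time, height, width).
# 16 RGB frames of 112x112 match the sampling used by the scripts below.
clip = torch.randn(1, 3, 16, 112, 112)

# A 3D convolution slides its kernel over time as well as over height and width.
conv3d = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=(3, 7, 7),
                   stride=(1, 2, 2), padding=(1, 3, 3))
out = conv3d(clip)
print(out.shape)  # torch.Size([1, 64, 16, 56, 56])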
Practical applications of human action recognition:
Security surveillance (detecting and recognizing abnormal behavior such as fighting or theft).
Supervising and training new workers to make sure tasks are carried out correctly (for example, the steps of making an egg-filled pancake: kneading the dough, rolling it out, cracking the eggs, spreading the pancake, and so on).
Checking whether food-service staff wash their hands as required.
Automatically classifying video data.
In real-world settings, human action recognition has to cope with cluttered backgrounds, occlusion, viewpoint changes, and similar conditions. A person can recognize the action easily, but for a computer it is far from simple, for example when the target changes scale or the viewing angle shifts.
2. Human Action Recognition Model
The model used here is a 3D ResNet-34; its class labels file (action_recongnition_kinetics.txt, from the Kinetics dataset) contains entries such as:
abseiling
air drumming
answering questions
applauding
applying cream
archery
arm wrestling
arranging flowers
assembling computer
auctioning
baby waking up
baking cookies
balloon blowing
bandaging
barbequing
bartending
beatboxing
bee keeping
belly dancing
bench pressing
bending back
bending metal
biking through snow
blasting sand
blowing glass
blowing leaves
blowing nose
blowing out candles
bobsledding
bookbinding
bouncing on trampoline
bowling
braiding hair
breading or breadcrumbing
breakdancing
brush painting
brushing hair
brushing teeth
building cabinet
building shed
bungee jumping
busking
canoeing or kayaking
capoeira
carrying baby
...
import os
import numpy as np
import cv2 as cv
import argparse
from common import findFile
parser = argparse.ArgumentParser(description='Use this script to run action recognition using 3D ResNet34',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', '-i', help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to model.')
parser.add_argument('--classes', default=findFile('action_recongnition_kinetics.txt'), help='Path to classes list.')
# To get net download original repository https://github.com/kenshohara/video-classification-3d-cnn-pytorch
# For correct ONNX export modify file: video-classification-3d-cnn-pytorch/models/resnet.py
# change
# - def downsample_basic_block(x, planes, stride):
# - out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# - zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
# - out.size(2), out.size(3),
# - out.size(4)).zero_()
# - if isinstance(out.data, torch.cuda.FloatTensor):
# - zero_pads = zero_pads.cuda()
# -
# - out = Variable(torch.cat([out.data, zero_pads], dim=1))
# - return out
# To
# + def downsample_basic_block(x, planes, stride):
# + out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# + out = F.pad(out, (0, 0, 0, 0, 0, 0, 0, int(planes - out.size(1)), 0, 0), "constant", 0)
# + return out
# To ONNX export use torch.onnx.export(model, inputs, model_name)
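# An illustrative export call (the variable names and the output file name below are
# assumptions, not taken from the repository): after loading the modified 3D ResNet-34
# in PyTorch, a dummy clip matching the (1, 3, 16, 112, 112) input built further down
# can serve as the tracing input, e.g.
#   dummy_input = torch.randn(1, 3, 16, 112, 112)
#   torch.onnx.export(model, dummy_input, 'resnet-34_kinetics.onnx')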
def get_class_names(path):
    # Read one class name per line from the labels file
    class_names = []
    with open(path) as f:
        for row in f:
            class_names.append(row[:-1])
    return class_names

def classify_video(video_path, net_path):
    SAMPLE_DURATION = 16   # number of frames fed to the network at once
    SAMPLE_SIZE = 112      # spatial size expected by the 3D ResNet-34
    mean = (114.7748, 107.7354, 99.4750)
    class_names = get_class_names(args.classes)

    net = cv.dnn.readNet(net_path)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    winName = 'Deep learning image classification in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cap = cv.VideoCapture(video_path)
    while cv.waitKey(1) < 0:
        # Collect a clip of SAMPLE_DURATION consecutive frames
        frames = []
        for _ in range(SAMPLE_DURATION):
            hasFrame, frame = cap.read()
            if not hasFrame:
                exit(0)
            frames.append(frame)

        # Build a (1, C, T, H, W) blob and classify the whole clip
        inputs = cv.dnn.blobFromImages(frames, 1, (SAMPLE_SIZE, SAMPLE_SIZE), mean, True, crop=True)
        inputs = np.transpose(inputs, (1, 0, 2, 3))
        inputs = np.expand_dims(inputs, axis=0)
        net.setInput(inputs)
        outputs = net.forward()
        class_pred = np.argmax(outputs)
        label = class_names[class_pred]

        # Overlay the predicted label on every frame of the clip
        for frame in frames:
            labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv.rectangle(frame, (0, 10 - labelSize[1]),
                         (labelSize[0], 10 + baseLine), (255, 255, 255), cv.FILLED)
            cv.putText(frame, label, (0, 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
            cv.imshow(winName, frame)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break

if __name__ == "__main__":
    args, _ = parser.parse_known_args()
    classify_video(args.input if args.input else 0, args.model)
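The transpose and expand_dims steps are easy to get wrong, so here is a small standalone sketch (no model required; the 320x240 frame size is arbitrary) showing how the blob's shape evolves from what cv.dnn.blobFromImages returns to the (batch, channels, time, height, width) layout the 3D network expects:

import numpy as np
import cv2 as cv

frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(16)]
blob = cv.dnn.blobFromImages(frames, 1, (112, 112), (114.7748, 107.7354, 99.4750), True, crop=True)
print(blob.shape)   # (16, 3, 112, 112)   -> (T, C, H, W)
blob = np.transpose(blob, (1, 0, 2, 3))
print(blob.shape)   # (3, 16, 112, 112)   -> (C, T, H, W)
blob = np.expand_dims(blob, axis=0)
print(blob.shape)   # (1, 3, 16, 112, 112) -> (N, C, T, H, W)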
3. Code
Environment:
Windows 10
PyCharm
Anaconda3
Python 3.7
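Besides OpenCV, the script below also imports numpy and imutils; in an Anaconda environment like the one above they can be installed with pip (package names inferred from the imports, so adjust them if your setup differs):

pip install numpy opencv-python imutils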
File structure:
Code:
from collections import deque
import numpy as np
import argparse
import imutils
import cv2
# Build the argument parser
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="path to trained human activity recognition model")
ap.add_argument("-c", "--classes", required=True, help="path to class labels file")
ap.add_argument("-i", "--input", type=str, default="", help="optional path to video file")
args = vars(ap.parse_args())

# Class labels, sample duration (number of frames) and sample size (spatial dimensions)
CLASSES = open(args["classes"]).read().strip().split("\n")
SAMPLE_DURATION = 16
SAMPLE_SIZE = 112
print("Processing...")

# Create the frame queue; it keeps only the most recent SAMPLE_DURATION frames
frames = deque(maxlen=SAMPLE_DURATION)

# Load the model
net = cv2.dnn.readNet(args["model"])

# Video to analyze (fall back to the camera when no file is given)
vs = cv2.VideoCapture(args["input"] if args["input"] else 0)
writer = None

# Loop over the video stream
while True:
    # Read the next frame
    (grabbed, frame) = vs.read()
    # Stop when the video has ended
    if not grabbed:
        print("No more frames to read...")
        break
    # Resize the frame and push it into the queue
    frame = imutils.resize(frame, width=640)
    frames.append(frame)
    # Wait until the queue holds a full sample
    if len(frames) < SAMPLE_DURATION:
        continue
    # Once the queue is full, keep processing every new frame
    blob = cv2.dnn.blobFromImages(frames, 1.0, (SAMPLE_SIZE, SAMPLE_SIZE), (114.7748, 107.7354, 99.4750),
                                  swapRB=True, crop=True)
    blob = np.transpose(blob, (1, 0, 2, 3))
    blob = np.expand_dims(blob, axis=0)
    # Run the prediction
    net.setInput(blob)
    outputs = net.forward()
    label = CLASSES[np.argmax(outputs)]
    # Draw the label box
    cv2.rectangle(frame, (0, 0), (300, 40), (255, 0, 0), -1)
    cv2.putText(frame, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
    # cv2.imshow("Activity Recognition", frame)
    # Save the annotated frames
    if writer is None:
        # Initialize the video writer
        # fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter("E:\\work\\activity-recognition-demo\\videos\\output\\xishou1.mp4", fourcc, 30,
                                 (frame.shape[1], frame.shape[0]), True)
    writer.write(frame)
    # Press q to quit
    # key = cv2.waitKey(1) & 0xFF
    # if key == ord("q"):
    #     break

print("Done...")
if writer is not None:
    writer.release()
vs.release()
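An example invocation (the script name and file paths here are placeholders; substitute the actual paths in your project):

python human_activity_recognition.py -m resnet-34_kinetics.onnx -c action_recongnition_kinetics.txt -i videos/xishou.mp4

Omitting -i makes the script read from the default camera instead of a video file.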
4. Testing
Test 1: Washing hands
OpenCV human activity detection: washing hands
https://www.bilibili.com/video/av96440536/
The "washing hands" label is drawn in the upper-left corner of the video.
Test 2: Yoga
Test video for the clip shown above: https://www.bilibili.com/video/BV13E411c7QK/
The video is recognized as "yoga", and at the same time the action being performed is identified as "stretching leg".