1. 3D Convolutional Neural Networks
Compared with a 2D convolutional neural network, a 3D convolutional neural network makes far better use of the temporal information in video, so it is mainly applied to video analysis and action recognition. A 3D CNN simply treats time as a third dimension of the input.
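To make that input layout concrete, here is a minimal sketch of a single 3D convolution in PyTorch (PyTorch is also assumed later for the ONNX export; the layer sizes below are illustrative and are not the 3D ResNet-34 used in this article):

import torch
import torch.nn as nn

# A video clip is a 5D tensor: (batch, channels, time, height, width).
# 16 RGB frames of 112x112 match the sampling used by the scripts below.
clip = torch.randn(1, 3, 16, 112, 112)

# A 3D convolution slides its kernel over time as well as over height and width.
conv3d = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=(3, 7, 7),
                   stride=(1, 2, 2), padding=(1, 3, 3))
out = conv3d(clip)
print(out.shape)  # torch.Size([1, 64, 16, 56, 56])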
Practical applications of human action recognition:
Security surveillance (detecting and recognizing abnormal behavior such as fighting or theft).
Supervising and training new workers to make sure tasks are carried out correctly (for example, the steps of making an egg-filled pancake: kneading the dough, rolling it out, cracking the eggs, spreading the pancake, and so on).
Checking whether food-service staff wash their hands as required.
Automatically classifying video data.
In real-world settings, human action recognition has to cope with cluttered backgrounds, occlusion, viewpoint changes, and similar conditions. A person can recognize the action easily, but for a computer it is far from simple, for example when the target changes scale or the viewing angle shifts.
2. Human Action Recognition Model
The model used here is a 3D ResNet-34; its class labels file (action_recongnition_kinetics.txt, from the Kinetics dataset) contains entries such as:
abseiling
air drumming
answering questions
applauding
applying cream
archery
arm wrestling
arranging flowers
assembling computer
auctioning
baby waking up
baking cookies
balloon blowing
bandaging
barbequing
bartending
beatboxing
bee keeping
belly dancing
bench pressing
bending back
bending metal
biking through snow
blasting sand
blowing glass
blowing leaves
blowing nose
blowing out candles
bobsledding
bookbinding
bouncing on trampoline
bowling
braiding hair
breading or breadcrumbing
breakdancing
brush painting
brushing hair
brushing teeth
building cabinet
building shed
bungee jumping
busking
canoeing or kayaking
capoeira
carrying baby
...
import os
import numpy as np
import cv2 as cv
import argparse
from common import findFile
parser = argparse.ArgumentParser(description='Use this script to run action recognition using 3D ResNet34',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', '-i', help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to model.')
parser.add_argument('--classes', default=findFile('action_recongnition_kinetics.txt'), help='Path to classes list.')
# To get net download original repository https://github.com/kenshohara/video-classification-3d-cnn-pytorch
# For correct ONNX export modify file: video-classification-3d-cnn-pytorch/models/resnet.py
# change
# - def downsample_basic_block(x, planes, stride):
# - out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# - zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
# - out.size(2), out.size(3),
# - out.size(4)).zero_()
# - if isinstance(out.data, torch.cuda.FloatTensor):
# - zero_pads = zero_pads.cuda()
# -
# - out = Variable(torch.cat([out.data, zero_pads], dim=1))
# - return out
# To
# + def downsample_basic_block(x, planes, stride):
# + out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# + out = F.pad(out, (0, 0, 0, 0, 0, 0, 0, int(planes - out.size(1)), 0, 0), "constant", 0)
# + return out
# To ONNX export use torch.onnx.export(model, inputs, model_name)
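# An illustrative export call (the variable names and the output file name below are
# assumptions, not taken from the repository): after loading the modified 3D ResNet-34
# in PyTorch, a dummy clip matching the (1, 3, 16, 112, 112) input built further down
# can serve as the tracing input, e.g.
#   dummy_input = torch.randn(1, 3, 16, 112, 112)
#   torch.onnx.export(model, dummy_input, 'resnet-34_kinetics.onnx')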
def get_class_names(path):
    # Read one class name per line from the labels file
    class_names = []
    with open(path) as f:
        for row in f:
            class_names.append(row[:-1])
    return class_names

def classify_video(video_path, net_path):
    SAMPLE_DURATION = 16   # number of frames fed to the network at once
    SAMPLE_SIZE = 112      # spatial size expected by the 3D ResNet-34
    mean = (114.7748, 107.7354, 99.4750)
    class_names = get_class_names(args.classes)

    net = cv.dnn.readNet(net_path)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    winName = 'Deep learning image classification in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cap = cv.VideoCapture(video_path)
    while cv.waitKey(1) < 0:
        # Collect a clip of SAMPLE_DURATION consecutive frames
        frames = []
        for _ in range(SAMPLE_DURATION):
            hasFrame, frame = cap.read()
            if not hasFrame:
                exit(0)
            frames.append(frame)

        # Build a (1, C, T, H, W) blob and classify the whole clip
        inputs = cv.dnn.blobFromImages(frames, 1, (SAMPLE_SIZE, SAMPLE_SIZE), mean, True, crop=True)
        inputs = np.transpose(inputs, (1, 0, 2, 3))
        inputs = np.expand_dims(inputs, axis=0)
        net.setInput(inputs)
        outputs = net.forward()
        class_pred = np.argmax(outputs)
        label = class_names[class_pred]

        # Overlay the predicted label on every frame of the clip
        for frame in frames:
            labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv.rectangle(frame, (0, 10 - labelSize[1]),
                         (labelSize[0], 10 + baseLine), (255, 255, 255), cv.FILLED)
            cv.putText(frame, label, (0, 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
            cv.imshow(winName, frame)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break

if __name__ == "__main__":
    args, _ = parser.parse_known_args()
    classify_video(args.input if args.input else 0, args.model)
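The transpose and expand_dims steps are easy to get wrong, so here is a small standalone sketch (no model required; the 320x240 frame size is arbitrary) showing how the blob's shape evolves from what cv.dnn.blobFromImages returns to the (batch, channels, time, height, width) layout the 3D network expects:

import numpy as np
import cv2 as cv

frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(16)]
blob = cv.dnn.blobFromImages(frames, 1, (112, 112), (114.7748, 107.7354, 99.4750), True, crop=True)
print(blob.shape)   # (16, 3, 112, 112)   -> (T, C, H, W)
blob = np.transpose(blob, (1, 0, 2, 3))
print(blob.shape)   # (3, 16, 112, 112)   -> (C, T, H, W)
blob = np.expand_dims(blob, axis=0)
print(blob.shape)   # (1, 3, 16, 112, 112) -> (N, C, T, H, W)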
3. Code
Environment:
Windows 10
PyCharm
Anaconda3
Python 3.7
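Besides OpenCV, the script below also imports numpy and imutils; in an Anaconda environment like the one above they can be installed with pip (package names inferred from the imports, so adjust them if your setup differs):

pip install numpy opencv-python imutils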
File structure:
Code:
from collections import deque
import numpy as np
import argparse
import imutils
import cv2
# Build the argument parser
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="path to trained human activity recognition model")
ap.add_argument("-c", "--classes", required=True, help="path to class labels file")
ap.add_argument("-i", "--input", type=str, default="", help="optional path to video file")
args = vars(ap.parse_args())

# Class labels, sample duration (number of frames) and sample size (spatial dimensions)
CLASSES = open(args["classes"]).read().strip().split("\n")
SAMPLE_DURATION = 16
SAMPLE_SIZE = 112
print("Processing...")

# Create the frame queue; it keeps only the most recent SAMPLE_DURATION frames
frames = deque(maxlen=SAMPLE_DURATION)

# Load the model
net = cv2.dnn.readNet(args["model"])

# Video to analyze (fall back to the camera when no file is given)
vs = cv2.VideoCapture(args["input"] if args["input"] else 0)
writer = None

# Loop over the video stream
while True:
    # Read the next frame
    (grabbed, frame) = vs.read()
    # Stop when the video has ended
    if not grabbed:
        print("No more frames to read...")
        break
    # Resize the frame and push it into the queue
    frame = imutils.resize(frame, width=640)
    frames.append(frame)
    # Wait until the queue holds a full sample
    if len(frames) < SAMPLE_DURATION:
        continue
    # Once the queue is full, keep processing every new frame
    blob = cv2.dnn.blobFromImages(frames, 1.0, (SAMPLE_SIZE, SAMPLE_SIZE), (114.7748, 107.7354, 99.4750),
                                  swapRB=True, crop=True)
    blob = np.transpose(blob, (1, 0, 2, 3))
    blob = np.expand_dims(blob, axis=0)
    # Run the prediction
    net.setInput(blob)
    outputs = net.forward()
    label = CLASSES[np.argmax(outputs)]
    # Draw the label box
    cv2.rectangle(frame, (0, 0), (300, 40), (255, 0, 0), -1)
    cv2.putText(frame, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
    # cv2.imshow("Activity Recognition", frame)
    # Save the annotated frames
    if writer is None:
        # Initialize the video writer
        # fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter("E:\\work\\activity-recognition-demo\\videos\\output\\xishou1.mp4", fourcc, 30,
                                 (frame.shape[1], frame.shape[0]), True)
    writer.write(frame)
    # Press q to quit
    # key = cv2.waitKey(1) & 0xFF
    # if key == ord("q"):
    #     break

print("Done...")
if writer is not None:
    writer.release()
vs.release()
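An example invocation (the script name and file paths here are placeholders; substitute the actual paths in your project):

python human_activity_recognition.py -m resnet-34_kinetics.onnx -c action_recongnition_kinetics.txt -i videos/xishou.mp4

Omitting -i makes the script read from the default camera instead of a video file.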
4. Testing
Test 1: Washing hands
OpenCV human activity detection: washing hands
https://www.bilibili.com/video/av96440536/
The "washing hands" label is drawn in the upper-left corner of the video.
Test 2: Yoga
Test video for the clip shown above: https://www.bilibili.com/video/BV13E411c7QK/
The video is recognized as "yoga", and at the same time the action being performed is identified as "stretching leg".