Commit 7deda0a

added GetSkipFrames() / SetSkipFrames() to actionNet

dusty-nv committed Mar 7, 2023 (1 parent: aa268d6)

Showing 5 changed files with 202 additions and 56 deletions.
52 changes: 39 additions & 13 deletions c/actionNet.cpp
@@ -32,14 +32,19 @@
// constructor
actionNet::actionNet() : tensorNet()
{
- mNumClasses = 0;
- mNumFrames = 0;
+ mThreshold = 0.01f;
+ mNumClasses = 0;
+ mNumFrames = 0;
+ mSkipFrames = 1;
+ mFramesSkipped = 10000; // so the very first frame always gets processed

mInputBuffers[0] = NULL;
mInputBuffers[1] = NULL;

mCurrentInputBuffer = 0;
mCurrentFrameIndex = 0;
mLastClassification = 0;
mLastConfidence = 0.0f;
}


@@ -118,6 +123,10 @@ actionNet* actionNet::Create( const commandLine& cmdLine )
if( cmdLine.GetFlag("profile") )
net->EnableLayerProfiler();

// parse additional arguments
net->SetThreshold(cmdLine.GetFloat("threshold", net->GetThreshold()));
net->SetSkipFrames(cmdLine.GetUnsignedInt("skip_frames", net->GetSkipFrames()));

return net;
}

@@ -285,7 +294,7 @@ int actionNet::Classify( void* image, uint32_t width, uint32_t height, imageForm
if( !image || width == 0 || height == 0 )
{
LogError(LOG_TRT "actionNet::Classify( 0x%p, %u, %u ) -> invalid parameters\n", image, width, height);
- return -1;
+ return -2;
}

if( !imageFormatIsRGB(format) )
@@ -300,18 +309,32 @@
return false;
}

// skip frames as needed
if( mFramesSkipped < mSkipFrames )
{
//LogVerbose(LOG_TRT "actionNet::Classify() -- skipping frame (framesSkipped=%u skipFrames=%u)\n", mFramesSkipped, mSkipFrames);

if( confidence != NULL )
*confidence = mLastConfidence;

mFramesSkipped++;
return mLastClassification;
}

mFramesSkipped = 0;

// apply input pre-processing
if( !preProcess(image, width, height, format) )
{
LogError(LOG_TRT "actionNet::Classify() -- tensor pre-processing failed\n");
- return -1;
+ return -2;
}

// process with TRT
PROFILER_BEGIN(PROFILER_NETWORK);

if( !ProcessNetwork() )
- return -1;
+ return -2;

PROFILER_END(PROFILER_NETWORK);
PROFILER_BEGIN(PROFILER_POSTPROCESS);
@@ -321,26 +344,29 @@ int actionNet::Classify( void* image, uint32_t width, uint32_t height, imageForm

// determine the maximum class
int classIndex = -1;
- float classMax = -1.0f;
+ float classMax = 0.0f;

for( size_t n=0; n < mNumClasses; n++ )
{
- const float value = mOutputs[0].CPU[n];
+ const float conf = mOutputs[0].CPU[n];

- if( value >= 0.01f )
- LogDebug("class %04zu - %f (%s)\n", n, value, mClassDesc[n].c_str());
+ if( conf < mThreshold )
+ continue;

- if( value > classMax )
+ if( conf > classMax )
{
classIndex = n;
- classMax = value;
+ classMax = conf;
}
}

PROFILER_END(PROFILER_POSTPROCESS);

if( confidence != NULL )
*confidence = classMax;

- //printf("\nmaximum class: #%i (%f) (%s)\n", classIndex, classMax, mClassDesc[classIndex].c_str());
- PROFILER_END(PROFILER_POSTPROCESS);
+ mLastConfidence = classMax;
+ mLastClassification = classIndex;

return classIndex;
}
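With this change, Classify() distinguishes its outcomes: an index >= 0 is a real classification, -1 means no class cleared the confidence threshold, and errors now return -2 (the Python binding below tests img_class < -1 accordingly); on skipped frames the previous result and confidence are replayed. A caller-side sketch of the new convention (the capture loop and the net/image variables are assumed):

    float confidence = 0.0f;
    const int cls = net->Classify(image, width, height, &confidence);

    if( cls >= 0 )
        LogVerbose("action class #%i (%s) at %.2f confidence\n", cls, net->GetClassDesc(cls), confidence);
    else if( cls == -1 )
        LogVerbose("no action above the confidence threshold\n");   // valid frame, low confidence
    else
        LogError("actionNet::Classify() encountered an error\n");   // cls <= -2 signals a failure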
61 changes: 55 additions & 6 deletions c/actionNet.h
@@ -57,6 +57,8 @@
" --labels=LABELS path to text file containing the labels for each class\n" \
" --input-blob=INPUT name of the input layer (default is '" ACTIONNET_DEFAULT_INPUT "')\n" \
" --output-blob=OUTPUT name of the output layer (default is '" ACTIONNET_DEFAULT_OUTPUT "')\n" \
" --threshold=CONF minimum confidence threshold for classification (default is 0.01)\n" \
" --skip-frames=SKIP how many frames to skip between classifications (default is 1)\n" \
" --profile enable layer profiling in TensorRT\n\n"


@@ -112,7 +114,13 @@ class actionNet : public tensorNet
virtual ~actionNet();

/**
- * Append an image to the sequence and classify the action.
+ * Append an image to the sequence and classify the action, returning the index of the top class.
+ * Either the class with the maximum confidence will be returned, or -1 if no class meets
+ * the threshold set by SetThreshold() or the `--threshold` command-line argument.
+ *
+ * If this frame was skipped due to SetSkipFrames() being used, then the last frame's results will
+ * be returned. By default, every other frame is skipped in order to lengthen the action's window.
+ *
* @param image input image in CUDA device memory.
* @param width width of the input image in pixels.
* @param height height of the input image in pixels.
@@ -122,7 +130,13 @@
template<typename T> int Classify( T* image, uint32_t width, uint32_t height, float* confidence=NULL ) { return Classify((void*)image, width, height, imageFormatFromType<T>(), confidence); }

/**
- * Append an image to the sequence and classify the action.
+ * Append an image to the sequence and classify the action, returning the index of the top class.
+ * Either the class with the maximum confidence will be returned, or -1 if no class meets
+ * the threshold set by SetThreshold() or the `--threshold` command-line argument.
+ *
+ * If this frame was skipped due to SetSkipFrames() being used, then the last frame's results will
+ * be returned. By default, every other frame is skipped in order to lengthen the action's window.
+ *
* @param image input image in CUDA device memory.
* @param width width of the input image in pixels.
* @param height height of the input image in pixels.
@@ -139,18 +153,47 @@
/**
* Retrieve the description of a particular class.
*/
- inline const char* GetClassLabel( uint32_t index ) const { return mClassDesc[index].c_str(); }
+ inline const char* GetClassLabel( int index ) const { return GetClassDesc(index); }

/**
* Retrieve the description of a particular class.
*/
- inline const char* GetClassDesc( uint32_t index ) const { return mClassDesc[index].c_str(); }
+ inline const char* GetClassDesc( int index ) const { return index >= 0 ? mClassDesc[index].c_str() : "none"; }

/**
* Retrieve the path to the file containing the class descriptions.
*/
inline const char* GetClassPath() const { return mClassPath.c_str(); }

/**
* Return the confidence threshold used for classification.
*/
inline float GetThreshold() const { return mThreshold; }

/**
* Set the confidence threshold used for classification.
* Classes with a confidence below this threshold will be ignored.
* @note this can also be set using the `--threshold=N` command-line argument.
*/
inline void SetThreshold( float threshold ) { mThreshold = threshold; }

/**
* Return the number of frames that are skipped in between classifications.
* @see SetSkipFrames for more info.
*/
inline uint32_t GetSkipFrames() const { return mSkipFrames; }

/**
* Set the number of frames that are skipped in between classifications.
* Since actionNet operates on video sequences, it's often helpful to skip frames
* to lengthen the window of time the model gets to 'see' an action being performed.
*
* The default setting is 1, where every other frame is skipped.
* Setting this to 0 will disable it, and every frame will be processed.
* When a frame is skipped, the classification results from the last frame are returned.
*/
inline void SetSkipFrames( uint32_t frames ) { mSkipFrames = frames; }

protected:
actionNet();

@@ -160,11 +203,17 @@
float* mInputBuffers[2];

uint32_t mNumClasses;
- uint32_t mNumFrames;
+ uint32_t mNumFrames;      // number of frames fed into the model
+ uint32_t mSkipFrames;     // number of frames to skip when processing
+ uint32_t mFramesSkipped;  // frame skip counter

uint32_t mCurrentInputBuffer;
uint32_t mCurrentFrameIndex;


float mThreshold;
float mLastConfidence;
int mLastClassification;

std::vector<std::string> mClassDesc;

std::string mClassPath;
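Taken together, the new accessors let an application trade inference rate against the temporal window the model sees, without the hand-rolled skip logic the example below used to carry. A minimal configuration sketch, assuming a commandLine object as in the example app (the 0.25f threshold and skip count of 2 are illustrative values, not the defaults):

    actionNet* net = actionNet::Create(cmdLine);

    if( !net )
        return 1;

    net->SetThreshold(0.25f);   // ignore classes below 25% confidence (default is 0.01)
    net->SetSkipFrames(2);      // run the network on every 3rd frame only

    // skipped frames transparently return the last classification,
    // so the processing loop can treat every frame uniformly
    LogVerbose("skip-frames: %u  threshold: %g\n", net->GetSkipFrames(), net->GetThreshold());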
27 changes: 8 additions & 19 deletions examples/actionnet/actionnet.cpp
@@ -49,7 +49,6 @@ int usage()
printf("optional arguments:\n");
printf(" --help show this help message and exit\n");
printf(" --network=NETWORK pre-trained model to load (see below for options)\n");
printf(" --skip-frames=N how many frames to skip between classifications (default: 2)\n");
printf("positional arguments:\n");
printf(" input_URI resource URI of input stream (see videoSource below)\n");
printf(" output_URI resource URI of output stream (see videoOutput below)\n\n");
@@ -127,11 +126,6 @@ int main( int argc, char** argv )
return 1;
}

- const uint32_t skip_frames = cmdLine.GetInt("skip-frames", 2);
-
- uint32_t skipped = 0;
- float confidence = 0.0f;
- int class_id = 0;

/*
* processing loop
@@ -151,19 +145,14 @@
continue;
}

- // run inference every N frames
- skipped += 1;
-
- if( skipped % skip_frames == 0 )
- {
- class_id = net->Classify(image, input->GetWidth(), input->GetHeight(), &confidence);
- skipped = 0;
-
- if( class_id >= 0 )
- LogVerbose("actionnet: %2.5f%% class #%i (%s)\n", confidence * 100.0f, class_id, net->GetClassDesc(class_id));
- else
- LogError("actionnet: failed to classify frame\n");
- }
+ // classify the action sequence
+ float confidence = 0.0f;
+ const int class_id = net->Classify(image, input->GetWidth(), input->GetHeight(), &confidence);
+
+ if( class_id >= 0 )
+ LogVerbose("actionnet: %2.5f%% class #%i (%s)\n", confidence * 100.0f, class_id, net->GetClassDesc(class_id));
+ else
+ LogError("actionnet: failed to classify frame\n");

// overlay the results
if( class_id >= 0 )
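Since the frame skipping now lives inside actionNet and is parsed in actionNet::Create(), the example no longer needs its own --skip-frames handling; per the updated usage string, the option can be passed on the command line instead. A hypothetical invocation (the camera URI is a placeholder):

    actionnet --skip-frames=2 --threshold=0.1 /dev/video0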
99 changes: 95 additions & 4 deletions python/bindings/PyActionNet.cpp
@@ -195,7 +195,7 @@ static PyObject* PyActionNet_Classify( PyActionNet_Object* self, PyObject* args,
img_class = self->net->Classify(ptr, width, height, format, &confidence);
Py_END_ALLOW_THREADS

- if( img_class < 0 )
+ if( img_class < -1 )
{
PyErr_SetString(PyExc_Exception, LOG_PY_INFERENCE "actionNet.Classify() encountered an error classifying the image");
return NULL;
@@ -204,9 +204,7 @@
// create output objects
PyObject* pyClass = PYLONG_FROM_LONG(img_class);
PyObject* pyConf = PyFloat_FromDouble(confidence);

- // return tuple
- PyObject* tuple = PyTuple_Pack(2, pyClass, pyConf);
+ PyObject* tuple = PyTuple_Pack(2, pyClass, pyConf);

Py_DECREF(pyClass);
Py_DECREF(pyConf);
@@ -283,6 +281,95 @@
}


#define DOC_GET_THRESHOLD "Return the minimum confidence threshold for classification.\n\n" \
"Parameters: (none)\n\n" \
"Returns:\n" \
" (float) -- the confidence threshold for classification"

// GetThreshold
static PyObject* PyActionNet_GetThreshold( PyActionNet_Object* self )
{
if( !self || !self->net )
{
PyErr_SetString(PyExc_Exception, LOG_PY_INFERENCE "actionNet invalid object instance");
return NULL;
}

return PyFloat_FromDouble(self->net->GetThreshold());
}


#define DOC_SET_THRESHOLD "Set the minimum confidence threshold for classification.\n\n" \
"Parameters:\n" \
" (float) -- confidence threshold\n\n" \
"Returns: (none)"

// SetThreshold
PyObject* PyActionNet_SetThreshold( PyActionNet_Object* self, PyObject* args )
{
if( !self || !self->net )
{
PyErr_SetString(PyExc_Exception, LOG_PY_INFERENCE "actionNet invalid object instance");
return NULL;
}

float threshold = 0.0f;

if( !PyArg_ParseTuple(args, "f", &threshold) )
return NULL;

self->net->SetThreshold(threshold);
Py_RETURN_NONE;
}


#define DOC_GET_SKIP_FRAMES "Return the number of frames that are skipped in between classifications.\n\n" \
"Parameters: (none)\n\n" \
"Returns:\n" \
" (int) -- the number of frames skipped in between classifications"

// GetSkipFrames
static PyObject* PyActionNet_GetSkipFrames( PyActionNet_Object* self )
{
if( !self || !self->net )
{
PyErr_SetString(PyExc_Exception, LOG_PY_INFERENCE "actionNet invalid object instance");
return NULL;
}

return PYLONG_FROM_UNSIGNED_LONG(self->net->GetSkipFrames());
}


#define DOC_SET_SKIP_FRAMES "Set the number of frames that are skipped in between classifications.\n" \
"Since actionNet operates on video sequences, it's often helpful to skip frames\n" \
"to lengthen the window of time the model gets to 'see' an action being performed.\n\n" \
"The default setting is 1, where every other frame is skipped.\n" \
"Setting this to 0 will disable it, and every frame will be processed.\n" \
"When a frame is skipped, the classification results from the last frame are returned.\n\n" \
"Parameters:\n" \
" (int) -- the number of frames skipped in between classifications\n\n" \
"Returns: (none)"

// SetSkipFrames
PyObject* PyActionNet_SetSkipFrames( PyActionNet_Object* self, PyObject* args )
{
if( !self || !self->net )
{
PyErr_SetString(PyExc_Exception, LOG_PY_INFERENCE "actionNet invalid object instance");
return NULL;
}

int skipFrames = 0;

if( !PyArg_ParseTuple(args, "i", &skipFrames) )
return NULL;

self->net->SetSkipFrames(skipFrames);
Py_RETURN_NONE;
}


#define DOC_USAGE_STRING "Return the command line parameters accepted by __init__()\n\n" \
"Parameters: (none)\n\n" \
"Returns:\n" \
@@ -307,6 +394,10 @@ static PyMethodDef pyActionNet_Methods[] =
{ "GetNumClasses", (PyCFunction)PyActionNet_GetNumClasses, METH_NOARGS, DOC_GET_NUM_CLASSES},
{ "GetClassLabel", (PyCFunction)PyActionNet_GetClassDesc, METH_VARARGS, DOC_GET_CLASS_DESC},
{ "GetClassDesc", (PyCFunction)PyActionNet_GetClassDesc, METH_VARARGS, DOC_GET_CLASS_DESC},
{ "GetThreshold", (PyCFunction)PyActionNet_GetThreshold, METH_NOARGS, DOC_GET_THRESHOLD},
{ "SetThreshold", (PyCFunction)PyActionNet_SetThreshold, METH_VARARGS, DOC_SET_THRESHOLD},
{ "GetSkipFrames", (PyCFunction)PyActionNet_GetSkipFrames, METH_NOARGS, DOC_GET_SKIP_FRAMES},
{ "SetSkipFrames", (PyCFunction)PyActionNet_SetSkipFrames, METH_VARARGS, DOC_SET_SKIP_FRAMES},
{ "Usage", (PyCFunction)PyActionNet_Usage, METH_NOARGS|METH_STATIC, DOC_USAGE_STRING},
{NULL} /* Sentinel */
};