MTG · palonso · May 26, 2023 · May 26, 2023 · May 26, 2023 · dbogdanov
diff --git a/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp b/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictEffnetDiscogs::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Supported models at https://essentia.upf.edu/models/\n\n");

diff --git a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
@@ -159,12 +159,17 @@ const char* TensorflowPredictFSDSINet::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "Note: The FSD-SINet models were trained on normalized audio clips. "
   "Clip-level normalization is only implemented in standard mode since in streaming there is no access to the entire audio clip. "
   "In the streaming case, the user is responsible for controlling the dynamic range of the input signal. "
   "Ideally, the signal should be zero-mean (no DC) and normalized to the full dynamic range (-1, 1).\n\n"
   "References:\n"
-  "  [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n"
+  "  [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n\n"
   "  [2] https://github.com/edufonseca/shift_sec"
 );
 

diff --git a/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp b/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
@@ -158,6 +158,11 @@ const char* TensorflowPredictMusiCNN::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Pons, J., & Serra, X. (2019). musicnn: Pre-trained convolutional neural "

diff --git a/src/algorithms/machinelearning/tensorflowpredictvggish.cpp b/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictVGGish::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Gemmeke, J. et. al., AudioSet: An ontology and human-labelled dataset "

diff --git a/src/algorithms/standard/tensortovectorreal.cpp b/src/algorithms/standard/tensortovectorreal.cpp
@@ -36,6 +36,7 @@ void TensorToVectorReal::configure() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }
 
 
@@ -44,6 +45,7 @@ void TensorToVectorReal::reset() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }
 
 
@@ -66,6 +68,11 @@ AlgorithmStatus TensorToVectorReal::process() {
     _timeStamps = tensor.dimension(2);
     _featsSize = tensor.dimension(3);
 
+    if (_channels > 1 && !_warned) {  // warn only once; _warned is cleared again in configure() and reset()
+      E_WARNING("TensorToVectorReal: The channel axis (dimension 1) of the input tensor has size larger than 1, but the output of this algorithm is 2D. The batch, channel, and time axes (dimensions 0, 1, 2) will be flattened to the first dimension of the output matrix.");
+      _warned = true;  // suppress repeats on subsequent process() calls
+    }
+
     _frame.setAcquireSize(_timeStamps * _channels * _batchSize);
     _frame.setReleaseSize(_timeStamps * _channels *_batchSize);
 

diff --git a/src/algorithms/standard/tensortovectorreal.h b/src/algorithms/standard/tensortovectorreal.h
@@ -37,6 +37,7 @@ class TensorToVectorReal : public Algorithm {
   int _channels;
   int _timeStamps;
   int _featsSize;
+  bool _warned;
 
  public:
   TensorToVectorReal(){