From 0500f9ad0e3811f8b79fb3acdeb1dd64171b7dfa Mon Sep 17 00:00:00 2001
From: palonso <pablo.alonso@upf.edu>
Date: Fri, 26 May 2023 10:23:55 +0200
Subject: [PATCH 1/3] Warn if channels>1 when converting tensor to frame

---
 src/algorithms/standard/tensortovectorreal.cpp | 7 +++++++
 src/algorithms/standard/tensortovectorreal.h   | 1 +
 2 files changed, 8 insertions(+)

diff --git a/src/algorithms/standard/tensortovectorreal.cpp b/src/algorithms/standard/tensortovectorreal.cpp
index a5092ec40..001e477ff 100644
--- a/src/algorithms/standard/tensortovectorreal.cpp
+++ b/src/algorithms/standard/tensortovectorreal.cpp
@@ -36,6 +36,7 @@ void TensorToVectorReal::configure() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }
 
 
@@ -44,6 +45,7 @@ void TensorToVectorReal::reset() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }
 
 
@@ -66,6 +68,11 @@ AlgorithmStatus TensorToVectorReal::process() {
     _timeStamps = tensor.dimension(2);
     _featsSize = tensor.dimension(3);
 
+    if (_channels != 1 && !_warned) {
+        E_WARNING("TensorToVectorReal: The channel axis (dimension 1) of the input tensor has size larger than 1, but the output of this algorithm is 2D. The batch, channel, and time axes (dimensions 0, 1, 2) will be flattened to the first dimension of the output matrix.");
+        _warned = true;
+    }
+
     _frame.setAcquireSize(_timeStamps * _channels * _batchSize);
     _frame.setReleaseSize(_timeStamps * _channels *_batchSize);
 
diff --git a/src/algorithms/standard/tensortovectorreal.h b/src/algorithms/standard/tensortovectorreal.h
index 1f196b18a..b2e04612b 100644
--- a/src/algorithms/standard/tensortovectorreal.h
+++ b/src/algorithms/standard/tensortovectorreal.h
@@ -37,6 +37,7 @@ class TensorToVectorReal : public Algorithm {
   int _channels;
   int _timeStamps;
   int _featsSize;
+  bool _warned;
 
  public:
   TensorToVectorReal(){

From 4af184c4a6c5cc7d92c8570d38593e22391d70d2 Mon Sep 17 00:00:00 2001
From: palonso <pablo.alonso@upf.edu>
Date: Fri, 26 May 2023 10:30:46 +0200
Subject: [PATCH 2/3] Add note explaining intermediate layer extraction

---
 .../machinelearning/tensorflowpredicteffnetdiscogs.cpp       | 5 +++++
 src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp | 5 +++++
 src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp  | 5 +++++
 src/algorithms/machinelearning/tensorflowpredictvggish.cpp   | 5 +++++
 4 files changed, 20 insertions(+)

diff --git a/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp b/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
index 5505d8878..3b6123ca9 100644
--- a/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictEffnetDiscogs::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Supported models at https://essentia.upf.edu/models/\n\n");
diff --git a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
index a280c6dde..c99f9914e 100644
--- a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
@@ -159,6 +159,11 @@ const char* TensorflowPredictFSDSINet::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "Note: The FSD-SINet models were trained on normalized audio clips. "
   "Clip-level normalization is only implemented in standard mode since in streaming there is no access to the entire audio clip. "
   "In the streaming case, the user is responsible for controlling the dynamic range of the input signal. "
diff --git a/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp b/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
index cd037085e..59aa13ce3 100644
--- a/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
@@ -158,6 +158,11 @@ const char* TensorflowPredictMusiCNN::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Pons, J., & Serra, X. (2019). musicnn: Pre-trained convolutional neural "
diff --git a/src/algorithms/machinelearning/tensorflowpredictvggish.cpp b/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
index 3f628c16b..4f7076227 100644
--- a/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictVGGish::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Gemmeke, J. et. al., AudioSet: An ontology and human-labelled dataset "

From 3d5cf823cd180a3d36b89f64e3bc0015a02ad606 Mon Sep 17 00:00:00 2001
From: palonso <pablo.alonso@upf.edu>
Date: Fri, 26 May 2023 10:31:53 +0200
Subject: [PATCH 3/3] Fix references formatting in TensorflowPredictFSDSINet description

---
 src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
index c99f9914e..9f8f29204 100644
--- a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
@@ -169,7 +169,7 @@ const char* TensorflowPredictFSDSINet::description = DOC(
   "In the streaming case, the user is responsible for controlling the dynamic range of the input signal. "
   "Ideally, the signal should be zero-mean (no DC) and normalized to the full dynamic range (-1, 1).\n\n"
   "References:\n"
-  "  [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n"
+  "  [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n\n"
   "  [2] https://github.com/edufonseca/shift_sec"
 );