Skip to content

Commit

Permalink
Modify MAEST to return tensors instead of vectors
Browse files Browse the repository at this point in the history
In other TensorflowPredict algorithms we have preferred to return 2D
outputs since they typically fit the schema (timestamps,
embeddings) or (timestamps, activations). However, in some cases more
dimensions are required. For example, we need 3D to return the attention
layers (batch, tokens, dimensions).
A similar problem has happened before when trying to retrieve the
internal representations of VGGish:
MTG#1333
  • Loading branch information
palonso committed Oct 19, 2023
1 parent 13e965b commit 6d362d0
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
35 changes: 24 additions & 11 deletions src/algorithms/machinelearning/tensorflowpredictmaest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ const char* TensorflowPredictMAEST::description = essentia::standard::Tensorflow

TensorflowPredictMAEST::TensorflowPredictMAEST() : AlgorithmComposite(),
_frameCutter(0), _tensorflowInputMusiCNN(0), _shift(0), _scale(0), _vectorRealToTensor(0),
_tensorToPool(0), _tensorflowPredict(0), _poolToTensor(0), _tensorToVectorReal(0), _configured(false) {
_tensorToPool(0), _tensorflowPredict(0), _poolToTensor(0), _configured(false) {

declareInput(_signal, 480000, "signal", "the input audio signal sampled at 16 kHz");
declareOutput(_predictions, 1, "predictions", "the output values from the model node named after `output`");
Expand All @@ -49,7 +49,6 @@ void TensorflowPredictMAEST::createInnerNetwork() {
_tensorToPool = factory.create("TensorToPool");
_tensorflowPredict = factory.create("TensorflowPredict");
_poolToTensor = factory.create("PoolToTensor");
_tensorToVectorReal = factory.create("TensorToVectorReal");

_shift->output("array").setBufferType(BufferUsage::forMultipleFrames);
_scale->output("array").setBufferType(BufferUsage::forMultipleFrames);
Expand All @@ -63,10 +62,8 @@ void TensorflowPredictMAEST::createInnerNetwork() {
_vectorRealToTensor->output("tensor") >> _tensorToPool->input("tensor");
_tensorToPool->output("pool") >> _tensorflowPredict->input("poolIn");
_tensorflowPredict->output("poolOut") >> _poolToTensor->input("pool");
_poolToTensor->output("tensor") >> _tensorToVectorReal->input("tensor");


attach(_tensorToVectorReal->output("frame"), _predictions);
attach(_poolToTensor->output("tensor"), _predictions);

_network = new scheduler::Network(_frameCutter);
}
Expand Down Expand Up @@ -180,14 +177,25 @@ const char* TensorflowPredictMAEST::category = "Machine Learning";
const char* TensorflowPredictMAEST::description = DOC(
"This algorithm makes predictions using MAEST-based models.\n"
"\n"
"Internally, it uses TensorflowInputMusiCNN for the input feature extraction "
"(mel bands). It feeds the model with mel-spectrogram patches and "
"jumps a constant amount of frames determined by `patchHopSize`.\n"
"Internally, it uses TensorflowInputMusiCNN for the input feature extraction. "
"It feeds the model with mel-spectrogram patches and jumps a constant amount "
"of frames determined by `patchHopSize`.\n"
"\n"
"By setting the `batchSize` parameter to -1 or 0 the patches are stored to run a single "
"TensorFlow session at the end of the stream. This allows to take advantage "
"of parallelization when GPUs are available, but at the same time it can be "
"memory exhausting for long files.\n"
"\n"
"For the official MAEST models, the algorithm outputs the probabilities for "
"400 music style labels by default. Additionally, it is possible to retrieve "
"the output of each attention layer by setting `output=StatefulParitionedCall:n`, "
"where `n` is the index of the layer (starting from 1).\n"
"The output from the attention layers should be interpreted as follows:\n"
" [batch_index, 1, token_number, embeddings_size]\n"
"Where the the fist and second tokens (e.g., [0, 0, :2, :]) correspond to the "
"CLS and DIST tokens respectively, and the following ones to input signal ( "
"refer to the original paper for details [1]).\n"

"\n"
"The recommended pipeline is as follows::\n"
"\n"
Expand Down Expand Up @@ -247,7 +255,7 @@ void TensorflowPredictMAEST::configure() {

void TensorflowPredictMAEST::compute() {
const vector<Real>& signal = _signal.get();
vector<vector<Real> >& predictions = _predictions.get();
Tensor<Real>& predictions = _predictions.get();

if (!signal.size()) {
throw EssentiaException("TensorflowPredictMAEST: empty input signal");
Expand All @@ -258,10 +266,15 @@ void TensorflowPredictMAEST::compute() {
_network->run();

try {
predictions = _pool.value<vector<vector<Real> > >("predictions");
vector<Tensor<Real> > predictions_vector = _pool.value<vector<Tensor<Real> > >("predictions");
predictions = predictions_vector[0];

for (int i = 1; i < (int)predictions_vector.size(); i++) {
Tensor<Real> new_predictions = predictions.concatenate(predictions_vector[i], 0).eval();
predictions = new_predictions;
}
}
catch (EssentiaException&) {
predictions.clear();
reset();

throw EssentiaException("TensorflowPredictMAEST: input signal is too short.");
Expand Down
7 changes: 3 additions & 4 deletions src/algorithms/machinelearning/tensorflowpredictmaest.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@ class TensorflowPredictMAEST : public AlgorithmComposite {
Algorithm* _tensorToPool;
Algorithm* _tensorflowPredict;
Algorithm* _poolToTensor;
Algorithm* _tensorToVectorReal;

SinkProxy<Real> _signal;
SourceProxy<std::vector<Real> > _predictions;
SourceProxy<Tensor<Real> > _predictions;

scheduler::Network* _network;
bool _configured;
Expand Down Expand Up @@ -101,7 +100,7 @@ namespace standard {
class TensorflowPredictMAEST : public Algorithm {
protected:
Input<std::vector<Real> > _signal;
Output<std::vector<std::vector<Real> > > _predictions;
Output<Tensor<Real> > _predictions;

streaming::Algorithm* _tensorflowPredictMAEST;
streaming::VectorInput<Real>* _vectorInput;
Expand All @@ -120,7 +119,7 @@ class TensorflowPredictMAEST : public Algorithm {
declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram");
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", "");
declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1876);
declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");
declareParameter("batchSize", "the batch size for prediction. This allows parallelization when GPUs are available. Set it to -1 or 0 to accumulate all the patches and run a single TensorFlow session at the end of the stream", "[-1,inf)", 1);
declareParameter("patchSize", "number of frames required for each inference. This parameter should match the model's expected input shape.", "[0,inf)", 1876);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def setUpClass(self):
def tearDownClass(self):
essentia.log.warningActive = True

def testRegressionFrozenModel(self):
def testRegression(self):
expected = numpy.load(
join(
filedir(),
Expand All @@ -58,7 +58,7 @@ def testRegressionFrozenModel(self):
audio = MonoLoader(filename=filename, sampleRate=16000, resampleQuality=4)()

activations = self.model30s(audio)
found = numpy.mean(activations, axis=0)
found = numpy.mean(activations, axis=0).squeeze()

self.assertAlmostEqualVector(found, expected, 1e-1)

Expand Down

0 comments on commit 6d362d0

Please sign in to comment.