Merge pull request #1413 from xaviliz/add-new-algo-audio2pitch

Add new algo audio2pitch
MTG · Jul 11, 2024 · 52d8a35 · 52d8a35
2 parents 9f32d3c + e84bbcf
commit 52d8a35
Show file tree

Hide file tree

Showing 8 changed files with 501 additions and 86 deletions.
diff --git a/src/algorithms/tonal/audio2pitch.cpp b/src/algorithms/tonal/audio2pitch.cpp
@@ -0,0 +1,113 @@
+#include "audio2pitch.h"
+#include "essentiamath.h"
+
+using namespace essentia;
+using namespace standard;
+
+const char* Audio2Pitch::name = "Audio2Pitch"; // identifier string of this algorithm
+const char* Audio2Pitch::category = "Pitch"; // category label this algorithm is filed under
+const char* Audio2Pitch::description = DOC("This algorithm computes pitch with various pitch algorithms, specifically targeted for real-time pitch detection on audio signals. The algorithm internally uses pitch estimation with PitchYin (pitchyin) and PitchYinFFT (pitchyinfft)."); // human-readable summary, passed through the DOC() macro
+
+// Voicing gate: returns true only when the pitch confidence reaches
+// _pitchConfidenceThreshold AND the loudness reaches _loudnessThresholdGain
+// (the loudness threshold after conversion with db2amp in configure()).
+bool Audio2Pitch::isAboveThresholds(Real pitchConfidence, Real loudness) {
+  const bool confidentEnough = pitchConfidence >= _pitchConfidenceThreshold;
+  const bool loudEnough = loudness >= _loudnessThresholdGain;
+  return confidentEnough && loudEnough;
+}
+
+// Caches the declared parameters, validates them, and builds the internal
+// algorithms: RMS for loudness plus either PitchYin or the chain
+// Windowing -> Spectrum -> PitchYinFFT, depending on 'pitchAlgorithm'.
+// Throws EssentiaException when maxFrequency exceeds half the sample rate,
+// when maxFrequency <= minFrequency, or when 'pitchAlgorithm' is unknown.
+// NOTE(review): every call assigns newly created algorithms to the member
+// pointers without releasing instances created by an earlier call -- confirm
+// whether repeated configuration is expected and, if so, release them first.
+void Audio2Pitch::configure() {
+
+  // Snapshot of the user-facing parameters.
+  _sampleRate = parameter("sampleRate").toReal();
+  _frameSize = parameter("frameSize").toInt();
+  _minFrequency = parameter("minFrequency").toReal();
+  _maxFrequency = parameter("maxFrequency").toReal();
+  _pitchAlgorithmName = parameter("pitchAlgorithm").toString();
+  _tolerance = parameter("tolerance").toReal();
+  _pitchConfidenceThreshold = parameter("pitchConfidenceThreshold").toReal();
+  _loudnessThreshold = parameter("loudnessThreshold").toReal();
+  // Converted with db2amp so it can be compared with the loudness value
+  // in isAboveThresholds().
+  _loudnessThresholdGain = db2amp(_loudnessThreshold);
+
+  // Frequency range sanity checks.
+  if (_maxFrequency > _sampleRate * 0.5) {
+    throw EssentiaException("Audio2Pitch: Max frequency cannot be higher than Nyquist frequency");
+  }
+  if (_maxFrequency <= _minFrequency) {
+    throw EssentiaException("Audio2Pitch: Max frequency cannot be lower or equal than the minimum frequency");
+  }
+
+  // Exactly two estimator names are accepted.
+  const bool usesSpectrum = (_pitchAlgorithmName == "pitchyinfft");
+  if (!usesSpectrum && _pitchAlgorithmName != "pitchyin") {
+    throw EssentiaException("Audio2Pitch: Bad 'pitchAlgorithm' =", _pitchAlgorithmName);
+  }
+
+  if (!usesSpectrum) {
+    _pitchAlgorithm = AlgorithmFactory::create("PitchYin");
+  }
+  else {
+    // The FFT-based estimator consumes a spectrum, so it needs the
+    // windowing and spectrum helpers in front of it.
+    _windowing = AlgorithmFactory::create("Windowing");
+    _spectrum = AlgorithmFactory::create("Spectrum");
+    _pitchAlgorithm = AlgorithmFactory::create("PitchYinFFT");
+
+    _windowing->configure("type", "hann",
+                          "size", _frameSize);
+    _spectrum->configure("size", _frameSize);
+  }
+
+  _loudnessAlgorithm = AlgorithmFactory::create("RMS");
+
+  // Forward the shared parameters to the estimator; 'weighting' is only
+  // forwarded to PitchYinFFT.
+  if (usesSpectrum) {
+    _pitchAlgorithm->configure(INHERIT("frameSize"),
+                               INHERIT("maxFrequency"),
+                               INHERIT("minFrequency"),
+                               INHERIT("sampleRate"),
+                               INHERIT("weighting"),
+                               INHERIT("tolerance"));
+  }
+  else {
+    _pitchAlgorithm->configure(INHERIT("frameSize"),
+                               INHERIT("maxFrequency"),
+                               INHERIT("minFrequency"),
+                               INHERIT("sampleRate"),
+                               INHERIT("tolerance"));
+  }
+}
+
+// Processes one frame: measures its RMS loudness, runs the configured pitch
+// estimator, and sets 'voiced' to 1 when isAboveThresholds() accepts the
+// resulting confidence and loudness (0 otherwise).
+// Throws EssentiaException for frames holding fewer than two samples.
+void Audio2Pitch::compute() {
+  const std::vector<Real>& frame = _frame.get();
+  Real& pitch = _pitch.get();
+  Real& pitchConfidence = _pitchConfidence.get();
+  Real& loudness = _loudness.get();
+  int& voiced = _voiced.get();
+
+  switch (frame.size()) {
+    case 0:
+      throw EssentiaException("Audio2Pitch: cannot compute the pitch of an empty frame");
+    case 1:
+      throw EssentiaException("Audio2Pitch: cannot compute the pitch of a frame of size 1");
+    default:
+      break;
+  }
+
+  // Loudness is written straight into the 'loudness' output by the RMS algorithm.
+  _loudnessAlgorithm->input("array").set(frame);
+  _loudnessAlgorithm->output("rms").set(loudness);
+  _loudnessAlgorithm->compute();
+
+  // These buffers are bound by reference to the inner algorithms, so they
+  // must stay alive until the pitch estimator has run below.
+  std::vector<Real> windowedFrame, spectrum;
+  if (_pitchAlgorithmName == "pitchyin") {
+    // PitchYin is fed with the raw frame.
+    _pitchAlgorithm->input("signal").set(frame);
+  }
+  else if (_pitchAlgorithmName == "pitchyinfft") {
+    // PitchYinFFT is fed with the spectrum of the windowed frame.
+    _windowing->input("frame").set(frame);
+    _windowing->output("frame").set(windowedFrame);
+    _windowing->compute();
+    _spectrum->input("frame").set(windowedFrame);
+    _spectrum->output("spectrum").set(spectrum);
+    _spectrum->compute();
+    _pitchAlgorithm->input("spectrum").set(spectrum);
+  }
+
+  _pitchAlgorithm->output("pitch").set(pitch);
+  _pitchAlgorithm->output("pitchConfidence").set(pitchConfidence);
+  _pitchAlgorithm->compute();
+
+  // A frame counts as voiced only when both thresholds are reached.
+  voiced = isAboveThresholds(pitchConfidence, loudness) ? 1 : 0;
+}
diff --git a/src/algorithms/tonal/audio2pitch.h b/src/algorithms/tonal/audio2pitch.h
@@ -0,0 +1,75 @@
+#ifndef ESSENTIA_AUDIO2PITCH_H
+#define ESSENTIA_AUDIO2PITCH_H
+
+#include "algorithmfactory.h"
+
+namespace essentia {
+namespace standard {
+
+// Estimates the pitch of a single audio frame by delegating to an inner pitch
+// algorithm (PitchYin or PitchYinFFT, chosen by the 'pitchAlgorithm'
+// parameter), measures the frame loudness with RMS, and flags the frame as
+// voiced when both the pitch confidence and the loudness reach their thresholds.
+class Audio2Pitch : public Algorithm {
+
+  protected:
+    Input<std::vector<Real>> _frame;
+    Output<Real> _pitch;
+    Output<Real> _pitchConfidence;
+    Output<Real> _loudness;
+    Output<int> _voiced;
+
+    // Inner algorithms, created in configure() and released in the destructor.
+    Algorithm* _pitchAlgorithm;
+    Algorithm* _loudnessAlgorithm;
+    // auxiliary algorithms for FFT-based pitch (only created for "pitchyinfft")
+    Algorithm* _windowing;
+    Algorithm* _spectrum;
+
+    // Cached parameter values, refreshed by configure().
+    Real _sampleRate;
+    int _frameSize;
+    Real _minFrequency;
+    Real _maxFrequency;
+    std::string _pitchAlgorithmName;
+    Real _tolerance;
+    Real _pitchConfidenceThreshold;
+    Real _loudnessThreshold;      // as given by the user, in decibels
+    Real _loudnessThresholdGain;  // _loudnessThreshold after db2amp conversion
+
+    // Returns true when both values reach their respective thresholds.
+    bool isAboveThresholds(Real pitchConfidence, Real loudness);
+
+  public:
+    // The owned algorithm pointers start out null so that the destructor is
+    // well-defined even when configure() did not assign all of them (for
+    // example when it throws before creating anything, or never creates the
+    // FFT helpers).
+    Audio2Pitch()
+      : _pitchAlgorithm(nullptr), _loudnessAlgorithm(nullptr),
+        _windowing(nullptr), _spectrum(nullptr) {
+      declareInput(_frame, "frame", "the input frame to analyse");
+      declareOutput(_pitch, "pitch", "detected pitch in Hz");
+      declareOutput(_pitchConfidence, "pitchConfidence", "confidence of detected pitch (from 0.0 to 1.0)");
+      // The value is produced by the RMS algorithm and is compared against the
+      // db2amp-converted threshold, so it is a linear quantity, not decibels.
+      declareOutput(_loudness, "loudness", "detected loudness as the RMS of the input frame (linear amplitude)");
+      declareOutput(_voiced, "voiced", "voiced frame categorization, 1 for voiced and 0 for unvoiced frame");
+    }
+
+    // Releases the inner algorithms; deleting a null pointer is a no-op.
+    ~Audio2Pitch() {
+      delete _pitchAlgorithm;
+      delete _loudnessAlgorithm;
+      delete _windowing;
+      delete _spectrum;
+    }
+
+    void declareParameters() {
+      declareParameter("sampleRate", "sample rate of incoming audio frames", "[8000,inf)", 44100);
+      declareParameter("frameSize", "size of input frame in samples", "[1,inf)", 1024);
+      declareParameter("minFrequency", "minimum frequency to detect in Hz", "[10,20000]", 60.0);
+      declareParameter("maxFrequency", "maximum frequency to detect in Hz", "[10,20000]", 2300.0);
+      declareParameter("pitchAlgorithm", "pitch algorithm to use", "{pitchyin,pitchyinfft}", "pitchyinfft");
+      declareParameter("weighting", "string to assign a weighting function", "{custom,A,B,C,D,Z}", "custom");
+      declareParameter("tolerance", "sets tolerance for peak detection on pitch algorithm", "[0,1]", 1.0);
+      declareParameter("pitchConfidenceThreshold", "level of pitch confidence above/below which note ON/OFF start to be considered", "[0,1]", 0.25);
+      declareParameter("loudnessThreshold", "loudness level above/below which note ON/OFF start to be considered, in decibels", "[-inf,0]", -51.0);
+    }
+
+    void configure();
+    void compute();
+
+    static const char* name;
+    static const char* category;
+    static const char* description;
+};
+
+} // namespace standard
+} // namespace essentia
+
+#endif
diff --git a/src/algorithms/tonal/pitchyinfft.cpp b/src/algorithms/tonal/pitchyinfft.cpp
@@ -37,7 +37,7 @@ static Real _weightMask[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
 static const Real _weights[] = {-75.8, -70.1, -60.8, -52.1, -44.2, -37.5,
 	-31.3, -25.6, -20.9, -16.5, -12.6, -9.6, -7.0, -4.7, -3.0, -1.8, -0.8,
 	-0.2, -0.0, 0.5, 1.6, 3.2, 5.4, 7.8, 8.1, 5.3, -2.4, -11.1, -12.8,
-	-12.2, -7.4, -17.8, -17.8, -17.8}; // by default the original one is selected
+	-12.2, -7.4, -17.8, -17.8, -17.8}; // by default use custom weights designed specifically for the PitchYinFFT algorithm
 
 static const Real _aWeighting[] = {-148.6, -50.4, -44.8, -39.5, -34.5, -30.3,
     -26.2, -22.4, -19.1, -16.2, -13.2, -10.8, -8.7, -6.6, -4.8, -3.2, -1.9,
@@ -91,7 +91,7 @@ void PitchYinFFT::configure() {
   // configure algorithms
   _fft->configure("size", _frameSize);
 
-  if (_weighting != "default" && _weighting != "A" && _weighting != "B" && _weighting != "C" && _weighting != "D" && _weighting != "Z") {
+  if (_weighting != "custom" && _weighting != "A" && _weighting != "B" && _weighting != "C" && _weighting != "D" && _weighting != "Z") {
     E_INFO("PitchYinFFT: 'weighting' = "<<_weighting<<"\n");
     throw EssentiaException("PitchYinFFT: Bad 'weighting' parameter");
   }
@@ -118,7 +118,7 @@ void PitchYinFFT::spectralWeights(std::string weighting) {
   int i = 0, j = 1;
   Real freq = 0, a0 = 0, a1 = 0, f0 = 0, f1 = 0;
   int _maskSize = 34;
-  if (weighting == "default") {
+  if (weighting == "custom") {
     for (int n=0; n<_maskSize; n++)
       _weightMask[n] = _weights[n];
   }

diff --git a/src/algorithms/tonal/pitchyinfft.h b/src/algorithms/tonal/pitchyinfft.h
@@ -84,7 +84,7 @@ class PitchYinFFT : public Algorithm {
     declareParameter("maxFrequency", "the maximum allowed frequency [Hz]", "(0,inf)", 22050.0);
     declareParameter("interpolate", "boolean flag to enable interpolation", "{true,false}", true);
     declareParameter("tolerance", "tolerance for peak detection", "[0,1]", 1.0);
-    declareParameter("weighting", "string to assign a weighting function", "{default,A,B,C,D,Z}", "default");
+    declareParameter("weighting", "string to assign a weighting function", "{custom,A,B,C,D,Z}", "custom");
   }
 
   void configure();

diff --git a/.../tonal/pitchyinfft/vignesh_confidance.npy → .../tonal/pitchyinfft/vignesh_confidence.npy b/.../tonal/pitchyinfft/vignesh_confidance.npy → .../tonal/pitchyinfft/vignesh_confidence.npy