diff --git a/compare_derivative.c b/compare_derivative.c
index ab30bc0b7..0c287afed 100644
--- a/compare_derivative.c
+++ b/compare_derivative.c
@@ -25,6 +25,7 @@
 # include <omp.h>
 #endif
 #include <stdio.h>
+#include <math.h>
 #include "global.h"
 #include "monomial/monomial.h"
 
diff --git a/doc/quda.tex b/doc/quda.tex
index 3acb54d00..9c4636d82 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -12,11 +12,11 @@ \subsubsection{Design goals of the interface}
 The QUDA interface has been designed with the following goals in mind, sorted by priority:
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
-	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
+    For pure inversions this is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines. When QUDA is used in the HMC, however, the residual is only checked for {\ttfamily DebugLevel > 2} or when {\ttfamily StrictResidualCheck} is enabled.
 \item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. 
     Within the monomial declarations of the input file (between {\ttfamily BeginMonomial} and {\ttfamily EndMonomial}) the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials in the HMC.
-    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO}.
-	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
+    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO} and {\ttfamily NDCLOVERRAT}.
+	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef TM\_USE\_QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
 		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-1.1.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
@@ -83,9 +83,7 @@ \subsubsection{QUDA versions}
 \end{verbatim}
 so that the wrapper to the QUDA fermionic forces is not compiled.
 
-Thus, if \texttt{--enable-quda\_fermionic\_forces=no} setting {\ttfamily UseExternalLibrary=yes} in the inputfile for the {\ttfamily  CLOVERDET, CLOVERDETRATIO} monomials
-is not supported and tmLQCD will stop with an error.
-
+Thus, if \texttt{--enable-quda\_fermionic\_forces=no}, setting {\ttfamily UseExternalLibrary=yes} in the inputfile for the {\ttfamily  CLOVERDET, CLOVERDETRATIO} and {\ttfamily NDCLOVERRAT} monomials is not supported and tmLQCD will stop with an error.
 
 \subsubsection{Usage}
 Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
@@ -131,7 +129,7 @@ \subsubsection{Usage}
   \item \texttt{RefinementPrecision}: When the operator or monomial uses the multishift (\texttt{cgmms[nd]}) solver and offloads to QUDA, this parameter sets the inner solver precision of shift-by-shift refinement solves. In practice, one might set \texttt{UseSloppyPrecision = single} and \texttt{RefinementPrecision = half}. This will iterate the residuals in the multishift solver up to single precision and then refine each solution using a double-half mixed-precision CG.
 \end{itemize}
 
-In additition, for the gauge monomial, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the gauge force to QUDA.
+In additition, for the \texttt{GAUGE, CLOVERDET, CLOVERDETRATIO} and \texttt{NDCLOVERRAT} monomials, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the force calculation to QUDA.
 
 Finally, for the \texttt{GRADIENTFLOW} online measurement, the parameter \texttt{UseExternalLibrary = quda} will offload the gradient flow to QUDA.
 
diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index b98a38e3c..b789468a8 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -62,17 +62,6 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   int N = VOLUME/2;
   tm_stopwatch_push(&g_timers, __func__, mnl->name);
-  tm_stopwatch_push(&g_timers, "su3_zero", "");
-#ifdef TM_USE_OMP
-  #pragma omp parallel for
-#endif
-  for(int i = 0; i < VOLUME; i++) { 
-    for(int mu = 0; mu < 4; mu++) { 
-      _su3_zero(swm[i][mu]);
-      _su3_zero(swp[i][mu]);
-    }
-  }
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   mnl->forcefactor = 1.;
   /*********************************************************************
@@ -89,16 +78,29 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   g_kappa = mnl->kappa;
   boundary(mnl->kappa);
   
-  // we compute the clover term (1 + T_ee(oo)) for all sites x
-  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-  // we invert it for the even sites only
-  if(!mnl->even_odd_flag) {
-    N = VOLUME;
-  }
-  else {
-    sw_invert(EE, mnl->mu);
+  if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
+    tm_stopwatch_push(&g_timers, "su3_zero", "");
+  #ifdef TM_USE_OMP
+    #pragma omp parallel for
+  #endif
+    for(int i = 0; i < VOLUME; i++) { 
+      for(int mu = 0; mu < 4; mu++) { 
+        _su3_zero(swm[i][mu]);
+        _su3_zero(swp[i][mu]);
+      }
+    }
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+
+    // we compute the clover term (1 + T_ee(oo)) for all sites x
+    sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+    // we invert it for the even sites only
+    if(!mnl->even_odd_flag) {
+      N = VOLUME;
+    }
+    else {
+      sw_invert(EE, mnl->mu);
+    }
   }
-  
   // Invert Q_{+} Q_{-}
   // X_o -> w_fields[1]
   chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
@@ -274,15 +276,17 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
   g_kappa = mnl->kappa;
   boundary(mnl->kappa);
 
-  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+  if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
 
-  if(!mnl->even_odd_flag) {
-    N = VOLUME;
-  }
-  else {
-    sw_invert(EE, mnl->mu);
-  }
+    sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
 
+    if(!mnl->even_odd_flag) {
+      N = VOLUME;
+    }
+    else {
+      sw_invert(EE, mnl->mu);
+    }
+  }
   g_sloppy_precision_flag = 0;
 
   if( mnl->solver == MG || mnl->solver == BICGSTAB ){
diff --git a/monomial/monomial.c b/monomial/monomial.c
index 2227e3897..d2636749b 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -453,6 +453,12 @@ int init_monomials(const int V, const int even_odd_flag) {
               monomial_list[i].name,
               no_monomials);
         }
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error: NDCLOVERRAT monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
       }
       else if(monomial_list[i].type == NDRATCOR) {
         monomial_list[i].hbfunction = &ndratcor_heatbath;
diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index 1dd669b33..43a159053 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
+#include <string.h>
 #include "global.h"
 #include "su3.h"
 #include "linalg_eo.h"
@@ -48,6 +49,11 @@
 #include "phmc.h"
 #include "ndrat_monomial.h"
 #include "default_input_values.h"
+#include "compare_derivative.h"
+#include "xchange/xchange_deri.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
 
 void nd_set_global_parameter(monomial * const mnl) {
 
@@ -66,7 +72,6 @@ void nd_set_global_parameter(monomial * const mnl) {
   return;
 }
 
-
 /********************************************
  *
  * Here \delta S_b is computed
@@ -78,23 +83,25 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
   tm_stopwatch_push(&g_timers, __func__, mnl->name);
   nd_set_global_parameter(mnl);
   if(mnl->type == NDCLOVERRAT) {
-    tm_stopwatch_push(&g_timers, "su3_zero", "");
-#ifdef TM_USE_OMP
-    #pragma omp parallel for
-#endif
-    for(int i = 0; i < VOLUME; i++) { 
-      for(int mu = 0; mu < 4; mu++) { 
-        _su3_zero(swm[i][mu]);
-        _su3_zero(swp[i][mu]);
+    if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
+      tm_stopwatch_push(&g_timers, "su3_zero", "");
+  #ifdef TM_USE_OMP
+      #pragma omp parallel for
+  #endif
+      for(int i = 0; i < VOLUME; i++) { 
+        for(int mu = 0; mu < 4; mu++) { 
+          _su3_zero(swm[i][mu]);
+          _su3_zero(swp[i][mu]);
+        }
       }
+      tm_stopwatch_pop(&g_timers, 0, 1, "");
+
+      // we compute the clover term (1 + T_ee(oo)) for all sites x
+      sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+      // we invert it for the even sites only
+      sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+      copy_32_sw_fields();
     }
-    tm_stopwatch_pop(&g_timers, 0, 1, "");
-  
-    // we compute the clover term (1 + T_ee(oo)) for all sites x
-    sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-    // we invert it for the even sites only
-    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
-    copy_32_sw_fields();
   }
   mnl->forcefactor = mnl->EVMaxInv;
 
@@ -116,87 +123,135 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
   mnl->iter1 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
                              mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
-  for(int j = (mnl->rat.np-1); j > -1; j--) {
-    if(mnl->type == NDCLOVERRAT) {
-      // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
-      // needs phmc_Cpol = 1 to work for ndrat!
-      tm_stopwatch_push(&g_timers, "Qsw_tau1_sub_const_ndpsi", "");
-      Qsw_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
-			       g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			       -I*mnl->rat.mu[j], 1., mnl->EVMaxInv);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
-      
-      /* Get the even parts X_j,e */
-      /* H_eo_... includes tau_1 */
-      tm_stopwatch_push(&g_timers, "H_eo_sw_ndpsi", "");
-      H_eo_sw_ndpsi(mnl->w_fields[2], mnl->w_fields[3], 
-		    g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j]);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
-
-    } else {
-      // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
-      // needs phmc_Cpol = 1 to work for ndrat!
-      tm_stopwatch_push(&g_timers, "Q_tau1_sub_const_ndpsi", "");
-      Q_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
-			     g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			     -I*mnl->rat.mu[j], 1., mnl->EVMaxInv);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
-      
-      /* Get the even parts X_j,e */
-      /* H_eo_... includes tau_1 */
-      tm_stopwatch_push(&g_timers, "H_eo_tm_ndpsi", "");
-      H_eo_tm_ndpsi(mnl->w_fields[2], mnl->w_fields[3], 
-		    g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], EO);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
+  if ( mnl->external_library == QUDA_LIB){
+    if(!mnl->even_odd_flag) {
+      fatal_error("QUDA support only even_odd_flag",__func__);
     }
-    /* X_j,e^dagger \delta M_eo Y_j,o */
-    deriv_Sb(EO, mnl->w_fields[2], mnl->w_fields[0], 
-	     hf, mnl->rat.rmu[j]*mnl->forcefactor);
-    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1],
-	     hf, mnl->rat.rmu[j]*mnl->forcefactor);
-
-    if(mnl->type == NDCLOVERRAT) {
-      /* Get the even parts Y_j,e */
-      tm_stopwatch_push(&g_timers, "H_eo_sw_ndpsi", "");
-      H_eo_sw_ndpsi(mnl->w_fields[4], mnl->w_fields[5], mnl->w_fields[0], mnl->w_fields[1]);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
+#ifdef TM_USE_QUDA
+    if (g_debug_level > 3) {
+#ifdef TM_USE_MPI
+      // FIXME: here we do not need to set to zero the interior but only the halo
+#ifdef TM_USE_OMP
+      # pragma omp parallel for
+#endif // end setting to zero the halo when using MPI
+      for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
+        for(int mu=0;mu<4;mu++) { 
+          _zero_su3adj(debug_derivative[i][mu]);
+        }
+      }
+#endif
+      // we copy only the interior
+      memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
     }
-    else {
-      /* Get the even parts Y_j,e */
-      tm_stopwatch_push(&g_timers, "H_eo_tm_ndpsi", "");
-      H_eo_tm_ndpsi(mnl->w_fields[4], mnl->w_fields[5], mnl->w_fields[0], mnl->w_fields[1], EO);
-      tm_stopwatch_pop(&g_timers, 0, 1, "");
+    if(! (mnl->type == NDCLOVERRAT)){
+      fatal_error("QUDA support only mnl->type = NDCLOVERRAT",__func__);
     }
+    // corrispondence tmLQCD with QUDA:
+    // g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j] = x[i][parity] 
+    // mnl->w_fields[2], mnl->w_fields[3] = x[i][other_parity] *kappa
+    // mnl->w_fields[0], mnl->w_fields[1] = p[i][parity] 
+    // mnl->w_fields[4], mnl->w_fields[5] = p[i][other_parity] *kappa
+    // further in quda parity = odd and other_parity = even
+    compute_ndcloverrat_derivative_quda(mnl, hf, g_chi_up_spinor_field, g_chi_dn_spinor_field, &(mnl->solver_params), !mnl->trlog );
+
+    if (g_debug_level > 3){
+      su3adj **given = hf->derivative;
+      hf->derivative = debug_derivative;
+      mnl->external_library = NO_EXT_LIB;
+      tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
+      ndrat_derivative(id, hf);
+      #ifdef TM_USE_MPI
+        xchange_deri(hf->derivative);// this function use ddummy inside
+      #endif
+      compare_derivative(mnl, given, hf->derivative, 1e-9, "ndrat_derivative");
+      mnl->external_library = QUDA_LIB;
+      hf->derivative = given;
+    }
+  #endif // no other option, TM_USE_QUDA already checked by solver
+  }
+  else{
+    for(int j = 0; j < mnl->rat.np; j++) {
+      if(mnl->type == NDCLOVERRAT) {
+        // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
+        // needs phmc_Cpol = 1 to work for ndrat!
+        tm_stopwatch_push(&g_timers, "Qsw_tau1_sub_const_ndpsi", "");
+        Qsw_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+              g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
+              -I*mnl->rat.mu[j], 1., mnl->EVMaxInv);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+        
+        /* Get the even parts X_j,e */
+        /* H_eo_... includes tau_1 */
+        tm_stopwatch_push(&g_timers, "H_eo_sw_ndpsi", "");
+        H_eo_sw_ndpsi(mnl->w_fields[2], mnl->w_fields[3], 
+          g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j]);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+       } else {
+        // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
+        // needs phmc_Cpol = 1 to work for ndrat!
+        tm_stopwatch_push(&g_timers, "Q_tau1_sub_const_ndpsi", "");
+        Q_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+            g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
+            -I*mnl->rat.mu[j], 1., mnl->EVMaxInv);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+        
+        /* Get the even parts X_j,e */
+        /* H_eo_... includes tau_1 */
+        tm_stopwatch_push(&g_timers, "H_eo_tm_ndpsi", "");
+        H_eo_tm_ndpsi(mnl->w_fields[2], mnl->w_fields[3], 
+          g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], EO);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+      }
+      /* X_j,e^dagger \delta M_eo Y_j,o */
+      deriv_Sb(EO, mnl->w_fields[2], mnl->w_fields[0], 
+        hf, mnl->rat.rmu[j]*mnl->forcefactor);
+      deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1],
+        hf, mnl->rat.rmu[j]*mnl->forcefactor);
+
+      if(mnl->type == NDCLOVERRAT) {
+        /* Get the even parts Y_j,e */
+        tm_stopwatch_push(&g_timers, "H_eo_sw_ndpsi", "");
+        H_eo_sw_ndpsi(mnl->w_fields[4], mnl->w_fields[5], mnl->w_fields[0], mnl->w_fields[1]);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+      }
+      else {
+        /* Get the even parts Y_j,e */
+        tm_stopwatch_push(&g_timers, "H_eo_tm_ndpsi", "");
+        H_eo_tm_ndpsi(mnl->w_fields[4], mnl->w_fields[5], mnl->w_fields[0], mnl->w_fields[1], EO);
+        tm_stopwatch_pop(&g_timers, 0, 1, "");
+      }
 
-    /* X_j,o \delta M_oe Y_j,e */
-    deriv_Sb(OE, g_chi_up_spinor_field[j], mnl->w_fields[4], 
-	     hf, mnl->rat.rmu[j]*mnl->forcefactor);
-    deriv_Sb(OE, g_chi_dn_spinor_field[j], mnl->w_fields[5], 
-	     hf, mnl->rat.rmu[j]*mnl->forcefactor);
-
+      /* X_j,o \delta M_oe Y_j,e */
+      deriv_Sb(OE, g_chi_up_spinor_field[j], mnl->w_fields[4], 
+        hf, mnl->rat.rmu[j]*mnl->forcefactor);
+      deriv_Sb(OE, g_chi_dn_spinor_field[j], mnl->w_fields[5], 
+        hf, mnl->rat.rmu[j]*mnl->forcefactor);
+
+      if(mnl->type == NDCLOVERRAT) {
+        // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
+        sw_spinor_eo(EE, mnl->w_fields[5], mnl->w_fields[2], 
+        mnl->rat.rmu[j]*mnl->forcefactor);
+        // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
+        sw_spinor_eo(OO, g_chi_up_spinor_field[j], mnl->w_fields[1],
+        mnl->rat.rmu[j]*mnl->forcefactor);
+        
+        // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
+        sw_spinor_eo(EE, mnl->w_fields[4], mnl->w_fields[3], 
+        mnl->rat.rmu[j]*mnl->forcefactor);
+        // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
+        sw_spinor_eo(OO, g_chi_dn_spinor_field[j], mnl->w_fields[0],
+        mnl->rat.rmu[j]*mnl->forcefactor);
+      }
+    }
+    // trlog part does not depend on the normalisation
+    if(mnl->type == NDCLOVERRAT && mnl->trlog) {
+      sw_deriv_nd(EE);
+    }
     if(mnl->type == NDCLOVERRAT) {
-      // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-      sw_spinor_eo(EE, mnl->w_fields[5], mnl->w_fields[2], 
-		   mnl->rat.rmu[j]*mnl->forcefactor);
-      // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-      sw_spinor_eo(OO, g_chi_up_spinor_field[j], mnl->w_fields[1],
-		   mnl->rat.rmu[j]*mnl->forcefactor);
-      
-      // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-      sw_spinor_eo(EE, mnl->w_fields[4], mnl->w_fields[3], 
-		   mnl->rat.rmu[j]*mnl->forcefactor);
-      // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-      sw_spinor_eo(OO, g_chi_dn_spinor_field[j], mnl->w_fields[0],
-		   mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_all(hf, mnl->kappa, mnl->c_sw);
     }
   }
-  // trlog part does not depend on the normalisation
-  if(mnl->type == NDCLOVERRAT && mnl->trlog) {
-    sw_deriv_nd(EE);
-  }
-  if(mnl->type == NDCLOVERRAT) {
-    sw_all(hf, mnl->kappa, mnl->c_sw);
-  }
+  
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
 }
@@ -270,9 +325,11 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
   tm_stopwatch_push(&g_timers, __func__, mnl->name);
   nd_set_global_parameter(mnl);
   if(mnl->type == NDCLOVERRAT) {
-    sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
-    copy_32_sw_fields();
+    if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
+      sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+      sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+      copy_32_sw_fields();
+    }
   }
   mnl->energy1 = 0.;
 
diff --git a/quda_interface.c b/quda_interface.c
index 9d9c350c5..7801fa7e3 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -7,6 +7,7 @@
  *               2021       Bartosz Kostrzewa, Marco Garofalo, Ferenc Pittler, Simone Bacchio
  *               2022       Simone Romiti, Bartosz Kostrzewa
  *               2023       Aniket Sen, Bartosz Kostrzewa
+ *               2024       Aniket Sen, Marco Garofalo, Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -107,6 +108,7 @@
 #include "boundary.h"
 #include "linalg/convert_eo_to_lexic.h"
 #include "linalg/mul_r.h"
+#include "linalg/mul_gamma5.h"
 #include "solver/solver.h"
 #include "solver/solver_field.h"
 #include "gettime.h"
@@ -2655,14 +2657,73 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   }
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
+
+void compute_ndcloverrat_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const Qup, spinor ** const Qdn, solver_params_t * solver_params, int detratio) {
+  tm_stopwatch_push(&g_timers, __func__, "");
+  
+  _initQuda();
+  _initMomQuda();
+  spinor ** in;
+  void *spinorPhi;
+  const int num_shifts = solver_params->no_shifts;
+  const int Vh = VOLUME/2;
+  
+  init_solver_field(&in, VOLUME, num_shifts);
+  void **spinorIn = (void**)in; 
+  for(int shift = 0; shift < num_shifts; shift++){
+    tm_stopwatch_push(&g_timers, "twoflavour_input_overhead", ""); 
+    memcpy(in[shift],      Qup[shift], Vh*24*sizeof(double));
+    memcpy(in[shift] + Vh, Qdn[shift], Vh*24*sizeof(double));
+    tm_stopwatch_pop(&g_timers, 0, 0, "TM_QUDA");
+
+    spinorIn[shift]=in[shift];
+    reorder_spinor_eo_toQuda((double*)spinorIn[shift], inv_param.cpu_prec, 1, 1);
+  }
+
+
+  _loadGaugeQuda(mnl->solver_params.compression_type);
+  _loadCloverQuda(&inv_param);
+  tm_stopwatch_push(&g_timers, "computeTMCloverForceQuda", ""); 
+  
+  double *coeff=(double*) malloc(sizeof(double)*num_shifts );
+  for(int shift = 0; shift < num_shifts; shift++){
+      coeff[shift] = 4. * mnl->kappa * mnl->kappa * mnl->rat.rmu[shift] * mnl->EVMaxInv;
+      inv_param.offset[shift] = mnl->rat.mu[shift];
+  }
+  inv_param.evmax= 1.0/mnl->EVMaxInv;
+  
+  
+  inv_param.kappa= mnl->kappa;
+  inv_param.clover_csw= mnl->c_sw;
+  inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
+  // when using QUDA MG the following parameter need to be set 
+  inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC;
+  inv_param.dagger = QUDA_DAG_YES;
+  computeTMCloverForceQuda(mom_quda, spinorIn, &spinorPhi, coeff,  num_shifts, &f_gauge_param,   &inv_param, detratio);
+
+  free(coeff);
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+  
+  reorder_mom_fromQuda(mom_quda);
+  add_mom_to_derivative(hf->derivative);
+
+  finalize_solver(in, num_shifts);
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+}
 #else 
 void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
   tm_debug_printf(0,0,"Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with --enable-quda_fermionic=yes\n");
   exit(1);
 }
+void compute_ndcloverrat_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const Qup, spinor ** const Qdn, solver_params_t * solver_params, int detratio) {
+  tm_debug_printf(0,0,"Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with --enable-quda_fermionic=yes\n");
+  exit(1);
+}
 #endif
 
 
+
 void  compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE* outfile){
   tm_stopwatch_push(&g_timers, __func__, "");
   
diff --git a/quda_interface.h b/quda_interface.h
index a9fc9cc8d..453ff0d75 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,7 +175,10 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
 
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
 void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf,
-                                     spinor * const X_o, spinor * const phi, int ratio);
+                                     spinor * const X_o, spinor * const phi, int detratio);
+void compute_ndcloverrat_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf,
+                                        spinor ** const Qup, spinor ** const Qdn, solver_params_t * solver_params, int detratio);
+
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);