Merge remote-tracking branch 'origin/master' into deriv_mg_tune

etmc · Jun 14, 2024 · 7f90d21 · 7f90d21
2 parents 2812388 + 950a3a1
commit 7f90d21
Show file tree

Hide file tree

Showing 11 changed files with 701 additions and 249 deletions.
diff --git a/CITATION.cff b/CITATION.cff
diff --git a/compare_derivative.c b/compare_derivative.c
@@ -25,6 +25,7 @@
 # include <omp.h>
 #endif
 #include <stdio.h>
+#include <math.h>
 #include "global.h"
 #include "monomial/monomial.h"
 

diff --git a/doc/quda.tex b/doc/quda.tex
@@ -12,11 +12,11 @@ \subsubsection{Design goals of the interface}
 The QUDA interface has been designed with the following goals in mind, sorted by priority:
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
-	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
+    For pure inversions this is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines. When QUDA is used in the HMC, however, the residual is only checked for {\ttfamily DebugLevel > 2} or when {\ttfamily StrictResidualCheck} is enabled.
 \item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. 
     Within the monomial declarations of the input file (between {\ttfamily BeginMonomial} and {\ttfamily EndMonomial}) the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials in the HMC.
-    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO}.
-	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
+    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO} and {\ttfamily NDCLOVERRAT}.
+	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef TM\_USE\_QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
 		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-1.1.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
@@ -81,10 +81,9 @@ \subsubsection{QUDA versions}
 \begin{verbatim}
   --enable-quda_fermionic_forces=no
 \end{verbatim}
-so that the wrapper to the QUDA fermionic forces is not compiled,
-thus if \texttt{--enable-quda_fermionic_forces=no} setting {\ttfamily UseExternalLibrary=yes} in the inputfile for the {\ttfamily  CLOVERDET, CLOVERDETRATIO} monomials
-is not supported and tmLQCD will stop with an error.
+so that the wrapper to the QUDA fermionic forces is not compiled.
 
+Thus, if tmLQCD is configured with \texttt{--enable-quda\_fermionic\_forces=no}, setting {\ttfamily UseExternalLibrary = quda} in the input file for the {\ttfamily CLOVERDET, CLOVERDETRATIO} and {\ttfamily NDCLOVERRAT} monomials is not supported and tmLQCD will stop with an error.
 
 \subsubsection{Usage}
 Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
@@ -130,7 +129,7 @@ \subsubsection{Usage}
   \item \texttt{RefinementPrecision}: When the operator or monomial uses the multishift (\texttt{cgmms[nd]}) solver and offloads to QUDA, this parameter sets the inner solver precision of shift-by-shift refinement solves. In practice, one might set \texttt{UseSloppyPrecision = single} and \texttt{RefinementPrecision = half}. This will iterate the residuals in the multishift solver up to single precision and then refine each solution using a double-half mixed-precision CG.
 \end{itemize}
 
-In additition, for the gauge monomial, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the gauge force to QUDA.
+In addition, for the \texttt{GAUGE, CLOVERDET, CLOVERDETRATIO} and \texttt{NDCLOVERRAT} monomials, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the force calculation to QUDA.
 
 Finally, for the \texttt{GRADIENTFLOW} online measurement, the parameter \texttt{UseExternalLibrary = quda} will offload the gradient flow to QUDA.
 

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
@@ -62,17 +62,6 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   int N = VOLUME/2;
   tm_stopwatch_push(&g_timers, __func__, mnl->name);
-  tm_stopwatch_push(&g_timers, "su3_zero", "");
-#ifdef TM_USE_OMP
-  #pragma omp parallel for
-#endif
-  for(int i = 0; i < VOLUME; i++) { 
-    for(int mu = 0; mu < 4; mu++) { 
-      _su3_zero(swm[i][mu]);
-      _su3_zero(swp[i][mu]);
-    }
-  }
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   mnl->forcefactor = 1.;
   /*********************************************************************
@@ -89,16 +78,29 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   g_kappa = mnl->kappa;
   boundary(mnl->kappa);
 
-  // we compute the clover term (1 + T_ee(oo)) for all sites x
-  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-  // we invert it for the even sites only
-  if(!mnl->even_odd_flag) {
-    N = VOLUME;
-  }
-  else {
-    sw_invert(EE, mnl->mu);
+  if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
+    tm_stopwatch_push(&g_timers, "su3_zero", "");
+  #ifdef TM_USE_OMP
+    #pragma omp parallel for
+  #endif
+    for(int i = 0; i < VOLUME; i++) { 
+      for(int mu = 0; mu < 4; mu++) { 
+        _su3_zero(swm[i][mu]);
+        _su3_zero(swp[i][mu]);
+      }
+    }
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+
+    // we compute the clover term (1 + T_ee(oo)) for all sites x
+    sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+    // we invert it for the even sites only
+    if(!mnl->even_odd_flag) {
+      N = VOLUME;
+    }
+    else {
+      sw_invert(EE, mnl->mu);
+    }
   }
-
   // Invert Q_{+} Q_{-}
   // X_o -> w_fields[1]
   chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
@@ -274,15 +276,17 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
   g_kappa = mnl->kappa;
   boundary(mnl->kappa);
 
-  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+  if( g_debug_level > 2 || g_strict_residual_check || !(mnl->external_library == QUDA_LIB && mnl->solver_params.external_inverter == QUDA_INVERTER) ){
 
-  if(!mnl->even_odd_flag) {
-    N = VOLUME;
-  }
-  else {
-    sw_invert(EE, mnl->mu);
-  }
+    sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
 
+    if(!mnl->even_odd_flag) {
+      N = VOLUME;
+    }
+    else {
+      sw_invert(EE, mnl->mu);
+    }
+  }
   g_sloppy_precision_flag = 0;
 
   if( mnl->solver == MG || mnl->solver == BICGSTAB ){

diff --git a/monomial/monomial.c b/monomial/monomial.c
@@ -453,6 +453,12 @@ int init_monomials(const int V, const int even_odd_flag) {
               monomial_list[i].name,
               no_monomials);
         }
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error: NDCLOVERRAT monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
       }
       else if(monomial_list[i].type == NDRATCOR) {
         monomial_list[i].hbfunction = &ndratcor_heatbath;