From aae0128d9488050bd5791726e2757db1448d392a Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 9 Mar 2022 01:10:37 -0600
Subject: [PATCH] secp256k1: Reduce scalar base mult copies.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Profiling shows that around 7.5% of the time in scalar base
multiplication is attributed to duffcopy.  Upon further examination,
this is the result of a combination of the range statement making copies
of the bytes and the need to construct a Jacobian point from the
individual field values stored in the in-memory byte points table.

This optimizes the function to avoid those copies as follows:

- Perform the conversion to Jacobian once when the affine byte table is
  decompressed from the stored values
- Make use of those Jacobian points directly
- Use an indexed for loop instead of a range over the bytes
- Perform the calculation using the result variable directly instead of
  via a local variable that is copied to the result

The following benchmark results show the speedup is in line with the
expected gains per the profiling results:

name                     old time/op   new time/op    delta
------------------------------------------------------------------------------
ScalarBaseMultNonConst   24.1µs ±22%   22.5µs ± 2%   -6.97%  (p=0.000 n=98+96)
---
 dcrec/secp256k1/curve.go           | 19 ++++++++-----------
 dcrec/secp256k1/loadprecomputed.go | 10 +++++-----
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/dcrec/secp256k1/curve.go b/dcrec/secp256k1/curve.go
index 354f5826ca..c9d47f3078 100644
--- a/dcrec/secp256k1/curve.go
+++ b/dcrec/secp256k1/curve.go
@@ -1223,23 +1223,20 @@ func ScalarMultNonConst(k *ModNScalar, point, result *JacobianPoint) {
 func ScalarBaseMultNonConst(k *ModNScalar, result *JacobianPoint) {
 	bytePoints := s256BytePoints()
 
-	// Point Q = ∞ (point at infinity).
-	var q JacobianPoint
+	// Start with the point at infinity.
+	result.X.Zero()
+	result.Y.Zero()
+	result.Z.Zero()
 
 	// bytePoints has all 256 byte points for each 8-bit window.  The strategy
 	// is to add up the byte points.  This is best understood by expressing k in
 	// base-256 which it already sort of is.  Each "digit" in the 8-bit window
 	// can be looked up using bytePoints and added together.
-	var pt JacobianPoint
-	for i, byteVal := range k.Bytes() {
-		p := bytePoints[i][byteVal]
-		pt.X.Set(&p[0])
-		pt.Y.Set(&p[1])
-		pt.Z.SetInt(1)
-		AddNonConst(&q, &pt, &q)
+	kb := k.Bytes()
+	for i := 0; i < len(kb); i++ {
+		pt := &bytePoints[i][kb[i]]
+		AddNonConst(result, pt, result)
 	}
-
-	result.Set(&q)
 }
 
 // isOnCurve returns whether or not the affine point (x,y) is on the curve.
diff --git a/dcrec/secp256k1/loadprecomputed.go b/dcrec/secp256k1/loadprecomputed.go
index a5b47990db..91c3d37769 100644
--- a/dcrec/secp256k1/loadprecomputed.go
+++ b/dcrec/secp256k1/loadprecomputed.go
@@ -17,7 +17,7 @@ import (
 
 // bytePointTable describes a table used to house pre-computed values for
 // accelerating scalar base multiplication.
-type bytePointTable [32][256][2]FieldVal
+type bytePointTable [32][256]JacobianPoint
 
 // compressedBytePointsFn is set to a real function by the code generation to
 // return the compressed pre-computed values for accelerating scalar base
@@ -66,12 +66,12 @@ var s256BytePoints = func() func() *bytePointTable {
 		for byteNum := 0; byteNum < len(bytePoints); byteNum++ {
 			// All points in this window.
 			for i := 0; i < len(bytePoints[byteNum]); i++ {
-				px := &bytePoints[byteNum][i][0]
-				py := &bytePoints[byteNum][i][1]
-				px.SetByteSlice(serialized[offset:])
+				p := &bytePoints[byteNum][i]
+				p.X.SetByteSlice(serialized[offset:])
 				offset += 32
-				py.SetByteSlice(serialized[offset:])
+				p.Y.SetByteSlice(serialized[offset:])
 				offset += 32
+				p.Z.SetInt(1)
 			}
 		}
 		data = &bytePoints