From aae0128d9488050bd5791726e2757db1448d392a Mon Sep 17 00:00:00 2001
From: Dave Collins
Date: Wed, 9 Mar 2022 01:10:37 -0600
Subject: [PATCH] secp256k1: Reduce scalar base mult copies.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Profiling shows that around 7.5% of the time in scalar base
multiplication is attributed to duffcopy. Upon further examination, this
is the result of a combination of the range statement making copies of
the bytes and the need to construct a Jacobian point from the individual
field values stored in the in-memory byte points table.

This optimizes the function to avoid that as follows:

- Perform the conversion to Jacobian once when the affine byte table is
  decompressed from the stored values
- Make use of those Jacobian points directly
- Use an indexed for loop instead of a range over the bytes
- Perform the calculation using the result variable directly instead of
  via a local variable that is copied to the result

The following benchmark results show the speedup is in line with the
expected gains per the profiling results:

name                    old time/op  new time/op  delta
------------------------------------------------------------------------------
ScalarBaseMultNonConst  24.1µs ±22%  22.5µs ± 2%  -6.97%  (p=0.000 n=98+96)
---
 dcrec/secp256k1/curve.go           | 19 ++++++++-----------
 dcrec/secp256k1/loadprecomputed.go | 10 +++++-----
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/dcrec/secp256k1/curve.go b/dcrec/secp256k1/curve.go
index 354f5826ca..c9d47f3078 100644
--- a/dcrec/secp256k1/curve.go
+++ b/dcrec/secp256k1/curve.go
@@ -1223,23 +1223,20 @@ func ScalarMultNonConst(k *ModNScalar, point, result *JacobianPoint) {
 func ScalarBaseMultNonConst(k *ModNScalar, result *JacobianPoint) {
 	bytePoints := s256BytePoints()
 
-	// Point Q = ∞ (point at infinity).
-	var q JacobianPoint
+	// Start with the point at infinity.
+	result.X.Zero()
+	result.Y.Zero()
+	result.Z.Zero()
 
 	// bytePoints has all 256 byte points for each 8-bit window. The strategy
 	// is to add up the byte points. This is best understood by expressing k in
 	// base-256 which it already sort of is. Each "digit" in the 8-bit window
 	// can be looked up using bytePoints and added together.
-	var pt JacobianPoint
-	for i, byteVal := range k.Bytes() {
-		p := bytePoints[i][byteVal]
-		pt.X.Set(&p[0])
-		pt.Y.Set(&p[1])
-		pt.Z.SetInt(1)
-		AddNonConst(&q, &pt, &q)
+	kb := k.Bytes()
+	for i := 0; i < len(kb); i++ {
+		pt := &bytePoints[i][kb[i]]
+		AddNonConst(result, pt, result)
 	}
-
-	result.Set(&q)
 }
 
 // isOnCurve returns whether or not the affine point (x,y) is on the curve.
diff --git a/dcrec/secp256k1/loadprecomputed.go b/dcrec/secp256k1/loadprecomputed.go
index a5b47990db..91c3d37769 100644
--- a/dcrec/secp256k1/loadprecomputed.go
+++ b/dcrec/secp256k1/loadprecomputed.go
@@ -17,7 +17,7 @@ import (
 
 // bytePointTable describes a table used to house pre-computed values for
 // accelerating scalar base multiplication.
-type bytePointTable [32][256][2]FieldVal
+type bytePointTable [32][256]JacobianPoint
 
 // compressedBytePointsFn is set to a real function by the code generation to
 // return the compressed pre-computed values for accelerating scalar base
@@ -66,12 +66,12 @@ var s256BytePoints = func() func() *bytePointTable {
 		for byteNum := 0; byteNum < len(bytePoints); byteNum++ {
 			// All points in this window.
 			for i := 0; i < len(bytePoints[byteNum]); i++ {
-				px := &bytePoints[byteNum][i][0]
-				py := &bytePoints[byteNum][i][1]
-				px.SetByteSlice(serialized[offset:])
+				p := &bytePoints[byteNum][i]
+				p.X.SetByteSlice(serialized[offset:])
 				offset += 32
-				py.SetByteSlice(serialized[offset:])
+				p.Y.SetByteSlice(serialized[offset:])
 				offset += 32
+				p.Z.SetInt(1)
 			}
 		}
 		data = &bytePoints