[crypto] Use constant-time big integer multiplication

Big integer multiplication currently performs immediate carry
propagation from each step of the long multiplication, relying on the
fact that the overall result has a known maximum value to minimise the
number of carries performed without ever needing to explicitly check
against the result buffer size.

This is not a constant-time algorithm, since the number of carries
performed will be a function of the input values.  We could make it
constant-time by always continuing to propagate the carry until
reaching the end of the result buffer, but this would introduce a
large number of redundant zero carries.
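
As a purely illustrative sketch (not code taken from the previous
implementation), immediate propagation amounts to rippling each carry
along the result until it dies out, so the iteration count depends on
the operand values:

  #include <stdint.h>

  /* Illustrative only: propagate a single carry along the result.
   * The loop stops as soon as the carry becomes zero, so the number
   * of iterations (and hence the running time) depends on the values
   * being multiplied.
   */
  void ripple_carry ( uint32_t *element, uint32_t carry ) {
          while ( carry ) {
                  uint64_t sum = ( ( uint64_t ) *element + carry );
                  *element++ = ( uint32_t ) sum;
                  carry = ( sum >> 32 );
          }
  }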

Require callers of bigint_multiply() to provide a temporary carry
storage buffer, of the same size as the result buffer.  This allows
the carry-out from the accumulation of each double-element product to
be accumulated in the temporary carry space, and then added in via a
single call to bigint_add() after the multiplication is complete.
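
For example, a caller of the bigint_multiply() wrapper now looks
something like the sketch below (sizes and variable names are
arbitrary, initialisation of the operands is omitted, and bigint_t()
and bigint_multiply() are the wrappers from iPXE's bigint.h):

  bigint_t ( 8 ) multiplicand;
  bigint_t ( 8 ) multiplier;
  bigint_t ( 8 + 8 ) result;
  bigint_t ( 8 + 8 ) carry;   /* temporary carry space, same size as result */

  /* ... initialise multiplicand and multiplier ... */
  bigint_multiply ( &multiplicand, &multiplier, &result, &carry );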

Since the structure of big integer multiplication is identical across
all current CPU architectures, provide a single shared implementation
of bigint_multiply().  The architecture-specific operation then
becomes the multiplication of two big integer elements and the
accumulation of the double-element product.
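
As a rough portable-C illustration of that per-element operation
(hypothetical sketch only, assuming 32-bit elements and a 64-bit
intermediate product; the real implementations are per-architecture):

  #include <stdint.h>

  /* Multiply two 32-bit elements, accumulate the 64-bit product into
   * result[0..1], and accumulate the final carry out into the
   * temporary carry element.  (Illustrative portable sketch.)
   */
  static void bigint_multiply_one ( uint32_t multiplicand,
                                    uint32_t multiplier,
                                    uint32_t *result,
                                    uint32_t *carry ) {
          uint64_t product = ( ( uint64_t ) multiplicand * multiplier );
          uint64_t low = ( uint32_t ) product;
          uint64_t high = ( product >> 32 );
          uint64_t sum;

          /* Accumulate low half of the product */
          sum = ( ( uint64_t ) result[0] + low );
          result[0] = ( uint32_t ) sum;

          /* Fold the intermediate carry into the high half: this
           * cannot overflow, since the high half of an n-bit by
           * n-bit product can never have all n bits set.
           */
          high += ( sum >> 32 );

          /* Accumulate high half of the product, recording the carry
           * out in the temporary carry space.
           */
          sum = ( ( uint64_t ) result[1] + high );
          result[1] = ( uint32_t ) sum;
          *carry += ( uint32_t ) ( sum >> 32 );
  }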

Note that any intermediate carry arising from accumulating the lower
half of the double-element product may be added to the upper half of
the double-element product without risk of overflow, since the result
of multiplying two n-bit integers can never have all n bits set in its
upper half.  This simplifies the carry calculations for architectures
such as RISC-V and LoongArch64 that do not have a carry flag.
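As a worked example with 8-bit elements: the largest possible
double-element product is 0xff * 0xff = 0xfe01, whose upper half is
0xfe, so adding an intermediate carry of 1 cannot cause the upper
half to overflow.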

Signed-off-by: Michael Brown <mcb30@ipxe.org>

Author: Michael Brown
Date:   2024-09-19 16:23:32 +01:00
Commit: 3def13265d (parent 59d123658b)

14 changed files with 355 additions and 612 deletions

@@ -75,6 +75,115 @@ void bigint_swap_raw ( bigint_element_t *first0, bigint_element_t *second0,
         }
 }
 
+/**
+ * Multiply big integers
+ *
+ * @v multiplicand0       Element 0 of big integer to be multiplied
+ * @v multiplicand_size   Number of elements in multiplicand
+ * @v multiplier0         Element 0 of big integer to be multiplied
+ * @v multiplier_size     Number of elements in multiplier
+ * @v result0             Element 0 of big integer to hold result
+ * @v carry0              Element 0 of big integer to hold temporary carry
+ */
+void bigint_multiply_raw ( const bigint_element_t *multiplicand0,
+                           unsigned int multiplicand_size,
+                           const bigint_element_t *multiplier0,
+                           unsigned int multiplier_size,
+                           bigint_element_t *result0,
+                           bigint_element_t *carry0 ) {
+        unsigned int result_size = ( multiplicand_size + multiplier_size );
+        const bigint_t ( multiplicand_size ) __attribute__ (( may_alias ))
+                *multiplicand = ( ( const void * ) multiplicand0 );
+        const bigint_t ( multiplier_size ) __attribute__ (( may_alias ))
+                *multiplier = ( ( const void * ) multiplier0 );
+        bigint_t ( result_size ) __attribute__ (( may_alias ))
+                *result = ( ( void * ) result0 );
+        bigint_t ( result_size ) __attribute__ (( may_alias ))
+                *carry = ( ( void * ) carry0 );
+        bigint_element_t multiplicand_element;
+        const bigint_element_t *multiplier_element;
+        bigint_element_t *result_elements;
+        bigint_element_t *carry_element;
+        unsigned int i;
+        unsigned int j;
+
+        /* Zero result and temporary carry space */
+        memset ( result, 0, sizeof ( *result ) );
+        memset ( carry, 0, sizeof ( *carry ) );
+
+        /* Multiply integers one element at a time, adding the double
+         * element directly into the result and accumulating any
+         * overall carry out from this double-element addition into
+         * the temporary carry space.
+         *
+         * We could propagate the carry immediately instead of using a
+         * temporary carry space. However, this would cause the
+         * multiplication to run in non-constant time, which is
+         * undesirable.
+         *
+         * The carry elements can never overflow, provided that the
+         * element size is large enough to accommodate any plausible
+         * big integer. The total number of potential carries (across
+         * all elements) is the sum of the number of elements in the
+         * multiplicand and multiplier. With a 16-bit element size,
+         * this therefore allows for up to a 1Mbit multiplication
+         * result (e.g. a 512kbit integer multiplied by another
+         * 512kbit integer), which is around 100x higher than could be
+         * needed in practice. With a more realistic 32-bit element
+         * size, the limit becomes a totally implausible 128Gbit
+         * multiplication result.
+         */
+        for ( i = 0 ; i < multiplicand_size ; i++ ) {
+                multiplicand_element = multiplicand->element[i];
+                multiplier_element = &multiplier->element[0];
+                result_elements = &result->element[i];
+                carry_element = &carry->element[i];
+                for ( j = 0 ; j < multiplier_size ; j++ ) {
+                        bigint_multiply_one ( multiplicand_element,
+                                              *(multiplier_element++),
+                                              result_elements++,
+                                              carry_element++ );
+                }
+        }
+
+        /* Add the temporary carry into the result. The least
+         * significant element of the carry represents the carry out
+         * from multiplying the least significant elements of the
+         * multiplicand and multiplier, and therefore must be added to
+         * the third-least significant element of the result (i.e. the
+         * carry needs to be shifted left by two elements before being
+         * added to the result).
+         *
+         * The most significant two elements of the carry are
+         * guaranteed to be zero, since:
+         *
+         *   a < 2^{n}, b < 2^{m} => ab < 2^{n+m}
+         *
+         * and the overall result of the multiplication (including
+         * adding in the shifted carries) is therefore guaranteed not
+         * to overflow beyond the end of the result.
+         *
+         * We could avoid this shifting by writing the carry directly
+         * into the "correct" element during the element-by-element
+         * multiplication stage above. However, this would add
+         * complexity to the loop since we would have to arrange for
+         * the (provably zero) most significant two carry out results
+         * to be discarded, in order to avoid writing beyond the end
+         * of the temporary carry space.
+         *
+         * Performing the logical shift is essentially free, since we
+         * simply adjust the element pointers.
+         *
+         * To avoid requiring additional checks in each architecture's
+         * implementation of bigint_add_raw(), we explicitly avoid
+         * calling bigint_add_raw() with a size of zero.
+         */
+        if ( result_size > 2 ) {
+                bigint_add_raw ( &carry->element[0], &result->element[2],
+                                 ( result_size - 2 ) );
+        }
+}
+
 /**
  * Perform modular multiplication of big integers
  *
@@ -100,7 +209,10 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0,
                 ( ( void * ) result0 );
         struct {
                 bigint_t ( size * 2 ) result;
-                bigint_t ( size * 2 ) modulus;
+                union {
+                        bigint_t ( size * 2 ) modulus;
+                        bigint_t ( size * 2 ) carry;
+                };
         } *temp = tmp;
         int rotation;
         int i;
@@ -113,7 +225,8 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0,
         /* Perform multiplication */
         profile_start ( &bigint_mod_multiply_multiply_profiler );
-        bigint_multiply ( multiplicand, multiplier, &temp->result );
+        bigint_multiply ( multiplicand, multiplier, &temp->result,
+                          &temp->carry );
         profile_stop ( &bigint_mod_multiply_multiply_profiler );
 
         /* Rescale modulus to match result */
@@ -43,7 +43,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  * Storage size of each big integer            128             40
  * (in bytes)
  *
- * Stack usage for key exchange               1144            360
+ * Stack usage for key exchange               1144            424
  * (in bytes, large objects only)
  *
  * Cost of big integer addition                 16              5
@@ -207,35 +207,60 @@ union x25519_multiply_step3 {
  * We overlap the buffers used by each step of the multiplication
  * calculation to reduce the total stack space required:
  *
- * |--------------------------------------------------------|
- * | <- pad -> | <------------ step 1 result -------------> |
- * |           | <- low 256 bits -> | <-- high 260 bits --> |
- * | <------- step 2 result ------> | <-- step 3 result --> |
- * |--------------------------------------------------------|
+ * |--------------------------------------------------------------------------|
+ * | <------- step 1 carry ------> | <----------- step 1 result ------------> |
+ * |                               | <- low 256 bits -> | <- high 260 bits -> |
+ * | <- step 2 carry -> | <-- step 2 result --> | <pad> |                     |
+ * | <- s3 carry -> | <--------- pad ---------> | <- step 3 result -> |       |
+ * |--------------------------------------------------------------------------|
  */
 union x25519_multiply_workspace {
-        /** Step 1 result */
+        /** Step 1 */
         struct {
-                /** Padding to avoid collision between steps 1 and 2
-                 *
-                 * The step 2 multiplication consumes the high 260
-                 * bits of step 1, and so the step 2 multiplication
-                 * result must not overlap this portion of the step 1
-                 * result.
-                 */
-                uint8_t pad[ sizeof ( union x25519_multiply_step2 ) -
-                             offsetof ( union x25519_multiply_step1,
-                                        parts.high_260bit ) ];
+                /** Step 1 temporary carry workspace */
+                union x25519_multiply_step1 carry;
                 /** Step 1 result */
-                union x25519_multiply_step1 step1;
-        } __attribute__ (( packed ));
-        /** Steps 2 and 3 results */
+                union x25519_multiply_step1 result;
+        } __attribute__ (( packed )) step1;
+        /** Step 2
+         *
+         * The step 2 multiplication consumes the high 260 bits of
+         * step 1, and so the step 2 multiplication result (and
+         * temporary carry workspace) must not overlap this portion of
+         * the step 1 result.
+         */
         struct {
+                /** Step 2 temporary carry workspace */
+                union x25519_multiply_step2 carry;
                 /** Step 2 result */
-                union x25519_multiply_step2 step2;
+                union x25519_multiply_step2 result;
+                /** Avoid collision between step 1 result and step 2 result */
+                uint8_t pad[ ( int )
+                             ( sizeof ( union x25519_multiply_step1 ) +
+                               offsetof ( union x25519_multiply_step1,
+                                          parts.high_260bit ) -
+                               sizeof ( union x25519_multiply_step2 ) -
+                               sizeof ( union x25519_multiply_step2 ) ) ];
+        } __attribute__ (( packed )) step2;
+        /** Step 3
+         *
+         * The step 3 multiplication consumes the high 11 bits of step
+         * 2, and so the step 3 multiplication result (and temporary
+         * carry workspace) must not overlap this portion of the step
+         * 2 result.
+         */
+        struct {
+                /** Step 3 temporary carry workspace */
+                union x25519_multiply_step3 carry;
+                /** Avoid collision between step 2 result and step 3 carry */
+                uint8_t pad1[ ( int )
+                              ( sizeof ( union x25519_multiply_step2 ) -
+                                sizeof ( union x25519_multiply_step3 ) ) ];
+                /** Avoid collision between step 2 result and step 3 result */
+                uint8_t pad2[ sizeof ( union x25519_multiply_step2 ) ];
                 /** Step 3 result */
-                union x25519_multiply_step3 step3;
-        } __attribute__ (( packed ));
+                union x25519_multiply_step3 result;
+        } __attribute__ (( packed )) step3;
 };
 
 /** An X25519 elliptic curve point in projective coordinates
@@ -426,9 +451,9 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
                        const union x25519_oct258 *multiplier,
                        union x25519_quad257 *result ) {
         union x25519_multiply_workspace tmp;
-        union x25519_multiply_step1 *step1 = &tmp.step1;
-        union x25519_multiply_step2 *step2 = &tmp.step2;
-        union x25519_multiply_step3 *step3 = &tmp.step3;
+        union x25519_multiply_step1 *step1 = &tmp.step1.result;
+        union x25519_multiply_step2 *step2 = &tmp.step2.result;
+        union x25519_multiply_step3 *step3 = &tmp.step3.result;
 
         /* Step 1: perform raw multiplication
          *
@@ -439,7 +464,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
          */
         static_assert ( sizeof ( step1->product ) >= sizeof ( step1->parts ) );
         bigint_multiply ( &multiplicand->value, &multiplier->value,
-                          &step1->product );
+                          &step1->product, &tmp.step1.carry.product );
 
         /* Step 2: reduce high-order 516-256=260 bits of step 1 result
          *
@@ -465,7 +490,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
         static_assert ( sizeof ( step2->product ) >= sizeof ( step2->parts ) );
         bigint_grow ( &step1->parts.low_256bit, &result->value );
         bigint_multiply ( &step1->parts.high_260bit, &x25519_reduce_256,
-                          &step2->product );
+                          &step2->product, &tmp.step2.carry.product );
         bigint_add ( &result->value, &step2->value );
 
         /* Step 3: reduce high-order 267-256=11 bits of step 2 result
@@ -503,7 +528,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
         memset ( &step3->value, 0, sizeof ( step3->value ) );
         bigint_grow ( &step2->parts.low_256bit, &result->value );
         bigint_multiply ( &step2->parts.high_11bit, &x25519_reduce_256,
-                          &step3->product );
+                          &step3->product, &tmp.step3.carry.product );
         bigint_add ( &step3->value, &result->value );
 
         /* Step 1 calculates the product of the input operands, and