[crypto] Use constant-time big integer multiplication

Big integer multiplication currently performs immediate carry
propagation from each step of the long multiplication, relying on the
fact that the overall result has a known maximum value to minimise the
number of carries performed without ever needing to explicitly check
against the result buffer size.

This is not a constant-time algorithm, since the number of carries
performed will be a function of the input values.  We could make it
constant-time by always continuing to propagate the carry until
reaching the end of the result buffer, but this would introduce a
large number of redundant zero carries.
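
As a purely illustrative sketch (not code taken from the previous
implementation), immediate propagation amounts to rippling each carry
along the result until it dies out, so the iteration count depends on
the operand values:

  #include <stdint.h>

  /* Illustrative only: propagate a single carry along the result.
   * The loop stops as soon as the carry becomes zero, so the number
   * of iterations (and hence the running time) depends on the values
   * being multiplied.
   */
  void ripple_carry ( uint32_t *element, uint32_t carry ) {
          while ( carry ) {
                  uint64_t sum = ( ( uint64_t ) *element + carry );
                  *element++ = ( uint32_t ) sum;
                  carry = ( sum >> 32 );
          }
  }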

Require callers of bigint_multiply() to provide a temporary carry
storage buffer, of the same size as the result buffer.  This allows
the carry-out from the accumulation of each double-element product to
be accumulated in the temporary carry space, and then added in via a
single call to bigint_add() after the multiplication is complete.
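
For example, a caller of the bigint_multiply() wrapper now looks
something like the sketch below (sizes and variable names are
arbitrary, initialisation of the operands is omitted, and bigint_t()
and bigint_multiply() are the wrappers from iPXE's bigint.h):

  bigint_t ( 8 ) multiplicand;
  bigint_t ( 8 ) multiplier;
  bigint_t ( 8 + 8 ) result;
  bigint_t ( 8 + 8 ) carry;   /* temporary carry space, same size as result */

  /* ... initialise multiplicand and multiplier ... */
  bigint_multiply ( &multiplicand, &multiplier, &result, &carry );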

Since the structure of big integer multiplication is identical across
all current CPU architectures, provide a single shared implementation
of bigint_multiply().  The architecture-specific operation then
becomes the multiplication of two big integer elements and the
accumulation of the double-element product.
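
As a rough portable-C illustration of that per-element operation
(hypothetical sketch only, assuming 32-bit elements and a 64-bit
intermediate product; the real implementations are per-architecture):

  #include <stdint.h>

  /* Multiply two 32-bit elements, accumulate the 64-bit product into
   * result[0..1], and accumulate the final carry out into the
   * temporary carry element.  (Illustrative portable sketch.)
   */
  static void bigint_multiply_one ( uint32_t multiplicand,
                                    uint32_t multiplier,
                                    uint32_t *result,
                                    uint32_t *carry ) {
          uint64_t product = ( ( uint64_t ) multiplicand * multiplier );
          uint64_t low = ( uint32_t ) product;
          uint64_t high = ( product >> 32 );
          uint64_t sum;

          /* Accumulate low half of the product */
          sum = ( ( uint64_t ) result[0] + low );
          result[0] = ( uint32_t ) sum;

          /* Fold the intermediate carry into the high half: this
           * cannot overflow, since the high half of an n-bit by
           * n-bit product can never have all n bits set.
           */
          high += ( sum >> 32 );

          /* Accumulate high half of the product, recording the carry
           * out in the temporary carry space.
           */
          sum = ( ( uint64_t ) result[1] + high );
          result[1] = ( uint32_t ) sum;
          *carry += ( uint32_t ) ( sum >> 32 );
  }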

Note that any intermediate carry arising from accumulating the lower
half of the double-element product may be added to the upper half of
the double-element product without risk of overflow, since the result
of multiplying two n-bit integers can never have all n bits set in its
upper half.  This simplifies the carry calculations for architectures
such as RISC-V and LoongArch64 that do not have a carry flag.
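As a worked example with 8-bit elements: the largest possible
double-element product is 0xff * 0xff = 0xfe01, whose upper half is
0xfe, so adding an intermediate carry of 1 cannot cause the upper
half to overflow.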

Signed-off-by: Michael Brown <mcb30@ipxe.org>

Author: Michael Brown
Date:   2024-09-19 16:23:32 +01:00
Commit: 3def13265d (parent 59d123658b)

14 changed files with 355 additions and 612 deletions

@@ -75,6 +75,115 @@ void bigint_swap_raw ( bigint_element_t *first0, bigint_element_t *second0,
         }
 }
 
+/**
+ * Multiply big integers
+ *
+ * @v multiplicand0       Element 0 of big integer to be multiplied
+ * @v multiplicand_size   Number of elements in multiplicand
+ * @v multiplier0         Element 0 of big integer to be multiplied
+ * @v multiplier_size     Number of elements in multiplier
+ * @v result0             Element 0 of big integer to hold result
+ * @v carry0              Element 0 of big integer to hold temporary carry
+ */
+void bigint_multiply_raw ( const bigint_element_t *multiplicand0,
+                           unsigned int multiplicand_size,
+                           const bigint_element_t *multiplier0,
+                           unsigned int multiplier_size,
+                           bigint_element_t *result0,
+                           bigint_element_t *carry0 ) {
+        unsigned int result_size = ( multiplicand_size + multiplier_size );
+        const bigint_t ( multiplicand_size ) __attribute__ (( may_alias ))
+                *multiplicand = ( ( const void * ) multiplicand0 );
+        const bigint_t ( multiplier_size ) __attribute__ (( may_alias ))
+                *multiplier = ( ( const void * ) multiplier0 );
+        bigint_t ( result_size ) __attribute__ (( may_alias ))
+                *result = ( ( void * ) result0 );
+        bigint_t ( result_size ) __attribute__ (( may_alias ))
+                *carry = ( ( void * ) carry0 );
+        bigint_element_t multiplicand_element;
+        const bigint_element_t *multiplier_element;
+        bigint_element_t *result_elements;
+        bigint_element_t *carry_element;
+        unsigned int i;
+        unsigned int j;
+
+        /* Zero result and temporary carry space */
+        memset ( result, 0, sizeof ( *result ) );
+        memset ( carry, 0, sizeof ( *carry ) );
+
+        /* Multiply integers one element at a time, adding the double
+         * element directly into the result and accumulating any
+         * overall carry out from this double-element addition into
+         * the temporary carry space.
+         *
+         * We could propagate the carry immediately instead of using a
+         * temporary carry space. However, this would cause the
+         * multiplication to run in non-constant time, which is
+         * undesirable.
+         *
+         * The carry elements can never overflow, provided that the
+         * element size is large enough to accommodate any plausible
+         * big integer. The total number of potential carries (across
+         * all elements) is the sum of the number of elements in the
+         * multiplicand and multiplier. With a 16-bit element size,
+         * this therefore allows for up to a 1Mbit multiplication
+         * result (e.g. a 512kbit integer multiplied by another
+         * 512kbit integer), which is around 100x higher than could be
+         * needed in practice. With a more realistic 32-bit element
+         * size, the limit becomes a totally implausible 128Gbit
+         * multiplication result.
+         */
+        for ( i = 0 ; i < multiplicand_size ; i++ ) {
+                multiplicand_element = multiplicand->element[i];
+                multiplier_element = &multiplier->element[0];
+                result_elements = &result->element[i];
+                carry_element = &carry->element[i];
+                for ( j = 0 ; j < multiplier_size ; j++ ) {
+                        bigint_multiply_one ( multiplicand_element,
+                                              *(multiplier_element++),
+                                              result_elements++,
+                                              carry_element++ );
+                }
+        }
+
+        /* Add the temporary carry into the result. The least
+         * significant element of the carry represents the carry out
+         * from multiplying the least significant elements of the
+         * multiplicand and multiplier, and therefore must be added to
+         * the third-least significant element of the result (i.e. the
+         * carry needs to be shifted left by two elements before being
+         * added to the result).
+         *
+         * The most significant two elements of the carry are
+         * guaranteed to be zero, since:
+         *
+         *   a < 2^{n}, b < 2^{m} => ab < 2^{n+m}
+         *
+         * and the overall result of the multiplication (including
+         * adding in the shifted carries) is therefore guaranteed not
+         * to overflow beyond the end of the result.
+         *
+         * We could avoid this shifting by writing the carry directly
+         * into the "correct" element during the element-by-element
+         * multiplication stage above. However, this would add
+         * complexity to the loop since we would have to arrange for
+         * the (provably zero) most significant two carry out results
+         * to be discarded, in order to avoid writing beyond the end
+         * of the temporary carry space.
+         *
+         * Performing the logical shift is essentially free, since we
+         * simply adjust the element pointers.
+         *
+         * To avoid requiring additional checks in each architecture's
+         * implementation of bigint_add_raw(), we explicitly avoid
+         * calling bigint_add_raw() with a size of zero.
+         */
+        if ( result_size > 2 ) {
+                bigint_add_raw ( &carry->element[0], &result->element[2],
+                                 ( result_size - 2 ) );
+        }
+}
+
 /**
  * Perform modular multiplication of big integers
  *
@@ -100,7 +209,10 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0,
                 ( ( void * ) result0 );
         struct {
                 bigint_t ( size * 2 ) result;
-                bigint_t ( size * 2 ) modulus;
+                union {
+                        bigint_t ( size * 2 ) modulus;
+                        bigint_t ( size * 2 ) carry;
+                };
         } *temp = tmp;
         int rotation;
         int i;
@@ -113,7 +225,8 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0,
         /* Perform multiplication */
         profile_start ( &bigint_mod_multiply_multiply_profiler );
-        bigint_multiply ( multiplicand, multiplier, &temp->result );
+        bigint_multiply ( multiplicand, multiplier, &temp->result,
+                          &temp->carry );
         profile_stop ( &bigint_mod_multiply_multiply_profiler );
 
         /* Rescale modulus to match result */
@@ -43,7 +43,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  * Storage size of each big integer            128             40
  * (in bytes)
  *
- * Stack usage for key exchange               1144            360
+ * Stack usage for key exchange               1144            424
  * (in bytes, large objects only)
  *
  * Cost of big integer addition                 16              5
@@ -207,35 +207,60 @@ union x25519_multiply_step3 {
  * We overlap the buffers used by each step of the multiplication
  * calculation to reduce the total stack space required:
  *
- * |--------------------------------------------------------|
- * | <- pad -> | <------------ step 1 result -------------> |
- * |           | <- low 256 bits -> | <-- high 260 bits --> |
- * | <------- step 2 result ------> | <-- step 3 result --> |
- * |--------------------------------------------------------|
+ * |--------------------------------------------------------------------------|
+ * | <------- step 1 carry ------> | <----------- step 1 result ------------> |
+ * |                               | <- low 256 bits -> | <- high 260 bits -> |
+ * | <- step 2 carry -> | <-- step 2 result --> | <pad> |                     |
+ * | <- s3 carry -> | <--------- pad ---------> | <- step 3 result -> |       |
+ * |--------------------------------------------------------------------------|
  */
 union x25519_multiply_workspace {
-        /** Step 1 result */
+        /** Step 1 */
         struct {
-                /** Padding to avoid collision between steps 1 and 2
-                 *
-                 * The step 2 multiplication consumes the high 260
-                 * bits of step 1, and so the step 2 multiplication
-                 * result must not overlap this portion of the step 1
-                 * result.
-                 */
-                uint8_t pad[ sizeof ( union x25519_multiply_step2 ) -
-                             offsetof ( union x25519_multiply_step1,
-                                        parts.high_260bit ) ];
+                /** Step 1 temporary carry workspace */
+                union x25519_multiply_step1 carry;
                 /** Step 1 result */
-                union x25519_multiply_step1 step1;
-        } __attribute__ (( packed ));
-        /** Steps 2 and 3 results */
+                union x25519_multiply_step1 result;
+        } __attribute__ (( packed )) step1;
+        /** Step 2
+         *
+         * The step 2 multiplication consumes the high 260 bits of
+         * step 1, and so the step 2 multiplication result (and
+         * temporary carry workspace) must not overlap this portion of
+         * the step 1 result.
+         */
         struct {
+                /** Step 2 temporary carry workspace */
+                union x25519_multiply_step2 carry;
                 /** Step 2 result */
-                union x25519_multiply_step2 step2;
+                union x25519_multiply_step2 result;
+                /** Avoid collision between step 1 result and step 2 result */
+                uint8_t pad[ ( int )
+                             ( sizeof ( union x25519_multiply_step1 ) +
+                               offsetof ( union x25519_multiply_step1,
+                                          parts.high_260bit ) -
+                               sizeof ( union x25519_multiply_step2 ) -
+                               sizeof ( union x25519_multiply_step2 ) ) ];
+        } __attribute__ (( packed )) step2;
+        /** Step 3
+         *
+         * The step 3 multiplication consumes the high 11 bits of step
+         * 2, and so the step 3 multiplication result (and temporary
+         * carry workspace) must not overlap this portion of the step
+         * 2 result.
+         */
+        struct {
+                /** Step 3 temporary carry workspace */
+                union x25519_multiply_step3 carry;
+                /** Avoid collision between step 2 result and step 3 carry */
+                uint8_t pad1[ ( int )
+                              ( sizeof ( union x25519_multiply_step2 ) -
+                                sizeof ( union x25519_multiply_step3 ) ) ];
+                /** Avoid collision between step 2 result and step 3 result */
+                uint8_t pad2[ sizeof ( union x25519_multiply_step2 ) ];
                 /** Step 3 result */
-                union x25519_multiply_step3 step3;
-        } __attribute__ (( packed ));
+                union x25519_multiply_step3 result;
+        } __attribute__ (( packed )) step3;
 };
 
 /** An X25519 elliptic curve point in projective coordinates
@@ -426,9 +451,9 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
                        const union x25519_oct258 *multiplier,
                        union x25519_quad257 *result ) {
         union x25519_multiply_workspace tmp;
-        union x25519_multiply_step1 *step1 = &tmp.step1;
-        union x25519_multiply_step2 *step2 = &tmp.step2;
-        union x25519_multiply_step3 *step3 = &tmp.step3;
+        union x25519_multiply_step1 *step1 = &tmp.step1.result;
+        union x25519_multiply_step2 *step2 = &tmp.step2.result;
+        union x25519_multiply_step3 *step3 = &tmp.step3.result;
 
         /* Step 1: perform raw multiplication
          *
@@ -439,7 +464,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
          */
         static_assert ( sizeof ( step1->product ) >= sizeof ( step1->parts ) );
         bigint_multiply ( &multiplicand->value, &multiplier->value,
-                          &step1->product );
+                          &step1->product, &tmp.step1.carry.product );
 
         /* Step 2: reduce high-order 516-256=260 bits of step 1 result
          *
@@ -465,7 +490,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
         static_assert ( sizeof ( step2->product ) >= sizeof ( step2->parts ) );
         bigint_grow ( &step1->parts.low_256bit, &result->value );
         bigint_multiply ( &step1->parts.high_260bit, &x25519_reduce_256,
-                          &step2->product );
+                          &step2->product, &tmp.step2.carry.product );
         bigint_add ( &result->value, &step2->value );
 
         /* Step 3: reduce high-order 267-256=11 bits of step 2 result
@@ -503,7 +528,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand,
         memset ( &step3->value, 0, sizeof ( step3->value ) );
         bigint_grow ( &step2->parts.low_256bit, &result->value );
         bigint_multiply ( &step2->parts.high_11bit, &x25519_reduce_256,
-                          &step3->product );
+                          &step3->product, &tmp.step3.carry.product );
         bigint_add ( &step3->value, &result->value );
 
         /* Step 1 calculates the product of the input operands, and