1 files changed, 758 insertions, 81 deletions
diff --git a/src/crypto/bigint.c b/src/crypto/bigint.c
index 656f979e5..5d2f7b560 100644
--- a/src/crypto/bigint.c
+++ b/src/crypto/bigint.c
@@ -22,11 +22,12 @@
  */
 
 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+FILE_SECBOOT ( PERMITTED );
 
 #include <stdint.h>
 #include <string.h>
 #include <assert.h>
-#include <ipxe/profile.h>
+#include <stdio.h>
 #include <ipxe/bigint.h>
 
 /** @file
@@ -34,21 +35,51 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  * Big integer support
  */
 
-/** Modular multiplication overall profiler */
-static struct profiler bigint_mod_multiply_profiler __profiler =
-	{ .name = "bigint_mod_multiply" };
+/** Minimum number of least significant bytes included in transcription */
+#define BIGINT_NTOA_LSB_MIN 16
 
-/** Modular multiplication multiply step profiler */
-static struct profiler bigint_mod_multiply_multiply_profiler __profiler =
-	{ .name = "bigint_mod_multiply.multiply" };
+/**
+ * Transcribe big integer (for debugging)
+ *
+ * @v value0		Element 0 of big integer to be transcribed
+ * @v size		Number of elements
+ * @ret string		Big integer in string form (may be abbreviated)
+ */
+const char * bigint_ntoa_raw ( const bigint_element_t *value0,
+			       unsigned int size ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*value = ( ( const void * ) value0 );
+	bigint_element_t element;
+	static char buf[256];
+	unsigned int count;
+	int threshold;
+	uint8_t byte;
+	char *tmp;
+	int i;
+
+	/* Calculate abbreviation threshold, if any */
+	count = ( size * sizeof ( element ) );
+	threshold = count;
+	if ( count >= ( ( sizeof ( buf ) - 1 /* NUL */ ) / 2 /* "xx" */ ) ) {
+		threshold -= ( ( sizeof ( buf ) - 3 /* "..." */
+				 - ( BIGINT_NTOA_LSB_MIN * 2 /* "xx" */ )
+				 - 1 /* NUL */ ) / 2 /* "xx" */ );
+	}
 
-/** Modular multiplication rescale step profiler */
-static struct profiler bigint_mod_multiply_rescale_profiler __profiler =
-	{ .name = "bigint_mod_multiply.rescale" };
+	/* Transcribe bytes, abbreviating with "..." if necessary */
+	for ( tmp = buf, i = ( count - 1 ) ; i >= 0 ; i-- ) {
+		element = value->element[ i / sizeof ( element ) ];
+		byte = ( element >> ( 8 * ( i % sizeof ( element ) ) ) );
+		tmp += sprintf ( tmp, "%02x", byte );
+		if ( i == threshold ) {
+			tmp += sprintf ( tmp, "..." );
+			i = BIGINT_NTOA_LSB_MIN;
+		}
+	}
+	assert ( tmp < ( buf + sizeof ( buf ) ) );
 
-/** Modular multiplication subtract step profiler */
-static struct profiler bigint_mod_multiply_subtract_profiler __profiler =
-	{ .name = "bigint_mod_multiply.subtract" };
+	return buf;
+}
 
 /**
  * Conditionally swap big integers (in constant time)
@@ -76,72 +107,635 @@ void bigint_swap_raw ( bigint_element_t *first0, bigint_element_t *second0,
 }
 
 /**
- * Perform modular multiplication of big integers
+ * Multiply big integers
  *
  * @v multiplicand0	Element 0 of big integer to be multiplied
+ * @v multiplicand_size	Number of elements in multiplicand
  * @v multiplier0	Element 0 of big integer to be multiplied
+ * @v multiplier_size	Number of elements in multiplier
+ * @v result0		Element 0 of big integer to hold result
+ */
+void bigint_multiply_raw ( const bigint_element_t *multiplicand0,
+			   unsigned int multiplicand_size,
+			   const bigint_element_t *multiplier0,
+			   unsigned int multiplier_size,
+			   bigint_element_t *result0 ) {
+	unsigned int result_size = ( multiplicand_size + multiplier_size );
+	const bigint_t ( multiplicand_size ) __attribute__ (( may_alias ))
+		*multiplicand = ( ( const void * ) multiplicand0 );
+	const bigint_t ( multiplier_size ) __attribute__ (( may_alias ))
+		*multiplier = ( ( const void * ) multiplier0 );
+	bigint_t ( result_size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	bigint_element_t multiplicand_element;
+	const bigint_element_t *multiplier_element;
+	bigint_element_t *result_element;
+	bigint_element_t carry_element;
+	unsigned int i;
+	unsigned int j;
+
+	/* Zero required portion of result
+	 *
+	 * All elements beyond the length of the multiplier will be
+	 * written before they are read, and so do not need to be
+	 * zeroed in advance.
+	 */
+	memset ( result, 0, sizeof ( *multiplier ) );
+
+	/* Multiply integers one element at a time, adding the low
+	 * half of the double-element product directly into the
+	 * result, and maintaining a running single-element carry.
+	 *
+	 * The running carry can never overflow beyond a single
+	 * element.  At each step, the calculation we perform is:
+	 *
+	 *   carry:result[i+j] := ( ( multiplicand[i] * multiplier[j] )
+	 *                          + result[i+j] + carry )
+	 *
+	 * The maximum value (for n-bit elements) is therefore:
+	 *
+	 *   (2^n - 1)*(2^n - 1) + (2^n - 1) + (2^n - 1) = 2^(2n) - 1
+	 *
+	 * This is precisely the maximum value for a 2n-bit integer,
+	 * and so the carry out remains within the range of an n-bit
+	 * integer, i.e. a single element.
+	 */
+	for ( i = 0 ; i < multiplicand_size ; i++ ) {
+		multiplicand_element = multiplicand->element[i];
+		multiplier_element = &multiplier->element[0];
+		result_element = &result->element[i];
+		carry_element = 0;
+		for ( j = 0 ; j < multiplier_size ; j++ ) {
+			bigint_multiply_one ( multiplicand_element,
+					      *(multiplier_element++),
+					      result_element++,
+					      &carry_element );
+		}
+		*result_element = carry_element;
+	}
+}
+
+/**
+ * Reduce big integer R^2 modulo N
+ *
  * @v modulus0		Element 0 of big integer modulus
  * @v result0		Element 0 of big integer to hold result
- * @v size		Number of elements in base, modulus, and result
- * @v tmp		Temporary working space
+ * @v size		Number of elements in modulus and result
+ *
+ * Reduce the value R^2 modulo N, where R=2^n and n is the number of
+ * bits in the representation of the modulus N, including any leading
+ * zero bits.
  */
-void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0,
-			       const bigint_element_t *multiplier0,
-			       const bigint_element_t *modulus0,
-			       bigint_element_t *result0,
-			       unsigned int size, void *tmp ) {
-	const bigint_t ( size ) __attribute__ (( may_alias )) *multiplicand =
-		( ( const void * ) multiplicand0 );
-	const bigint_t ( size ) __attribute__ (( may_alias )) *multiplier =
-		( ( const void * ) multiplier0 );
-	const bigint_t ( size ) __attribute__ (( may_alias )) *modulus =
-		( ( const void * ) modulus0 );
-	bigint_t ( size ) __attribute__ (( may_alias )) *result =
-		( ( void * ) result0 );
-	struct {
-		bigint_t ( size * 2 ) result;
-		bigint_t ( size * 2 ) modulus;
-	} *temp = tmp;
-	int rotation;
-	int i;
+void bigint_reduce_raw ( const bigint_element_t *modulus0,
+			 bigint_element_t *result0, unsigned int size ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*modulus = ( ( const void * ) modulus0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	const unsigned int width = ( 8 * sizeof ( bigint_element_t ) );
+	unsigned int shift;
+	int max;
+	int sign;
+	int msb;
+	int carry;
+
+	/* We have the constants:
+	 *
+	 *    N = modulus
+	 *
+	 *    n = number of bits in the modulus (including any leading zeros)
+	 *
+	 *    R = 2^n
+	 *
+	 * Let r be the extension of the n-bit result register by a
+	 * separate two's complement sign bit, such that -R <= r < R,
+	 * and define:
+	 *
+	 *    x = r * 2^k
+	 *
+	 * as the value being reduced modulo N, where k is a
+	 * non-negative integer bit shift.
+	 *
+	 * We want to reduce the initial value R^2=2^(2n), which we
+	 * may trivially represent using r=1 and k=2n.
+	 *
+	 * We then iterate over decrementing k, maintaining the loop
+	 * invariant:
+	 *
+	 *    -N <= r < N
+	 *
+	 * On each iteration we must first double r, to compensate for
+	 * having decremented k:
+	 *
+	 *    k' = k - 1
+	 *
+	 *    r' = 2r
+	 *
+	 *    x = r * 2^k = 2r * 2^(k-1) = r' * 2^k'
+	 *
+	 * Note that doubling the n-bit result register will create a
+	 * value of n+1 bits: this extra bit needs to be handled
+	 * separately during the calculation.
+	 *
+	 * We then subtract N (if r is currently non-negative) or add
+	 * N (if r is currently negative) to restore the loop
+	 * invariant:
+	 *
+	 *      0 <= r < N    =>   r" = 2r - N    =>   -N <= r" < N
+	 *     -N <= r < 0    =>   r" = 2r + N    =>   -N <= r" < N
+	 *
+	 * Note that since N may use all n bits, the most significant
+	 * bit of the n-bit result register is not a valid two's
+	 * complement sign bit for r: the extra sign bit therefore
+	 * also needs to be handled separately.
+	 *
+	 * Once we reach k=0, we have x=r and therefore:
+	 *
+	 *    -N <= x < N
+	 *
+	 * After this last loop iteration (with k=0), we may need to
+	 * add a single multiple of N to ensure that x is positive,
+	 * i.e. lies within the range 0 <= x < N.
+	 *
+	 * Since neither the modulus nor the value R^2 are secret, we
+	 * may elide approximately half of the total number of
+	 * iterations by constructing the initial representation of
+	 * R^2 as r=2^m and k=2n-m (for some m such that 2^m < N).
+	 */
+
+	/* Initialise x=R^2 */
+	memset ( result, 0, sizeof ( *result ) );
+	max = ( bigint_max_set_bit ( modulus ) - 2 );
+	if ( max < 0 ) {
+		/* Degenerate case of N=0 or N=1: return a zero result */
+		return;
+	}
+	bigint_set_bit ( result, max );
+	shift = ( ( 2 * size * width ) - max );
+	sign = 0;
+
+	/* Iterate as described above */
+	while ( shift-- ) {
+
+		/* Calculate 2r, storing extra bit separately */
+		msb = bigint_shl ( result );
+
+		/* Add or subtract N according to current sign */
+		if ( sign ) {
+			carry = bigint_add ( modulus, result );
+		} else {
+			carry = bigint_subtract ( modulus, result );
+		}
+
+		/* Calculate new sign of result
+		 *
+		 * We know the result lies in the range -N <= r < N
+		 * and so the tuple (old sign, msb, carry) cannot ever
+		 * take the values (0, 1, 0) or (1, 0, 0).  We can
+		 * therefore treat these as don't-care inputs, which
+		 * allows us to simplify the boolean expression by
+		 * ignoring the old sign completely.
+		 */
+		assert ( ( sign == msb ) || carry );
+		sign = ( msb ^ carry );
+	}
 
-	/* Start profiling */
-	profile_start ( &bigint_mod_multiply_profiler );
+	/* Add N to make result positive if necessary */
+	if ( sign )
+		bigint_add ( modulus, result );
 
 	/* Sanity check */
-	assert ( sizeof ( *temp ) == bigint_mod_multiply_tmp_len ( modulus ) );
-
-	/* Perform multiplication */
-	profile_start ( &bigint_mod_multiply_multiply_profiler );
-	bigint_multiply ( multiplicand, multiplier, &temp->result );
-	profile_stop ( &bigint_mod_multiply_multiply_profiler );
-
-	/* Rescale modulus to match result */
-	profile_start ( &bigint_mod_multiply_rescale_profiler );
-	bigint_grow ( modulus, &temp->modulus );
-	rotation = ( bigint_max_set_bit ( &temp->result ) -
-		     bigint_max_set_bit ( &temp->modulus ) );
-	for ( i = 0 ; i < rotation ; i++ )
-		bigint_rol ( &temp->modulus );
-	profile_stop ( &bigint_mod_multiply_rescale_profiler );
-
-	/* Subtract multiples of modulus */
-	profile_start ( &bigint_mod_multiply_subtract_profiler );
-	for ( i = 0 ; i <= rotation ; i++ ) {
-		if ( bigint_is_geq ( &temp->result, &temp->modulus ) )
-			bigint_subtract ( &temp->modulus, &temp->result );
-		bigint_ror ( &temp->modulus );
+	assert ( ! bigint_is_geq ( result, modulus ) );
+}
+
+/**
+ * Compute inverse of odd big integer modulo any power of two
+ *
+ * @v invertend0	Element 0 of odd big integer to be inverted
+ * @v inverse0		Element 0 of big integer to hold result
+ * @v size		Number of elements in invertend and result
+ */
+void bigint_mod_invert_raw ( const bigint_element_t *invertend0,
+			     bigint_element_t *inverse0, unsigned int size ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*invertend = ( ( const void * ) invertend0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*inverse = ( ( void * ) inverse0 );
+	bigint_element_t accum;
+	bigint_element_t bit;
+	unsigned int i;
+
+	/* Sanity check */
+	assert ( bigint_bit_is_set ( invertend, 0 ) );
+
+	/* Initialise output */
+	memset ( inverse, 0xff, sizeof ( *inverse ) );
+
+	/* Compute inverse modulo 2^(width)
+	 *
+	 * This method is a lightly modified version of the pseudocode
+	 * presented in "A New Algorithm for Inversion mod p^k (Koç,
+	 * 2017)".
+	 *
+	 * Each inner loop iteration calculates one bit of the
+	 * inverse.  The residue value is the two's complement
+	 * negation of the value "b" as used by Koç, to allow for
+	 * division by two using a logical right shift (since we have
+	 * no arithmetic right shift operation for big integers).
+	 *
+	 * The residue is stored in the as-yet uncalculated portion of
+	 * the inverse.  The size of the residue therefore decreases
+	 * by one element for each outer loop iteration.  Trivial
+	 * inspection of the algorithm shows that any higher bits
+	 * could not contribute to the eventual output value, and so
+	 * we may safely reuse storage this way.
+	 *
+	 * Due to the suffix property of inverses mod 2^k, the result
+	 * represents the least significant bits of the inverse modulo
+	 * an arbitrarily large 2^k.
+	 */
+	for ( i = size ; i > 0 ; i-- ) {
+		const bigint_t ( i ) __attribute__ (( may_alias ))
+			*addend = ( ( const void * ) invertend );
+		bigint_t ( i ) __attribute__ (( may_alias ))
+			*residue = ( ( void * ) inverse );
+
+		/* Calculate one element's worth of inverse bits */
+		for ( accum = 0, bit = 1 ; bit ; bit <<= 1 ) {
+			if ( bigint_bit_is_set ( residue, 0 ) ) {
+				accum |= bit;
+				bigint_add ( addend, residue );
+			}
+			bigint_shr ( residue );
+		}
+
+		/* Store in the element no longer required to hold residue */
+		inverse->element[ i - 1 ] = accum;
+	}
+
+	/* Correct order of inverse elements */
+	for ( i = 0 ; i < ( size / 2 ) ; i++ ) {
+		accum = inverse->element[i];
+		inverse->element[i] = inverse->element[ size - 1 - i ];
+		inverse->element[ size - 1 - i ] = accum;
+	}
+}
+
+/**
+ * Perform relaxed Montgomery reduction (REDC) of a big integer
+ *
+ * @v modulus0		Element 0 of big integer odd modulus
+ * @v value0		Element 0 of big integer to be reduced
+ * @v result0		Element 0 of big integer to hold result
+ * @v size		Number of elements in modulus and result
+ * @ret carry		Carry out
+ *
+ * The value to be reduced will be made divisible by the size of the
+ * modulus while retaining its residue class (i.e. multiples of the
+ * modulus will be added until the low half of the value is zero).
+ *
+ * The result may be expressed as
+ *
+ *    tR = x + mN
+ *
+ * where x is the input value, N is the modulus, R=2^n (where n is the
+ * number of bits in the representation of the modulus, including any
+ * leading zero bits), and m is the number of multiples of the modulus
+ * added to make the result tR divisible by R.
+ *
+ * The maximum addend is mN <= (R-1)*N (and such an m can be proven to
+ * exist since N is limited to being odd and therefore coprime to R).
+ *
+ * Since the result of this addition is one bit larger than the input
+ * value, a carry out bit is also returned.  The caller may be able to
+ * prove that the carry out is always zero, in which case it may be
+ * safely ignored.
+ *
+ * The upper half of the output value (i.e. t) will also be copied to
+ * the result pointer.  It is permissible for the result pointer to
+ * overlap the lower half of the input value.
+ *
+ * External knowledge of constraints on the modulus and the input
+ * value may be used to prove constraints on the result.  The
+ * constraint on the modulus may be generally expressed as
+ *
+ *    R > kN
+ *
+ * for some positive integer k.  The value k=1 is allowed, and simply
+ * expresses that the modulus fits within the number of bits in its
+ * own representation.
+ *
+ * For classic Montgomery reduction, we have k=1, i.e. R > N and a
+ * separate constraint that the input value is in the range x < RN.
+ * This gives the result constraint
+ *
+ *    tR < RN + (R-1)N
+ *       < 2RN - N
+ *       < 2RN
+ *     t < 2N
+ *
+ * A single subtraction of the modulus may therefore be required to
+ * bring it into the range t < N.
+ *
+ * When the input value is known to be a product of two integers A and
+ * B, with A < aN and B < bN, we get the result constraint
+ *
+ *    tR < abN^2 + (R-1)N
+ *       < (ab/k)RN + RN - N
+ *       < (1 + ab/k)RN
+ *     t < (1 + ab/k)N
+ *
+ * If we have k=a=b=1, i.e. R > N with A < N and B < N, then the
+ * result is in the range t < 2N and may require a single subtraction
+ * of the modulus to bring it into the range t < N so that it may be
+ * used as an input on a subsequent iteration.
+ *
+ * If we have k=4 and a=b=2, i.e. R > 4N with A < 2N and B < 2N, then
+ * the result is in the range t < 2N and may immediately be used as an
+ * input on a subsequent iteration, without requiring a subtraction.
+ *
+ * Larger values of k may be used to allow for larger values of a and
+ * b, which can be useful to elide intermediate reductions in a
+ * calculation chain that involves additions and subtractions between
+ * multiplications (as used in elliptic curve point addition, for
+ * example).  As a general rule: each intermediate addition or
+ * subtraction will require k to be doubled.
+ *
+ * When the input value is known to be a single integer A, with A < aN
+ * (as used when converting out of Montgomery form), we get the result
+ * constraint
+ *
+ *    tR < aN + (R-1)N
+ *       < RN + (a-1)N
+ *
+ * If we have a=1, i.e. A < N, then the constraint becomes
+ *
+ *    tR < RN
+ *     t < N
+ *
+ * and so the result is immediately in the range t < N with no
+ * subtraction of the modulus required.
+ *
+ * For any larger value of a, the result value t=N becomes possible.
+ * Additional external knowledge may potentially be used to prove that
+ * t=N cannot occur.  For example: if the caller is performing modular
+ * exponentiation with a prime modulus (or, more generally, a modulus
+ * that is coprime to the base), then there is no way for a non-zero
+ * base value to end up producing an exact multiple of the modulus.
+ * If t=N cannot be disproved, then conversion out of Montgomery form
+ * may require an additional subtraction of the modulus.
+ */
+int bigint_montgomery_relaxed_raw ( const bigint_element_t *modulus0,
+				    bigint_element_t *value0,
+				    bigint_element_t *result0,
+				    unsigned int size ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*modulus = ( ( const void * ) modulus0 );
+	union {
+		bigint_t ( size * 2 ) full;
+		struct {
+			bigint_t ( size ) low;
+			bigint_t ( size ) high;
+		} __attribute__ (( packed ));
+	} __attribute__ (( may_alias )) *value = ( ( void * ) value0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	static bigint_t ( 1 ) cached;
+	static bigint_t ( 1 ) negmodinv;
+	bigint_element_t multiple;
+	bigint_element_t carry;
+	unsigned int i;
+	unsigned int j;
+	int overflow;
+
+	/* Sanity checks */
+	assert ( bigint_bit_is_set ( modulus, 0 ) );
+
+	/* Calculate inverse (or use cached version) */
+	if ( cached.element[0] != modulus->element[0] ) {
+		bigint_mod_invert ( modulus, &negmodinv );
+		negmodinv.element[0] = -negmodinv.element[0];
+		cached.element[0] = modulus->element[0];
+	}
+
+	/* Perform multiprecision Montgomery reduction */
+	for ( i = 0 ; i < size ; i++ ) {
+
+		/* Determine scalar multiple for this round */
+		multiple = ( value->low.element[i] * negmodinv.element[0] );
+
+		/* Multiply value to make it divisible by 2^(width*i) */
+		carry = 0;
+		for ( j = 0 ; j < size ; j++ ) {
+			bigint_multiply_one ( multiple, modulus->element[j],
+					      &value->full.element[ i + j ],
+					      &carry );
+		}
+
+		/* Since value is now divisible by 2^(width*i), we
+		 * know that the current low element must have been
+		 * zeroed.
+		 */
+		assert ( value->low.element[i] == 0 );
+
+		/* Store the multiplication carry out in the result,
+		 * avoiding the need to immediately propagate the
+		 * carry through the remaining elements.
+		 */
+		result->element[i] = carry;
 	}
-	profile_stop ( &bigint_mod_multiply_subtract_profiler );
 
-	/* Resize result */
-	bigint_shrink ( &temp->result, result );
+	/* Add the accumulated carries */
+	overflow = bigint_add ( result, &value->high );
+
+	/* Copy to result buffer */
+	bigint_copy ( &value->high, result );
+
+	return overflow;
+}
+
+/**
+ * Perform classic Montgomery reduction (REDC) of a big integer
+ *
+ * @v modulus0		Element 0 of big integer odd modulus
+ * @v value0		Element 0 of big integer to be reduced
+ * @v result0		Element 0 of big integer to hold result
+ * @v size		Number of elements in modulus and result
+ */
+void bigint_montgomery_raw ( const bigint_element_t *modulus0,
+			     bigint_element_t *value0,
+			     bigint_element_t *result0,
+			     unsigned int size ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*modulus = ( ( const void * ) modulus0 );
+	union {
+		bigint_t ( size * 2 ) full;
+		struct {
+			bigint_t ( size ) low;
+			bigint_t ( size ) high;
+		} __attribute__ (( packed ));
+	} __attribute__ (( may_alias )) *value = ( ( void * ) value0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	int overflow;
+	int underflow;
 
 	/* Sanity check */
-	assert ( bigint_is_geq ( modulus, result ) );
+	assert ( ! bigint_is_geq ( &value->high, modulus ) );
+
+	/* Perform relaxed Montgomery reduction */
+	overflow = bigint_montgomery_relaxed ( modulus, &value->full, result );
 
-	/* Stop profiling */
-	profile_stop ( &bigint_mod_multiply_profiler );
+	/* Conditionally subtract the modulus once */
+	underflow = bigint_subtract ( modulus, result );
+	bigint_swap ( result, &value->high, ( underflow & ~overflow ) );
+
+	/* Sanity check */
+	assert ( ! bigint_is_geq ( result, modulus ) );
+}
+
+/**
+ * Perform generalised exponentiation via a Montgomery ladder
+ *
+ * @v result0		Element 0 of result (initialised to identity element)
+ * @v multiple0		Element 0 of multiple (initialised to generator)
+ * @v size		Number of elements in result and multiple
+ * @v exponent0		Element 0 of exponent
+ * @v exponent_size	Number of elements in exponent
+ * @v op		Montgomery ladder commutative operation
+ * @v ctx		Operation context (if needed)
+ * @v tmp		Temporary working space (if needed)
+ *
+ * The Montgomery ladder may be used to perform any operation that is
+ * isomorphic to exponentiation, i.e. to compute the result
+ *
+ *    r = g^e = g * g * g * g * .... * g
+ *
+ * for an arbitrary commutative operation "*", generator "g" and
+ * exponent "e".
+ *
+ * The result "r" is computed in constant time (assuming that the
+ * underlying operation is constant time) in k steps, where k is the
+ * number of bits in the big integer representation of the exponent.
+ *
+ * The result "r" must be initialised to the operation's identity
+ * element, and the multiple must be initialised to the generator "g".
+ * On exit, the multiple will contain
+ *
+ *    m = r * g = g^(e+1)
+ *
+ * Note that the terminology used here refers to exponentiation
+ * defined as repeated multiplication, but that the ladder may equally
+ * well be used to perform any isomorphic operation (such as
+ * multiplication defined as repeated addition).
+ */
+void bigint_ladder_raw ( bigint_element_t *result0,
+			 bigint_element_t *multiple0, unsigned int size,
+			 const bigint_element_t *exponent0,
+			 unsigned int exponent_size, bigint_ladder_op_t *op,
+			 const void *ctx, void *tmp ) {
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*multiple = ( ( void * ) multiple0 );
+	const bigint_t ( exponent_size ) __attribute__ (( may_alias ))
+		*exponent = ( ( const void * ) exponent0 );
+	const unsigned int width = ( 8 * sizeof ( bigint_element_t ) );
+	unsigned int bit = ( exponent_size * width );
+	int toggle = 0;
+	int previous;
+
+	/* We have two physical storage locations: Rres (the "result"
+	 * register) and Rmul (the "multiple" register).
+	 *
+	 * On entry we have:
+	 *
+	 *   Rres = g^0 = 1  (the identity element)
+	 *   Rmul = g        (the generator)
+	 *
+	 * For calculation purposes, define two virtual registers R[0]
+	 * and R[1], mapped to the underlying physical storage
+	 * locations via a boolean toggle state "t":
+	 *
+	 *   R[t]  -> Rres
+	 *   R[~t] -> Rmul
+	 *
+	 * Define the initial toggle state to be t=0, so that we have:
+	 *
+	 *   R[0] = g^0 = 1  (the identity element)
+	 *   R[1] = g        (the generator)
+	 *
+	 * The Montgomery ladder then iterates over each bit "b" of
+	 * the exponent "e" (from MSB down to LSB), computing:
+	 *
+	 *   R[1]  := R[0] * R[1],    R[0]  := R[0] * R[0]    (if b=0)
+	 *   R[0]  := R[1] * R[0],    R[1]  := R[1] * R[1]    (if b=1)
+	 *
+	 * i.e.
+	 *
+	 *   R[~b] := R[b] * R[~b],   R[b]  := R[b] * R[b]
+	 *
+	 * To avoid data-dependent memory access patterns, we
+	 * implement this as:
+	 *
+	 *   Rmul := Rres * Rmul
+	 *   Rres := Rres * Rres
+	 *
+	 * and update the toggle state "t" (via constant-time
+	 * conditional swaps) as needed to ensure that R[b] always
+	 * maps to Rres and R[~b] always maps to Rmul.
+	 */
+	while ( bit-- ) {
+
+		/* Update toggle state t := b
+		 *
+		 * If the new toggle state is not equal to the old
+		 * toggle state, then we must swap R[0] and R[1] (in
+		 * constant time).
+		 */
+		previous = toggle;
+		toggle = bigint_bit_is_set ( exponent, bit );
+		bigint_swap ( result, multiple, ( toggle ^ previous ) );
+
+		/* Calculate Rmul = R[~b] := R[b] * R[~b] = Rres * Rmul */
+		op ( result->element, multiple->element, size, ctx, tmp );
+
+		/* Calculate Rres = R[b]  := R[b] * R[b]  = Rres * Rres */
+		op ( result->element, result->element, size, ctx, tmp );
+	}
+
+	/* Reset toggle state to zero */
+	bigint_swap ( result, multiple, toggle );
+}
+
+/**
+ * Perform modular multiplication as part of a Montgomery ladder
+ *
+ * @v operand		Element 0 of first input operand (may overlap result)
+ * @v result		Element 0 of second input operand and result
+ * @v size		Number of elements in operands and result
+ * @v ctx		Operation context (odd modulus, or NULL)
+ * @v tmp		Temporary working space
+ */
+void bigint_mod_exp_ladder ( const bigint_element_t *multiplier0,
+			     bigint_element_t *result0, unsigned int size,
+			     const void *ctx, void *tmp ) {
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*multiplier = ( ( const void * ) multiplier0 );
+	bigint_t ( size ) __attribute__ (( may_alias ))
+		*result = ( ( void * ) result0 );
+	const bigint_t ( size ) __attribute__ (( may_alias ))
+		*modulus = ( ( const void * ) ctx );
+	bigint_t ( size * 2 ) __attribute__ (( may_alias ))
+		*product = ( ( void * ) tmp );
+
+	/* Multiply and reduce */
+	bigint_multiply ( result, multiplier, product );
+	if ( modulus ) {
+		bigint_montgomery ( modulus, product, result );
+	} else {
+		bigint_shrink ( product, result );
+	}
 }
 
 /**
@@ -169,25 +763,108 @@ void bigint_mod_exp_raw ( const bigint_element_t *base0,
 		*exponent = ( ( const void * ) exponent0 );
 	bigint_t ( size ) __attribute__ (( may_alias )) *result =
 		( ( void * ) result0 );
-	size_t mod_multiply_len = bigint_mod_multiply_tmp_len ( modulus );
+	const unsigned int width = ( 8 * sizeof ( bigint_element_t ) );
 	struct {
-		bigint_t ( size ) base;
-		bigint_t ( exponent_size ) exponent;
-		uint8_t mod_multiply[mod_multiply_len];
+		struct {
+			bigint_t ( size ) modulus;
+			bigint_t ( size ) stash;
+		};
+		union {
+			bigint_t ( 2 * size ) full;
+			bigint_t ( size ) low;
+		} product;
 	} *temp = tmp;
-	static const uint8_t start[1] = { 0x01 };
+	const uint8_t one[1] = { 1 };
+	bigint_element_t submask;
+	unsigned int subsize;
+	unsigned int scale;
+	unsigned int i;
 
-	memcpy ( &temp->base, base, sizeof ( temp->base ) );
-	memcpy ( &temp->exponent, exponent, sizeof ( temp->exponent ) );
-	bigint_init ( result, start, sizeof ( start ) );
+	/* Sanity check */
+	assert ( sizeof ( *temp ) == bigint_mod_exp_tmp_len ( modulus ) );
 
-	while ( ! bigint_is_zero ( &temp->exponent ) ) {
-		if ( bigint_bit_is_set ( &temp->exponent, 0 ) ) {
-			bigint_mod_multiply ( result, &temp->base, modulus,
-					      result, temp->mod_multiply );
-		}
-		bigint_ror ( &temp->exponent );
-		bigint_mod_multiply ( &temp->base, &temp->base, modulus,
-				      &temp->base, temp->mod_multiply );
+	/* Handle degenerate case of zero modulus */
+	if ( ! bigint_max_set_bit ( modulus ) ) {
+		memset ( result, 0, sizeof ( *result ) );
+		return;
+	}
+
+	/* Factor modulus as (N * 2^scale) where N is odd */
+	bigint_copy ( modulus, &temp->modulus );
+	for ( scale = 0 ; ( ! bigint_bit_is_set ( &temp->modulus, 0 ) ) ;
+	      scale++ ) {
+		bigint_shr ( &temp->modulus );
+	}
+	subsize = ( ( scale + width - 1 ) / width );
+	submask = ( ( 1UL << ( scale % width ) ) - 1 );
+	if ( ! submask )
+		submask = ~submask;
+
+	/* Calculate (R^2 mod N) */
+	bigint_reduce ( &temp->modulus, &temp->stash );
+
+	/* Initialise result = Montgomery(1, R^2 mod N) */
+	bigint_grow ( &temp->stash, &temp->product.full );
+	bigint_montgomery ( &temp->modulus, &temp->product.full, result );
+
+	/* Convert base into Montgomery form */
+	bigint_multiply ( base, &temp->stash, &temp->product.full );
+	bigint_montgomery ( &temp->modulus, &temp->product.full,
+			    &temp->stash );
+
+	/* Calculate x1 = base^exponent modulo N */
+	bigint_ladder ( result, &temp->stash, exponent, bigint_mod_exp_ladder,
+			&temp->modulus, &temp->product );
+
+	/* Convert back out of Montgomery form */
+	bigint_grow ( result, &temp->product.full );
+	bigint_montgomery_relaxed ( &temp->modulus, &temp->product.full,
+				    result );
+
+	/* Handle even moduli via Garner's algorithm */
+	if ( subsize ) {
+		const bigint_t ( subsize ) __attribute__ (( may_alias ))
+			*subbase = ( ( const void * ) base );
+		bigint_t ( subsize ) __attribute__ (( may_alias ))
+			*submodulus = ( ( void * ) &temp->modulus );
+		bigint_t ( subsize ) __attribute__ (( may_alias ))
+			*substash = ( ( void * ) &temp->stash );
+		bigint_t ( subsize ) __attribute__ (( may_alias ))
+			*subresult = ( ( void * ) result );
+		union {
+			bigint_t ( 2 * subsize ) full;
+			bigint_t ( subsize ) low;
+		} __attribute__ (( may_alias ))
+			*subproduct = ( ( void * ) &temp->product.full );
+
+		/* Calculate x2 = base^exponent modulo 2^k */
+		bigint_init ( substash, one, sizeof ( one ) );
+		bigint_copy ( subbase, submodulus );
+		bigint_ladder ( substash, submodulus, exponent,
+				bigint_mod_exp_ladder, NULL, subproduct );
+
+		/* Reconstruct N */
+		bigint_copy ( modulus, &temp->modulus );
+		for ( i = 0 ; i < scale ; i++ )
+			bigint_shr ( &temp->modulus );
+
+		/* Calculate N^-1 modulo 2^k */
+		bigint_mod_invert ( submodulus, &subproduct->low );
+		bigint_copy ( &subproduct->low, submodulus );
+
+		/* Calculate y = (x2 - x1) * N^-1 modulo 2^k */
+		bigint_subtract ( subresult, substash );
+		bigint_multiply ( substash, submodulus, &subproduct->full );
+		subproduct->low.element[ subsize - 1 ] &= submask;
+		bigint_grow ( &subproduct->low, &temp->stash );
+
+		/* Reconstruct N */
+		bigint_mod_invert ( submodulus, &subproduct->low );
+		bigint_copy ( &subproduct->low, submodulus );
+
+		/* Calculate x = x1 + N * y */
+		bigint_multiply ( &temp->modulus, &temp->stash,
+				  &temp->product.full );
+		bigint_add ( &temp->product.low, result );
 	}
 }