From 5ed8388f6b5f549ee208854eecf828dcdb60834b Mon Sep 17 00:00:00 2001 From: KonstantinKondrashov Date: Sat, 19 Oct 2019 18:03:18 +0800 Subject: [PATCH 1/2] mbedtls: Add Montgomery exponentiation implementation (HAC 14.94) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It gives better performance for RSA operations (2~11 times faster). The old modexp implementation (Z = X ^ Y mod M) loaded all the data into the hw registers and waited for completion, but due to the hardware RSA implementation, the calculations always started with 4096 bits, which took a lot of time. Measurement results (measurements were made for keys: 2048, 3072 and 4096 bits) (Old) - Sliding-window exponentiation (HAC 14.85): keysize = 2048 bits RSA key operation (performance): public [93206 us], private [280189 us] keysize = 3072 bits RSA key operation (performance): public [293614 us], private [858157 us] keysize = 4096 bits RSA key operation (performance): public [653192 us], private [1912126 us] Instead of (Old) - Sliding-window exponentiation (HAC 14.85), (New) - Montgomery exponentiation (HAC 14.94) was implemented, which showed better performance for both private and public key operations. keysize = 2048 bits RSA key operation (performance): public [14504 us], private [149456 us] keysize = 3072 bits RSA key operation (performance): public [35073 us], private [392743 us] keysize = 4096 bits RSA key operation (performance): public [58650 us], private [787186 us] For this reason, the old implementation was removed and the MBEDTLS_HARDWARE_MPI option was turned on by default. Why the MPI_INTERRUPT option is removed: the old implementation ran the whole calculation on the hardware, which took a lot of time (10ms - 500ms). In order not to stand idle while waiting for completion, an interrupt option was added. This made it possible to carry out other tasks during the calculation while the calling task was blocked. 
The new method is free from such a drawback and the maximum duration of one RSA HW operation does not exceed 70us (usually 2-70 μs). This option is no longer needed. Closes: IDF-965 --- components/mbedtls/Kconfig | 12 +- components/mbedtls/port/esp_bignum.c | 180 ++++++++++++++++--------- tools/ldgen/samples/sdkconfig | 1 - tools/unit-test-app/sdkconfig.defaults | 2 - 4 files changed, 120 insertions(+), 75 deletions(-) diff --git a/components/mbedtls/Kconfig b/components/mbedtls/Kconfig index 89bfa9abe..4a4a491d7 100644 --- a/components/mbedtls/Kconfig +++ b/components/mbedtls/Kconfig @@ -140,7 +140,7 @@ menu "mbedTLS" config MBEDTLS_HARDWARE_MPI bool "Enable hardware MPI (bignum) acceleration" - default n + default y help Enable hardware accelerated multiple precision integer operations. @@ -149,16 +149,6 @@ menu "mbedTLS" These operations are used by RSA. - config MBEDTLS_MPI_USE_INTERRUPT - bool "Use interrupt for MPI operations" - depends on MBEDTLS_HARDWARE_MPI - default n - help - Use an interrupt to coordinate MPI operations. - - This allows other code to run on the CPU while an MPI operation is pending. - Otherwise the CPU busy-waits. 
- config MBEDTLS_HARDWARE_SHA bool "Enable hardware SHA acceleration" default n diff --git a/components/mbedtls/port/esp_bignum.c b/components/mbedtls/port/esp_bignum.c index 09bb774b9..5e6fc90f3 100644 --- a/components/mbedtls/port/esp_bignum.c +++ b/components/mbedtls/port/esp_bignum.c @@ -60,29 +60,6 @@ static const __attribute__((unused)) char *TAG = "bignum"; #define ciL (sizeof(mbedtls_mpi_uint)) /* chars in limb */ #define biL (ciL << 3) /* bits in limb */ -#if defined(CONFIG_MBEDTLS_MPI_USE_INTERRUPT) -static SemaphoreHandle_t op_complete_sem; - -static IRAM_ATTR void rsa_complete_isr(void *arg) -{ - BaseType_t higher_woken; - DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1); - xSemaphoreGiveFromISR(op_complete_sem, &higher_woken); - if (higher_woken) { - portYIELD_FROM_ISR(); - } -} - -static void rsa_isr_initialise(void) -{ - if (op_complete_sem == NULL) { - op_complete_sem = xSemaphoreCreateBinary(); - esp_intr_alloc(ETS_RSA_INTR_SOURCE, 0, rsa_complete_isr, NULL, NULL); - } -} - -#endif /* CONFIG_MBEDTLS_MPI_USE_INTERRUPT */ - static _lock_t mpi_lock; void esp_mpi_acquire_hardware( void ) @@ -96,10 +73,6 @@ void esp_mpi_acquire_hardware( void ) while(DPORT_REG_READ(RSA_CLEAN_REG) != 1); // Note: from enabling RSA clock to here takes about 1.3us - -#ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT - rsa_isr_initialise(); -#endif } void esp_mpi_release_hardware( void ) @@ -264,20 +237,11 @@ static inline void start_op(uint32_t op_reg) */ static inline void wait_op_complete(uint32_t op_reg) { -#ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT - if (!xSemaphoreTake(op_complete_sem, 2000 / portTICK_PERIOD_MS)) { - ESP_LOGE(TAG, "Timed out waiting for RSA operation (op_reg 0x%x int_reg 0x%x)", - op_reg, DPORT_REG_READ(RSA_INTERRUPT_REG)); - abort(); /* indicates a fundamental problem with driver */ - } -#else while(DPORT_REG_READ(RSA_INTERRUPT_REG) != 1) { } /* clear the interrupt */ DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1); -#endif - } /* Sub-stages of modulo multiplication/exponentiation 
operations */ @@ -335,8 +299,124 @@ int esp_mpi_mul_mpi_mod(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi #if defined(MBEDTLS_MPI_EXP_MOD_ALT) +static int mont(mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi* Y, const mbedtls_mpi* M, + mbedtls_mpi_uint Mprime, + size_t hw_words, + bool again) +{ + // Montgomery multiplication on the RSA hardware (HAC 14.36). Note Z may be the same pointer as X or Y + int ret = 0; + + // montgomery mult prepare: M, M' and the operand length are only loaded on the first call (again == false) + if (again == false) { + mpi_to_mem_block(RSA_MEM_M_BLOCK_BASE, M, hw_words); + DPORT_REG_WRITE(RSA_M_DASH_REG, Mprime); + DPORT_REG_WRITE(RSA_MULT_MODE_REG, hw_words / 16 - 1); + } + + mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, X, hw_words); + mpi_to_mem_block(RSA_MEM_RB_BLOCK_BASE, Y, hw_words); + + start_op(RSA_MULT_START_REG); + + MBEDTLS_MPI_CHK( mbedtls_mpi_grow(Z, hw_words) ); /* done between start and wait, so it overlaps the hardware operation */ + + wait_op_complete(RSA_MULT_START_REG); + + /* Read back the result */ + mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, hw_words); + + /* from HAC 14.36 - 3. If Z >= M then Z = Z - M */ + if (mbedtls_mpi_cmp_mpi(Z, M) >= 0) { + MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(Z, Z, M)); + } + cleanup: + return ret; +} + /* - * Sliding-window exponentiation: Z = X^Y mod M (HAC 14.85) + * Return the bit index of the most significant one-bit of X (0 if X is NULL, has no limbs, or is zero). 
+ */ +static size_t mbedtls_mpi_msb( const mbedtls_mpi* X ) +{ + int i, j; + if (X != NULL && X->n != 0) { + for (i = X->n - 1; i >= 0; i--) { + if (X->p[i] != 0) { + for (j = biL - 1; j >= 0; j--) { + if ((X->p[i] & ((mbedtls_mpi_uint) 1 << j)) != 0) { /* shift a limb-wide 1, not an int: 1 << 31 is undefined */ + return (i * biL) + j; + } + } + } + } + } + return 0; +} + +/* + * Montgomery exponentiation: Z = X ^ Y mod M (HAC 14.94). Rinv is used as R^2 mod M; returns 0 on success, otherwise the error code of the failing step. + */ +static int mpi_montgomery_exp_calc( mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi* Y, const mbedtls_mpi* M, + mbedtls_mpi* Rinv, + size_t hw_words, + mbedtls_mpi_uint Mprime ) +{ + int ret = 0; + mbedtls_mpi X_, one; + + mbedtls_mpi_init(&X_); + mbedtls_mpi_init(&one); + if( ( ( ret = mbedtls_mpi_grow(&one, hw_words) ) != 0 ) || + ( ( ret = mbedtls_mpi_set_bit(&one, 0, 1) ) != 0 ) ) { + goto cleanup2; + } + + // Algorithm from HAC 14.94 + { + // 0 determine t (highest bit set in y) + int t = mbedtls_mpi_msb(Y); + + esp_mpi_acquire_hardware(); + + // 1.1 x_ = mont(x, R^2 mod m) + // = mont(x, rb) + MBEDTLS_MPI_CHK( mont(&X_, X, Rinv, M, Mprime, hw_words, false) ); + + // 1.2 z = R mod m + // now z = R mod m = Mont (R^2 mod m, 1) mod M (as Mont(x) = X*R^-1 mod M) + MBEDTLS_MPI_CHK( mont(Z, Rinv, &one, M, Mprime, hw_words, true) ); + + // 2 for i from t down to 0 + for (int i = t; i >= 0; i--) { + // 2.1 z = mont(z,z) + if (i != t) { // skip on the first iteration as z is still unity + MBEDTLS_MPI_CHK( mont(Z, Z, Z, M, Mprime, hw_words, true) ); + } + + // 2.2 if y[i] = 1 then z = mont(z, x_) - NOTE(review): this multiply depends on the exponent bits, so the loop is not constant-time + if (mbedtls_mpi_get_bit(Y, i)) { + MBEDTLS_MPI_CHK( mont(Z, Z, &X_, M, Mprime, hw_words, true) ); + } + } + + // 3 z = Mont(z, 1) + MBEDTLS_MPI_CHK( mont(Z, Z, &one, M, Mprime, hw_words, true) ); + } + + cleanup: + mbedtls_mpi_free(&X_); + mbedtls_mpi_free(&one); + esp_mpi_release_hardware(); + return ret; + + cleanup2: + mbedtls_mpi_free(&one); + return ret; +} + +/* + * Z = X ^ Y mod M * * _Rinv is optional pre-calculated version of Rinv (via calculate_rinv()). 
* @@ -389,30 +469,8 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi Mprime = modular_inverse(M); - esp_mpi_acquire_hardware(); - - /* "mode" register loaded with number of 512-bit blocks, minus 1 */ - DPORT_REG_WRITE(RSA_MODEXP_MODE_REG, (hw_words / 16) - 1); - - /* Load M, X, Rinv, M-prime (M-prime is mod 2^32) */ - mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, X, hw_words); - mpi_to_mem_block(RSA_MEM_Y_BLOCK_BASE, Y, hw_words); - mpi_to_mem_block(RSA_MEM_M_BLOCK_BASE, M, hw_words); - mpi_to_mem_block(RSA_MEM_RB_BLOCK_BASE, Rinv, hw_words); - DPORT_REG_WRITE(RSA_M_DASH_REG, Mprime); - - start_op(RSA_START_MODEXP_REG); - - /* X ^ Y may actually be shorter than M, but unlikely when used for crypto */ - if ((ret = mbedtls_mpi_grow(Z, m_words)) != 0) { - esp_mpi_release_hardware(); - goto cleanup; - } - - wait_op_complete(RSA_START_MODEXP_REG); - - mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, m_words); - esp_mpi_release_hardware(); + // Montgomery exponentiation: Z = X ^ Y mod M (HAC 14.94) + MBEDTLS_MPI_CHK( mpi_montgomery_exp_calc(Z, X, Y, M, Rinv, hw_words, Mprime) ); // Compensate for negative X if (X->s == -1 && (Y->p[0] & 1) != 0) { diff --git a/tools/ldgen/samples/sdkconfig b/tools/ldgen/samples/sdkconfig index 112abbec6..b28c4dd66 100644 --- a/tools/ldgen/samples/sdkconfig +++ b/tools/ldgen/samples/sdkconfig @@ -404,7 +404,6 @@ CONFIG_MBEDTLS_SSL_MAX_CONTENT_LEN=16384 CONFIG_MBEDTLS_DEBUG= CONFIG_MBEDTLS_HARDWARE_AES=y CONFIG_MBEDTLS_HARDWARE_MPI=y -CONFIG_MBEDTLS_MPI_USE_INTERRUPT=y CONFIG_MBEDTLS_HARDWARE_SHA= CONFIG_MBEDTLS_HAVE_TIME=y CONFIG_MBEDTLS_HAVE_TIME_DATE= diff --git a/tools/unit-test-app/sdkconfig.defaults b/tools/unit-test-app/sdkconfig.defaults index 947c23287..79469ddd3 100644 --- a/tools/unit-test-app/sdkconfig.defaults +++ b/tools/unit-test-app/sdkconfig.defaults @@ -13,8 +13,6 @@ CONFIG_FREERTOS_WATCHPOINT_END_OF_STACK=y CONFIG_FREERTOS_THREAD_LOCAL_STORAGE_POINTERS=3 CONFIG_FREERTOS_USE_TRACE_FACILITY=y 
CONFIG_HEAP_POISONING_COMPREHENSIVE=y -CONFIG_MBEDTLS_HARDWARE_MPI=y -CONFIG_MBEDTLS_MPI_USE_INTERRUPT=y CONFIG_MBEDTLS_HARDWARE_SHA=y CONFIG_SPI_FLASH_ENABLE_COUNTERS=y CONFIG_ESP32_ULP_COPROC_ENABLED=y From e8d3b80e4baa775903028c6341b7d84fa50d141b Mon Sep 17 00:00:00 2001 From: KonstantinKondrashov Date: Tue, 22 Oct 2019 18:22:02 +0800 Subject: [PATCH 2/2] mbedtls: Add an UT for performance RSA key operations (New) - Montgomery exponentiation: Z = X ^ Y mod M (HAC 14.94) keysize = 2048 bits RSA key operation (performance): public [21894 us], private [199119 us] RSA key operation (performance): public [18768 us], private [189051 us] RSA key operation (performance): public [16242 us], private [190821 us] keysize = 3072 bits RSA key operation (performance): public [39762 us], private [437480 us] RSA key operation (performance): public [36550 us], private [449422 us] RSA key operation (performance): public [40536 us], private [443451 us] keysize = 4096 bits RSA key operation (performance): public [65671 us], private [885215 us] RSA key operation (performance): public [60770 us], private [880936 us] RSA key operation (performance): public [68951 us], private [872027 us] (Old) - Sliding-window exponentiation: Z = X ^ Y mod M (HAC 14.85) keysize = 2048 bits RSA key operation (performance): public [93206 us], private [280189 us] RSA key operation (performance): public [93060 us], private [278893 us] RSA key operation (performance): public [97520 us], private [283252 us] keysize = 3072 bits RSA key operation (performance): public [293614 us], private [858157 us] RSA key operation (performance): public [289902 us], private [843701 us] RSA key operation (performance): public [291495 us], private [845232 us] keysize = 4096 bits RSA key operation (performance): public [653192 us], private [1912126 us] RSA key operation (performance): public [656661 us], private [1901792 us] RSA key operation (performance): public [641390 us], private [1938911 us] --- 
components/idf_test/include/idf_performance.h | 4 ++ components/mbedtls/test/test_rsa.c | 55 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/components/idf_test/include/idf_performance.h b/components/idf_test/include/idf_performance.h index 8a952c406..bb640cfcf 100644 --- a/components/idf_test/include/idf_performance.h +++ b/components/idf_test/include/idf_performance.h @@ -31,3 +31,7 @@ #define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_DIV 70 #define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_SQRT 140 +#define IDF_PERFORMANCE_MAX_RSA_2048KEY_PUBLIC_OP 19000 +#define IDF_PERFORMANCE_MAX_RSA_2048KEY_PRIVATE_OP 180000 +#define IDF_PERFORMANCE_MAX_RSA_4096KEY_PUBLIC_OP 65000 +#define IDF_PERFORMANCE_MAX_RSA_4096KEY_PRIVATE_OP 850000 diff --git a/components/mbedtls/test/test_rsa.c b/components/mbedtls/test/test_rsa.c index df6d8f238..80f36a469 100644 --- a/components/mbedtls/test/test_rsa.c +++ b/components/mbedtls/test/test_rsa.c @@ -11,11 +11,13 @@ #include "mbedtls/rsa.h" #include "mbedtls/pk.h" #include "mbedtls/x509_crt.h" +#include "mbedtls/entropy_poll.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "freertos/semphr.h" #include "unity.h" #include "sdkconfig.h" +#include "test_utils.h" /* Taken from openssl s_client -connect api.gigafive.com:443 -showcerts */ @@ -238,3 +240,56 @@ static void test_cert(const char *cert, const uint8_t *expected_output, size_t o mbedtls_x509_crt_free(&crt); } + +static int myrand(void *rng_state, unsigned char *output, size_t len) /* RNG callback passed to mbedtls_rsa_gen_key(); forwards to mbedtls_hardware_poll() */ +{ + size_t olen; /* required by mbedtls_hardware_poll(); not checked here */ + return mbedtls_hardware_poll(rng_state, output, len, &olen); +} + +#ifdef CONFIG_MBEDTLS_HARDWARE_MPI + +TEST_CASE("test performance RSA key operations", "[bignum][ignore]") /* round-trips a buffer through a freshly generated key and checks both timings against the IDF_PERFORMANCE_MAX_RSA_* limits */ +{ + mbedtls_rsa_context rsa; + unsigned char orig_buf[4096 / 8]; + unsigned char encrypted_buf[4096 / 8]; + unsigned char decrypted_buf[4096 / 8]; + int64_t start; + int public_perf, private_perf; + + printf("First, orig_buf is encrypted by the public key, and then decrypted by the 
private key\n"); + + for (int keysize = 2048; keysize <= 4096; keysize += 2048) { /* covers 2048- and 4096-bit keys only */ + memset(orig_buf, 0xAA, sizeof(orig_buf)); + orig_buf[0] = 0; // Zero the first byte to ensure that orig_buf is smaller than rsa.N (presumably it is read as a big-endian number; confirm) + + mbedtls_rsa_init(&rsa, MBEDTLS_RSA_PRIVATE, 0); + TEST_ASSERT_EQUAL(0, mbedtls_rsa_gen_key(&rsa, myrand, NULL, keysize, 65537)); + + TEST_ASSERT_EQUAL(keysize, (int)rsa.len * 8); + TEST_ASSERT_EQUAL(keysize, (int)rsa.D.n * sizeof(mbedtls_mpi_uint) * 8); // The private exponent D must also span the full key width + + start = esp_timer_get_time(); /* time the public-key operation */ + TEST_ASSERT_EQUAL(0, mbedtls_rsa_public(&rsa, orig_buf, encrypted_buf)); + public_perf = esp_timer_get_time() - start; + + start = esp_timer_get_time(); /* time the private-key operation on the encrypted buffer */ + TEST_ASSERT_EQUAL(0, mbedtls_rsa_private(&rsa, NULL, NULL, encrypted_buf, decrypted_buf)); + private_perf = esp_timer_get_time() - start; + + if (keysize == 2048) { + TEST_PERFORMANCE_LESS_THAN(RSA_2048KEY_PUBLIC_OP, "public operations %d us", public_perf); + TEST_PERFORMANCE_LESS_THAN(RSA_2048KEY_PRIVATE_OP, "private operations %d us", private_perf); + } else { + TEST_PERFORMANCE_LESS_THAN(RSA_4096KEY_PUBLIC_OP, "public operations %d us", public_perf); + TEST_PERFORMANCE_LESS_THAN(RSA_4096KEY_PRIVATE_OP, "private operations %d us", private_perf); + } + + TEST_ASSERT_EQUAL_MEMORY_MESSAGE(orig_buf, decrypted_buf, keysize / 8, "RSA operation"); /* the round trip must reproduce the input */ + + mbedtls_rsa_free(&rsa); + } +} + +#endif // CONFIG_MBEDTLS_HARDWARE_MPI \ No newline at end of file