From daa782c0d7283a4469b696d52c4a8a4d8d0b517f Mon Sep 17 00:00:00 2001
From: Nick Gasson
Date: Fri, 5 Jan 2024 17:39:15 +0000
Subject: [PATCH] Add accelerated SHA1 using ARMv8 crypto extension

---
 configure.ac      |   9 +++
 thirdparty/sha1.c | 200 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 06fcdf62..7321cf87 100644
--- a/configure.ac
+++ b/configure.ac
@@ -135,6 +135,15 @@ case $target_cpu in
         [Target supports POPCNT instructions])],
       [], [-Werror])
     ;;
+
+  aarch64)
+    AX_CHECK_COMPILE_FLAG(
+      [-march=armv8-a+crypto],
+      [AC_DEFINE_UNQUOTED([HAVE_ARM_CRYPTO], [1],
+        [Target supports ARMv8 crypto instructions])],
+      [], [-Werror])
+    AC_CHECK_HEADERS([arm_neon.h])
+    ;;
 esac
 
 AX_GCC_FUNC_ATTRIBUTE([returns_nonnull])
diff --git a/thirdparty/sha1.c b/thirdparty/sha1.c
index 71e5355c..40946a4f 100644
--- a/thirdparty/sha1.c
+++ b/thirdparty/sha1.c
@@ -23,6 +23,10 @@
 #include 
 #endif
 
+#ifdef HAVE_ARM_CRYPTO
+#include <arm_neon.h>
+#endif
+
 #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
 /*
@@ -299,6 +303,194 @@ static void sha1_process_x86(uint32_t state[5], const uint8_t data[], size_t len
 }
 #endif
 
+#ifdef HAVE_ARM_CRYPTO
+#ifdef __clang__
+__attribute__((target("crypto")))
+#else
+__attribute__((target("+crypto")))
+#endif
+static void sha1_process_arm(uint32_t state[5], const uint8_t data[], uint32_t length)
+{
+   /* Written and placed in public domain by Jeffrey Walton */
+   /* Based on code from ARM, and by Johannes Schneiders, Skip */
+   /* Hovsmith and Barry O'Rourke for the mbedTLS project. */
+
+   uint32x4_t ABCD, ABCD_SAVED;
+   uint32x4_t TMP0, TMP1;
+   uint32x4_t MSG0, MSG1, MSG2, MSG3;
+   uint32_t E0, E0_SAVED, E1;
+
+   /* Load state */
+   ABCD = vld1q_u32(&state[0]);
+   E0 = state[4];
+
+   while (length >= 64)
+   {
+      /* Save state */
+      ABCD_SAVED = ABCD;
+      E0_SAVED = E0;
+
+      /* Load message */
+      MSG0 = vld1q_u32((const uint32_t*)(data));
+      MSG1 = vld1q_u32((const uint32_t*)(data + 16));
+      MSG2 = vld1q_u32((const uint32_t*)(data + 32));
+      MSG3 = vld1q_u32((const uint32_t*)(data + 48));
+
+      /* Reverse for little endian */
+      MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+      MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+      MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+      MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+      TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
+      TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999));
+
+      /* Rounds 0-3 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999));
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      /* Rounds 4-7 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999));
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      /* Rounds 8-11 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      /* Rounds 12-15 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      /* Rounds 16-19 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1));
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      /* Rounds 20-23 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1));
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      /* Rounds 24-27 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1));
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      /* Rounds 28-31 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      /* Rounds 32-35 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      /* Rounds 36-39 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC));
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      /* Rounds 40-43 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC));
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      /* Rounds 44-47 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC));
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      /* Rounds 48-51 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      /* Rounds 52-55 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      /* Rounds 56-59 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6));
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      /* Rounds 60-63 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6));
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      /* Rounds 64-67 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6));
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      /* Rounds 68-71 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+      /* Rounds 72-75 */
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+      /* Rounds 76-79 */
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+      /* Combine state */
+      E0 += E0_SAVED;
+      ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+
+      data += 64;
+      length -= 64;
+   }
+
+   /* Save state */
+   vst1q_u32(&state[0], ABCD);
+   state[4] = E0;
+}
+#endif
+
 __attribute__((always_inline))
 static inline void
 sha1_transform_generic(uint32_t state[5],
@@ -327,6 +519,11 @@ sha1_transform_generic(uint32_t state[5],
    }
 #endif
 
+#ifdef HAVE_ARM_CRYPTO
+   sha1_process_arm(state, buffer, length);
+   return;
+#endif
+
    for (int i = 0; i < length; i += 64)
       SHA1Transform(state, buffer + i);
 }
@@ -407,6 +604,7 @@ SHA1Final(unsigned char digest[SHA1_LEN], SHA1_CTX *context)
 #include 
 #include 
 #include 
+#include 
 
 int main(int argc, char **argv)
 {
@@ -417,7 +615,7 @@ int main(int argc, char **argv)
    if (stat(argv[1], &st) != 0)
       err(1, "stat");
 
-   char *buf = malloc(st.st_size);
+   unsigned char *buf = malloc(st.st_size);
 
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0)
-- 
2.39.2