From 4558398e9514daaa0ce39a03c44009148d4e6cbb Mon Sep 17 00:00:00 2001
From: Nick Gasson
Date: Sat, 30 Dec 2023 22:37:35 +0000
Subject: [PATCH] Add optimised SHA1 implementation using SSE intrinsics

---
 configure.ac             |   6 +
 thirdparty/Makemodule.am |   6 +
 thirdparty/sha1.c        | 330 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 338 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index a94c1463..06fcdf62 100644
--- a/configure.ac
+++ b/configure.ac
@@ -122,6 +122,12 @@ case $target_cpu in
                           [Target supports SSE4.1 instructions])],
       [], [-Werror])
 
+    AX_CHECK_COMPILE_FLAG(
+      [-msha],
+      [AC_DEFINE_UNQUOTED([HAVE_SSE_SHA], [1],
+                          [Target supports SHA instructions])],
+      [], [-Werror])
+
     AX_CHECK_COMPILE_FLAG(
       [-mpopcnt],
       [AX_APPEND_FLAG([-mpopcnt], [EXTRA_CFLAGS])
diff --git a/thirdparty/Makemodule.am b/thirdparty/Makemodule.am
index 97fdbf88..227de695 100644
--- a/thirdparty/Makemodule.am
+++ b/thirdparty/Makemodule.am
@@ -35,3 +35,9 @@ endif
 if GNULIB_STRNDUP
 lib_libgnulib_a_SOURCES += thirdparty/strndup.c
 endif
+
+EXTRA_PROGRAMS += bin/sha1test
+
+bin_sha1test_SOURCES = thirdparty/sha1.c
+
+bin_sha1test_CFLAGS = -DSHA1TEST
diff --git a/thirdparty/sha1.c b/thirdparty/sha1.c
index c6248492..71e5355c 100644
--- a/thirdparty/sha1.c
+++ b/thirdparty/sha1.c
@@ -13,8 +13,15 @@
  */
 
 #include "sha1.h"
+#include "config.h"
 
+#include <assert.h>
 #include <string.h>
+#include <stdbool.h>
+
+#ifdef HAVE_SSE_SHA
+#include <immintrin.h>
+#endif
 
 #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
@@ -98,6 +105,258 @@ SHA1Transform(uint32_t state[5], const unsigned char buffer[64])
 	a = b = c = d = e = 0;
 }
 
+#ifdef HAVE_SSE_SHA
+/*
+ * sha1_process_x86 - Hash whole 64-byte blocks with the x86 SHA extensions
+ *
+ * Folds `data' into `state' 64 bytes at a time for as long as at least 64
+ * bytes remain; a trailing partial block is ignored.  Callers must first
+ * check that the running CPU implements the SHA instructions.
+ */
+__attribute__((target("sse4.1,sha")))
+static void sha1_process_x86(uint32_t state[5], const uint8_t data[], size_t length)
+{
+    /* Written and placed in public domain by Jeffrey Walton */
+    /* Based on code from Intel, and by Sean Gulley for */
+    /* the miTLS project. */
+
+    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
+    __m128i MSG0, MSG1, MSG2, MSG3;
+    /* Shuffle control that reverses the byte order of a 16-byte vector */
+    const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
+
+    /* Load initial values */
+    ABCD = _mm_loadu_si128((const __m128i*) state);
+    E0 = _mm_set_epi32(state[4], 0, 0, 0);
+    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);   /* reverse the four 32-bit lanes */
+
+    while (length >= 64)
+    {
+        /* Save current state */
+        ABCD_SAVE = ABCD;
+        E0_SAVE = E0;
+
+        /* Rounds 0-3 */
+        MSG0 = _mm_loadu_si128((const __m128i*)(data + 0));
+        MSG0 = _mm_shuffle_epi8(MSG0, MASK);
+        E0 = _mm_add_epi32(E0, MSG0);
+        E1 = ABCD;
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+
+        /* Rounds 4-7 */
+        MSG1 = _mm_loadu_si128((const __m128i*)(data + 16));
+        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+        E1 = _mm_sha1nexte_epu32(E1, MSG1);
+        E0 = ABCD;
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+
+        /* Rounds 8-11 */
+        MSG2 = _mm_loadu_si128((const __m128i*)(data + 32));
+        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+        E0 = _mm_sha1nexte_epu32(E0, MSG2);
+        E1 = ABCD;
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+        MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+        /* Rounds 12-15 */
+        MSG3 = _mm_loadu_si128((const __m128i*)(data + 48));
+        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
+        E1 = _mm_sha1nexte_epu32(E1, MSG3);
+        E0 = ABCD;
+        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+        MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+        /* Rounds 16-19 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG0);
+        E1 = ABCD;
+        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+        MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+        /* Rounds 20-23 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG1);
+        E0 = ABCD;
+        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+        MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+        /* Rounds 24-27 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG2);
+        E1 = ABCD;
+        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+        MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+        /* Rounds 28-31 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG3);
+        E0 = ABCD;
+        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+        MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+        /* Rounds 32-35 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG0);
+        E1 = ABCD;
+        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+        MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+        /* Rounds 36-39 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG1);
+        E0 = ABCD;
+        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+        MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+        /* Rounds 40-43 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG2);
+        E1 = ABCD;
+        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+        MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+        /* Rounds 44-47 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG3);
+        E0 = ABCD;
+        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+        MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+        /* Rounds 48-51 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG0);
+        E1 = ABCD;
+        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+        MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+        /* Rounds 52-55 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG1);
+        E0 = ABCD;
+        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+        MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+        /* Rounds 56-59 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG2);
+        E1 = ABCD;
+        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+        MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+        /* Rounds 60-63 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG3);
+        E0 = ABCD;
+        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+        MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+        /* Rounds 64-67 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG0);
+        E1 = ABCD;
+        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+        MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+        /* Rounds 68-71 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG1);
+        E0 = ABCD;
+        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+        MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+        /* Rounds 72-75 */
+        E0 = _mm_sha1nexte_epu32(E0, MSG2);
+        E1 = ABCD;
+        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+
+        /* Rounds 76-79 */
+        E1 = _mm_sha1nexte_epu32(E1, MSG3);
+        E0 = ABCD;
+        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+
+        /* Combine state */
+        E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
+        ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
+
+        data += 64;
+        length -= 64;
+    }
+
+    /* Save state */
+    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);   /* restore the original lane order */
+    _mm_storeu_si128((__m128i*) state, ABCD);
+    state[4] = _mm_extract_epi32(E0, 3);
+}
+#endif
+
+/*
+ * sha1_transform_generic - Hash `length' bytes of whole 64-byte blocks
+ *
+ * `length' must be a multiple of 64.  Uses sha1_process_x86 when it was
+ * compiled in and the running CPU has the SHA extensions, else SHA1Transform.
+ */
+__attribute__((always_inline))
+static inline void
+sha1_transform_generic(uint32_t state[5],
+                       const unsigned char *buffer,
+                       size_t length)
+{
+   assert(length % 64 == 0);
+
+#ifdef HAVE_SSE_SHA
+#if __GNUC__ >= 11
+   const bool have_sha = __builtin_cpu_supports("sha");
+#else
+   static int have_sha = -1;
+   if (have_sha == -1) {
+      /* Leaf 0 returns the highest supported basic leaf in EAX.  Leaf 7
+       * must not be queried on a CPU that does not implement it. */
+      unsigned a = 0, b, c = 0, d;
+      asm volatile ("cpuid"
+                    : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
+                    : "a"(a), "c"(c));
+      if (a < 7)
+         have_sha = 0;
+      else {
+         /* CPUID.(EAX=7,ECX=0):EBX bit 29 is the SHA feature flag */
+         a = 7;
+         c = 0;
+         asm volatile ("cpuid"
+                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
+                       : "a"(a), "c"(c));
+         have_sha = (b >> 29) & 1;
+      }
+   }
+#endif
+
+   if (have_sha) {
+      sha1_process_x86(state, buffer, length);
+      return;
+   }
+#endif
+
+   for (size_t i = 0; i < length; i += 64)
+      SHA1Transform(state, buffer + i);
+}
+
 /*
  * SHA1Init - Initialize new context
  */
@@ -119,7 +378,7 @@ SHA1Init(SHA1_CTX *context)
 void
 SHA1Update(SHA1_CTX *context, const unsigned char *data, size_t len)
 {
-	unsigned int i, j;
+	size_t i, j, chunksz;
 
 	j = context->count[0];
 	if ((context->count[0] += len << 3) < j)
@@ -127,9 +386,11 @@ SHA1Update(SHA1_CTX *context, const unsigned char *data, size_t len)
 	j = (j >> 3) & 63;
 	if ((j + len) > 63) {
 		memcpy(&context->buffer[j], data, (i = 64-j));
-		SHA1Transform(context->state, context->buffer);
-		for ( ; i + 63 < len; i += 64)
-			SHA1Transform(context->state, &data[i]);
+		sha1_transform_generic(context->state, context->buffer, 64);
+		/* size_t arithmetic: `len' may not fit in an unsigned int */
+		chunksz = (len - i) & ~(size_t)63;
+		sha1_transform_generic(context->state, &data[i], chunksz);
+		i += chunksz;
 		j = 0;
 	} else {
 		i = 0;
@@ -165,3 +426,64 @@ SHA1Final(unsigned char digest[SHA1_LEN], SHA1_CTX *context)
 		memset(context, '\0', sizeof (*context));
 	}
 }
+
+#ifdef SHA1TEST
+
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* Test driver: print the SHA1 digest of the file named by argv[1] in hex */
+int main(int argc, char **argv)
+{
+   if (argc != 2)
+      errx(1, "missing file argument");
+
+   int fd = open(argv[1], O_RDONLY);
+   if (fd < 0)
+      err(1, "open: %s", argv[1]);
+
+   /* Size the buffer from the open descriptor, not the path */
+   struct stat st;
+   if (fstat(fd, &st) != 0)
+      err(1, "stat");
+
+   const size_t size = st.st_size;
+
+   /* malloc(0) may return NULL, so always ask for at least one byte */
+   unsigned char *buf = malloc(size > 0 ? size : 1);
+   if (buf == NULL)
+      err(1, "malloc");
+
+   /* read may return fewer bytes than requested without it being an error */
+   size_t off = 0;
+   while (off < size) {
+      const ssize_t nread = read(fd, buf + off, size - off);
+      if (nread < 0)
+         err(1, "read: %s", argv[1]);
+      else if (nread == 0)
+         errx(1, "read: %s: unexpected end of file", argv[1]);
+      off += nread;
+   }
+
+   close(fd);
+
+   SHA1_CTX ctx;
+   SHA1Init(&ctx);
+   SHA1Update(&ctx, buf, size);
+
+   unsigned char hash[SHA1_LEN];
+   SHA1Final(hash, &ctx);
+
+   for (int i = 0; i < SHA1_LEN; i++)
+      printf("%02x", hash[i]);
+   printf("\n");
+
+   free(buf);
+   return 0;
+}
+
+#endif  // SHA1TEST
-- 
2.39.2