diff options
Diffstat (limited to 'src/arch/arm64/core')
-rw-r--r--  src/arch/arm64/core/arm64_bigint.c | 107
-rw-r--r--  src/arch/arm64/core/arm64_string.c | 174
2 files changed, 100 insertions, 181 deletions
diff --git a/src/arch/arm64/core/arm64_bigint.c b/src/arch/arm64/core/arm64_bigint.c deleted file mode 100644 index 7740f1aef..000000000 --- a/src/arch/arm64/core/arm64_bigint.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * You can also choose to distribute this program under the terms of - * the Unmodified Binary Distribution Licence (as given in the file - * COPYING.UBDL), provided that you have satisfied its requirements. 
- */ - -FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); - -#include <stdint.h> -#include <string.h> -#include <ipxe/bigint.h> - -/** @file - * - * Big integer support - */ - -/** - * Multiply big integers - * - * @v multiplicand0 Element 0 of big integer to be multiplied - * @v multiplicand_size Number of elements in multiplicand - * @v multiplier0 Element 0 of big integer to be multiplied - * @v multiplier_size Number of elements in multiplier - * @v result0 Element 0 of big integer to hold result - */ -void bigint_multiply_raw ( const uint64_t *multiplicand0, - unsigned int multiplicand_size, - const uint64_t *multiplier0, - unsigned int multiplier_size, - uint64_t *result0 ) { - unsigned int result_size = ( multiplicand_size + multiplier_size ); - const bigint_t ( multiplicand_size ) __attribute__ (( may_alias )) - *multiplicand = ( ( const void * ) multiplicand0 ); - const bigint_t ( multiplier_size ) __attribute__ (( may_alias )) - *multiplier = ( ( const void * ) multiplier0 ); - bigint_t ( result_size ) __attribute__ (( may_alias )) - *result = ( ( void * ) result0 ); - unsigned int i; - unsigned int j; - uint64_t multiplicand_element; - uint64_t multiplier_element; - uint64_t *result_elements; - uint64_t discard_low; - uint64_t discard_high; - uint64_t discard_temp_low; - uint64_t discard_temp_high; - - /* Zero result */ - memset ( result, 0, sizeof ( *result ) ); - - /* Multiply integers one element at a time */ - for ( i = 0 ; i < multiplicand_size ; i++ ) { - multiplicand_element = multiplicand->element[i]; - for ( j = 0 ; j < multiplier_size ; j++ ) { - multiplier_element = multiplier->element[j]; - result_elements = &result->element[ i + j ]; - /* Perform a single multiply, and add the - * resulting double-element into the result, - * carrying as necessary. 
The carry can - * never overflow beyond the end of the - * result, since: - * - * a < 2^{n}, b < 2^{m} => ab < 2^{n+m} - */ - __asm__ __volatile__ ( "mul %1, %6, %7\n\t" - "umulh %2, %6, %7\n\t" - "ldp %3, %4, [%0]\n\t" - "adds %3, %3, %1\n\t" - "adcs %4, %4, %2\n\t" - "stp %3, %4, [%0], #16\n\t" - "bcc 2f\n\t" - "\n1:\n\t" - "ldr %3, [%0]\n\t" - "adcs %3, %3, xzr\n\t" - "str %3, [%0], #8\n\t" - "bcs 1b\n\t" - "\n2:\n\t" - : "+r" ( result_elements ), - "=&r" ( discard_low ), - "=&r" ( discard_high ), - "=r" ( discard_temp_low ), - "=r" ( discard_temp_high ), - "+m" ( *result ) - : "r" ( multiplicand_element ), - "r" ( multiplier_element ) - : "cc" ); - } - } -} diff --git a/src/arch/arm64/core/arm64_string.c b/src/arch/arm64/core/arm64_string.c index 28a2b73bc..07a7eefdf 100644 --- a/src/arch/arm64/core/arm64_string.c +++ b/src/arch/arm64/core/arm64_string.c @@ -31,6 +31,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); #include <string.h> +/** Block size (for "ldp"/"stp") */ +#define ARM64_STRING_BLKSZ 16 + /** * Copy memory area * @@ -40,59 +43,70 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); * @ret dest Destination address */ void arm64_memcpy ( void *dest, const void *src, size_t len ) { - void *discard_dest; - void *discard_end; - const void *discard_src; - size_t discard_offset; + size_t len_pre; + size_t len_mid; + size_t len_post; unsigned long discard_data; unsigned long discard_low; unsigned long discard_high; + unsigned long discard_len; - /* If length is too short for an "ldp"/"stp" instruction pair, - * then just copy individual bytes. + /* Calculate pre-aligned, aligned, and post-aligned lengths. + * (Align on the destination address, on the assumption that + * misaligned stores are likely to be more expensive than + * misaligned loads.) 
*/ - if ( len < 16 ) { - __asm__ __volatile__ ( "cbz %0, 2f\n\t" - "\n1:\n\t" - "sub %0, %0, #1\n\t" - "ldrb %w1, [%3, %0]\n\t" - "strb %w1, [%2, %0]\n\t" - "cbnz %0, 1b\n\t" - "\n2:\n\t" - : "=&r" ( discard_offset ), - "=&r" ( discard_data ) - : "r" ( dest ), "r" ( src ), "0" ( len ) - : "memory" ); - return; - } + len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) & + ( ARM64_STRING_BLKSZ - 1 ) ); + if ( len_pre > len ) + len_pre = len; + len -= len_pre; + len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) ); + len -= len_mid; + len_post = len; - /* Use "ldp"/"stp" to copy 16 bytes at a time: one initial - * potentially unaligned access, multiple destination-aligned - * accesses, one final potentially unaligned access. - */ - __asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t" - "stp %3, %4, [%0], #16\n\t" - "and %3, %0, #15\n\t" - "sub %0, %0, %3\n\t" - "sub %1, %1, %3\n\t" - "bic %2, %5, #15\n\t" - "b 2f\n\t" + /* Copy pre-aligned section */ + __asm__ __volatile__ ( "cbz %2, 2f\n\t" + "\n1:\n\t" + "ldrb %w3, [%1], #1\n\t" + "strb %w3, [%0], #1\n\t" + "sub %2, %2, #1\n\t" + "cbnz %2, 1b\n\t" + "\n2:\n\t" + : "+r" ( dest ), "+r" ( src ), + "=&r" ( discard_len ), + "=&r" ( discard_data ) + : "2" ( len_pre ) + : "memory" ); + + /* Copy aligned section */ + __asm__ __volatile__ ( "cbz %2, 2f\n\t" "\n1:\n\t" "ldp %3, %4, [%1], #16\n\t" "stp %3, %4, [%0], #16\n\t" + "sub %2, %2, #16\n\t" + "cbnz %2, 1b\n\t" "\n2:\n\t" - "cmp %0, %2\n\t" - "bne 1b\n\t" - "ldp %3, %4, [%6, #-16]\n\t" - "stp %3, %4, [%5, #-16]\n\t" - : "=&r" ( discard_dest ), - "=&r" ( discard_src ), - "=&r" ( discard_end ), + : "+r" ( dest ), "+r" ( src ), + "=&r" ( discard_len ), "=&r" ( discard_low ), "=&r" ( discard_high ) - : "r" ( dest + len ), "r" ( src + len ), - "0" ( dest ), "1" ( src ) - : "memory", "cc" ); + : "2" ( len_mid ) + : "memory" ); + + /* Copy post-aligned section */ + __asm__ __volatile__ ( "cbz %2, 2f\n\t" + "\n1:\n\t" + "ldrb %w3, [%1], #1\n\t" + "strb %w3, [%0], #1\n\t" + "sub 
%2, %2, #1\n\t" + "cbnz %2, 1b\n\t" + "\n2:\n\t" + : "+r" ( dest ), "+r" ( src ), + "=&r" ( discard_len ), + "=&r" ( discard_data ) + : "2" ( len_post ) + : "memory" ); } /** @@ -102,44 +116,56 @@ void arm64_memcpy ( void *dest, const void *src, size_t len ) { * @v len Length */ void arm64_bzero ( void *dest, size_t len ) { - size_t discard_offset; - void *discard_dest; - void *discard_end; + size_t len_pre; + size_t len_mid; + size_t len_post; + unsigned long discard_len; - /* If length is too short for an "stp" instruction, then just - * zero individual bytes. - */ - if ( len < 16 ) { - __asm__ __volatile__ ( "cbz %0, 2f\n\t" - "\n1:\n\t" - "sub %0, %0, #1\n\t" - "strb wzr, [%1, %0]\n\t" - "cbnz %0, 1b\n\t" - "\n2:\n\t" - : "=&r" ( discard_offset ) - : "r" ( dest ), "0" ( len ) - : "memory" ); - return; - } + /* Calculate pre-aligned, aligned, and post-aligned lengths */ + len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) & + ( ARM64_STRING_BLKSZ - 1 ) ); + if ( len_pre > len ) + len_pre = len; + len -= len_pre; + len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) ); + len -= len_mid; + len_post = len; - /* Use "stp" to zero 16 bytes at a time: one initial - * potentially unaligned access, multiple aligned accesses, - * one final potentially unaligned access. 
- */ - __asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t" - "bic %0, %0, #15\n\t" - "bic %1, %2, #15\n\t" - "b 2f\n\t" + /* Zero pre-aligned section */ + __asm__ __volatile__ ( "cbz %1, 2f\n\t" + "\n1:\n\t" + "strb wzr, [%0], #1\n\t" + "sub %1, %1, #1\n\t" + "cbnz %1, 1b\n\t" + "\n2:\n\t" + : "+r" ( dest ), + "=&r" ( discard_len ) + : "1" ( len_pre ) + : "memory" ); + + /* Zero aligned section */ + __asm__ __volatile__ ( "cbz %1, 2f\n\t" "\n1:\n\t" "stp xzr, xzr, [%0], #16\n\t" + "sub %1, %1, #16\n\t" + "cbnz %1, 1b\n\t" "\n2:\n\t" - "cmp %0, %1\n\t" - "bne 1b\n\t" - "stp xzr, xzr, [%2, #-16]\n\t" - : "=&r" ( discard_dest ), - "=&r" ( discard_end ) - : "r" ( dest + len ), "0" ( dest ) - : "memory", "cc" ); + : "+r" ( dest ), + "=&r" ( discard_len ) + : "1" ( len_mid ) + : "memory" ); + + /* Zero post-aligned section */ + __asm__ __volatile__ ( "cbz %1, 2f\n\t" + "\n1:\n\t" + "strb wzr, [%0], #1\n\t" + "sub %1, %1, #1\n\t" + "cbnz %1, 1b\n\t" + "\n2:\n\t" + : "+r" ( dest ), + "=&r" ( discard_len ) + : "1" ( len_post ) + : "memory" ); } /** |
