Diffstat (limited to 'src/arch/arm64/core')
-rw-r--r--  src/arch/arm64/core/arm64_bigint.c | 107
-rw-r--r--  src/arch/arm64/core/arm64_string.c | 174
2 files changed, 100 insertions(+), 181 deletions(-)
diff --git a/src/arch/arm64/core/arm64_bigint.c b/src/arch/arm64/core/arm64_bigint.c
deleted file mode 100644
index 7740f1aef..000000000
--- a/src/arch/arm64/core/arm64_bigint.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * You can also choose to distribute this program under the terms of
- * the Unmodified Binary Distribution Licence (as given in the file
- * COPYING.UBDL), provided that you have satisfied its requirements.
- */
-
-FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
-
-#include <stdint.h>
-#include <string.h>
-#include <ipxe/bigint.h>
-
-/** @file
- *
- * Big integer support
- */
-
-/**
- * Multiply big integers
- *
- * @v multiplicand0 Element 0 of big integer to be multiplied
- * @v multiplicand_size Number of elements in multiplicand
- * @v multiplier0 Element 0 of big integer to be multiplied
- * @v multiplier_size Number of elements in multiplier
- * @v result0 Element 0 of big integer to hold result
- */
-void bigint_multiply_raw ( const uint64_t *multiplicand0,
- unsigned int multiplicand_size,
- const uint64_t *multiplier0,
- unsigned int multiplier_size,
- uint64_t *result0 ) {
- unsigned int result_size = ( multiplicand_size + multiplier_size );
- const bigint_t ( multiplicand_size ) __attribute__ (( may_alias ))
- *multiplicand = ( ( const void * ) multiplicand0 );
- const bigint_t ( multiplier_size ) __attribute__ (( may_alias ))
- *multiplier = ( ( const void * ) multiplier0 );
- bigint_t ( result_size ) __attribute__ (( may_alias ))
- *result = ( ( void * ) result0 );
- unsigned int i;
- unsigned int j;
- uint64_t multiplicand_element;
- uint64_t multiplier_element;
- uint64_t *result_elements;
- uint64_t discard_low;
- uint64_t discard_high;
- uint64_t discard_temp_low;
- uint64_t discard_temp_high;
-
- /* Zero result */
- memset ( result, 0, sizeof ( *result ) );
-
- /* Multiply integers one element at a time */
- for ( i = 0 ; i < multiplicand_size ; i++ ) {
- multiplicand_element = multiplicand->element[i];
- for ( j = 0 ; j < multiplier_size ; j++ ) {
- multiplier_element = multiplier->element[j];
- result_elements = &result->element[ i + j ];
- /* Perform a single multiply, and add the
- * resulting double-element into the result,
- * carrying as necessary. The carry can
- * never overflow beyond the end of the
- * result, since:
- *
- * a < 2^{n}, b < 2^{m} => ab < 2^{n+m}
- */
- __asm__ __volatile__ ( "mul %1, %6, %7\n\t"
- "umulh %2, %6, %7\n\t"
- "ldp %3, %4, [%0]\n\t"
- "adds %3, %3, %1\n\t"
- "adcs %4, %4, %2\n\t"
- "stp %3, %4, [%0], #16\n\t"
- "bcc 2f\n\t"
- "\n1:\n\t"
- "ldr %3, [%0]\n\t"
- "adcs %3, %3, xzr\n\t"
- "str %3, [%0], #8\n\t"
- "bcs 1b\n\t"
- "\n2:\n\t"
- : "+r" ( result_elements ),
- "=&r" ( discard_low ),
- "=&r" ( discard_high ),
- "=r" ( discard_temp_low ),
- "=r" ( discard_temp_high ),
- "+m" ( *result )
- : "r" ( multiplicand_element ),
- "r" ( multiplier_element )
- : "cc" );
- }
- }
-}
diff --git a/src/arch/arm64/core/arm64_string.c b/src/arch/arm64/core/arm64_string.c
index 28a2b73bc..07a7eefdf 100644
--- a/src/arch/arm64/core/arm64_string.c
+++ b/src/arch/arm64/core/arm64_string.c
@@ -31,6 +31,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#include <string.h>
+/** Block size (for "ldp"/"stp") */
+#define ARM64_STRING_BLKSZ 16
+
/**
* Copy memory area
*
@@ -40,59 +43,70 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
* @ret dest Destination address
*/
void arm64_memcpy ( void *dest, const void *src, size_t len ) {
- void *discard_dest;
- void *discard_end;
- const void *discard_src;
- size_t discard_offset;
+ size_t len_pre;
+ size_t len_mid;
+ size_t len_post;
unsigned long discard_data;
unsigned long discard_low;
unsigned long discard_high;
+ unsigned long discard_len;
- /* If length is too short for an "ldp"/"stp" instruction pair,
- * then just copy individual bytes.
+ /* Calculate pre-aligned, aligned, and post-aligned lengths.
+ * (Align on the destination address, on the assumption that
+ * misaligned stores are likely to be more expensive than
+ * misaligned loads.)
*/
- if ( len < 16 ) {
- __asm__ __volatile__ ( "cbz %0, 2f\n\t"
- "\n1:\n\t"
- "sub %0, %0, #1\n\t"
- "ldrb %w1, [%3, %0]\n\t"
- "strb %w1, [%2, %0]\n\t"
- "cbnz %0, 1b\n\t"
- "\n2:\n\t"
- : "=&r" ( discard_offset ),
- "=&r" ( discard_data )
- : "r" ( dest ), "r" ( src ), "0" ( len )
- : "memory" );
- return;
- }
+ len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) &
+ ( ARM64_STRING_BLKSZ - 1 ) );
+ if ( len_pre > len )
+ len_pre = len;
+ len -= len_pre;
+ len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) );
+ len -= len_mid;
+ len_post = len;
- /* Use "ldp"/"stp" to copy 16 bytes at a time: one initial
- * potentially unaligned access, multiple destination-aligned
- * accesses, one final potentially unaligned access.
- */
- __asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t"
- "stp %3, %4, [%0], #16\n\t"
- "and %3, %0, #15\n\t"
- "sub %0, %0, %3\n\t"
- "sub %1, %1, %3\n\t"
- "bic %2, %5, #15\n\t"
- "b 2f\n\t"
+ /* Copy pre-aligned section */
+ __asm__ __volatile__ ( "cbz %2, 2f\n\t"
+ "\n1:\n\t"
+ "ldrb %w3, [%1], #1\n\t"
+ "strb %w3, [%0], #1\n\t"
+ "sub %2, %2, #1\n\t"
+ "cbnz %2, 1b\n\t"
+ "\n2:\n\t"
+ : "+r" ( dest ), "+r" ( src ),
+ "=&r" ( discard_len ),
+ "=&r" ( discard_data )
+ : "2" ( len_pre )
+ : "memory" );
+
+ /* Copy aligned section */
+ __asm__ __volatile__ ( "cbz %2, 2f\n\t"
"\n1:\n\t"
"ldp %3, %4, [%1], #16\n\t"
"stp %3, %4, [%0], #16\n\t"
+ "sub %2, %2, #16\n\t"
+ "cbnz %2, 1b\n\t"
"\n2:\n\t"
- "cmp %0, %2\n\t"
- "bne 1b\n\t"
- "ldp %3, %4, [%6, #-16]\n\t"
- "stp %3, %4, [%5, #-16]\n\t"
- : "=&r" ( discard_dest ),
- "=&r" ( discard_src ),
- "=&r" ( discard_end ),
+ : "+r" ( dest ), "+r" ( src ),
+ "=&r" ( discard_len ),
"=&r" ( discard_low ),
"=&r" ( discard_high )
- : "r" ( dest + len ), "r" ( src + len ),
- "0" ( dest ), "1" ( src )
- : "memory", "cc" );
+ : "2" ( len_mid )
+ : "memory" );
+
+ /* Copy post-aligned section */
+ __asm__ __volatile__ ( "cbz %2, 2f\n\t"
+ "\n1:\n\t"
+ "ldrb %w3, [%1], #1\n\t"
+ "strb %w3, [%0], #1\n\t"
+ "sub %2, %2, #1\n\t"
+ "cbnz %2, 1b\n\t"
+ "\n2:\n\t"
+ : "+r" ( dest ), "+r" ( src ),
+ "=&r" ( discard_len ),
+ "=&r" ( discard_data )
+ : "2" ( len_post )
+ : "memory" );
}
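The rewritten arm64_memcpy first splits the length into a byte-wise prefix that brings the destination up to a 16-byte boundary, a middle section of whole "ldp"/"stp" blocks, and a byte-wise suffix. A small stand-alone sketch of that length arithmetic (the helper name is hypothetical, not part of this patch):

#include <stddef.h>
#include <stdint.h>

#define BLKSZ 16	/* matches ARM64_STRING_BLKSZ */

/* Split "len" bytes starting at "dest" into pre-aligned, aligned and
 * post-aligned lengths.  For example, dest = 0x1003 and len = 100
 * splits as 13 + 80 + 7.
 */
static void split_lengths ( const void *dest, size_t len, size_t *pre,
			    size_t *mid, size_t *post ) {
	*pre = ( ( BLKSZ - ( ( intptr_t ) dest ) ) & ( BLKSZ - 1 ) );
	if ( *pre > len )
		*pre = len;
	len -= *pre;
	*mid = ( len & ~( ( size_t ) ( BLKSZ - 1 ) ) );
	*post = ( len - *mid );
}
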
/**
@@ -102,44 +116,56 @@ void arm64_memcpy ( void *dest, const void *src, size_t len ) {
* @v len Length
*/
void arm64_bzero ( void *dest, size_t len ) {
- size_t discard_offset;
- void *discard_dest;
- void *discard_end;
+ size_t len_pre;
+ size_t len_mid;
+ size_t len_post;
+ unsigned long discard_len;
- /* If length is too short for an "stp" instruction, then just
- * zero individual bytes.
- */
- if ( len < 16 ) {
- __asm__ __volatile__ ( "cbz %0, 2f\n\t"
- "\n1:\n\t"
- "sub %0, %0, #1\n\t"
- "strb wzr, [%1, %0]\n\t"
- "cbnz %0, 1b\n\t"
- "\n2:\n\t"
- : "=&r" ( discard_offset )
- : "r" ( dest ), "0" ( len )
- : "memory" );
- return;
- }
+ /* Calculate pre-aligned, aligned, and post-aligned lengths */
+ len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) &
+ ( ARM64_STRING_BLKSZ - 1 ) );
+ if ( len_pre > len )
+ len_pre = len;
+ len -= len_pre;
+ len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) );
+ len -= len_mid;
+ len_post = len;
- /* Use "stp" to zero 16 bytes at a time: one initial
- * potentially unaligned access, multiple aligned accesses,
- * one final potentially unaligned access.
- */
- __asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t"
- "bic %0, %0, #15\n\t"
- "bic %1, %2, #15\n\t"
- "b 2f\n\t"
+ /* Zero pre-aligned section */
+ __asm__ __volatile__ ( "cbz %1, 2f\n\t"
+ "\n1:\n\t"
+ "strb wzr, [%0], #1\n\t"
+ "sub %1, %1, #1\n\t"
+ "cbnz %1, 1b\n\t"
+ "\n2:\n\t"
+ : "+r" ( dest ),
+ "=&r" ( discard_len )
+ : "1" ( len_pre )
+ : "memory" );
+
+ /* Zero aligned section */
+ __asm__ __volatile__ ( "cbz %1, 2f\n\t"
"\n1:\n\t"
"stp xzr, xzr, [%0], #16\n\t"
+ "sub %1, %1, #16\n\t"
+ "cbnz %1, 1b\n\t"
"\n2:\n\t"
- "cmp %0, %1\n\t"
- "bne 1b\n\t"
- "stp xzr, xzr, [%2, #-16]\n\t"
- : "=&r" ( discard_dest ),
- "=&r" ( discard_end )
- : "r" ( dest + len ), "0" ( dest )
- : "memory", "cc" );
+ : "+r" ( dest ),
+ "=&r" ( discard_len )
+ : "1" ( len_mid )
+ : "memory" );
+
+ /* Zero post-aligned section */
+ __asm__ __volatile__ ( "cbz %1, 2f\n\t"
+ "\n1:\n\t"
+ "strb wzr, [%0], #1\n\t"
+ "sub %1, %1, #1\n\t"
+ "cbnz %1, 1b\n\t"
+ "\n2:\n\t"
+ : "+r" ( dest ),
+ "=&r" ( discard_len )
+ : "1" ( len_post )
+ : "memory" );
}
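
As a cross-check for the rewritten arm64_bzero, an equivalent plain C routine with the same three-phase shape might look as follows (sketch only; not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Byte stores up to the 16-byte boundary, 16-byte blocks in the middle
 * (where the assembly uses "stp xzr, xzr"), byte stores for the tail.
 */
static void bzero_reference ( void *dest, size_t len ) {
	uint8_t *p = dest;
	size_t pre = ( ( 16 - ( ( intptr_t ) p ) ) & 15 );
	size_t mid;

	if ( pre > len )
		pre = len;
	memset ( p, 0, pre );
	p += pre;
	len -= pre;
	mid = ( len & ~( ( size_t ) 15 ) );
	memset ( p, 0, mid );
	memset ( ( p + mid ), 0, ( len - mid ) );
}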
/**