mirror of
https://github.com/ipxe/ipxe
synced 2026-02-14 02:31:26 +03:00
[arm] Avoid unaligned accesses for memcpy() and memset()
iPXE runs only in environments that support unaligned accesses to RAM. However, memcpy() and memset() are also used to write to graphical framebuffer memory, which may support only aligned accesses on some CPU architectures such as ARM.

Restructure the 64-bit ARM memcpy() and memset() routines along the lines of the RISC-V implementations, which split the region into pre-aligned, aligned, and post-aligned sections.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
@@ -31,6 +31,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
|||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
/** Block size (for "ldp"/"stp") */
|
||||||
|
#define ARM64_STRING_BLKSZ 16
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copy memory area
|
* Copy memory area
|
||||||
*
|
*
|
||||||
@@ -40,59 +43,70 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
|||||||
* @ret dest Destination address
|
* @ret dest Destination address
|
||||||
*/
|
*/
|
||||||
void arm64_memcpy ( void *dest, const void *src, size_t len ) {
|
void arm64_memcpy ( void *dest, const void *src, size_t len ) {
|
||||||
void *discard_dest;
|
size_t len_pre;
|
||||||
void *discard_end;
|
size_t len_mid;
|
||||||
const void *discard_src;
|
size_t len_post;
|
||||||
size_t discard_offset;
|
|
||||||
unsigned long discard_data;
|
unsigned long discard_data;
|
||||||
unsigned long discard_low;
|
unsigned long discard_low;
|
||||||
unsigned long discard_high;
|
unsigned long discard_high;
|
||||||
|
unsigned long discard_len;
|
||||||
|
|
||||||
/* If length is too short for an "ldp"/"stp" instruction pair,
|
/* Calculate pre-aligned, aligned, and post-aligned lengths.
|
||||||
* then just copy individual bytes.
|
* (Align on the destination address, on the assumption that
|
||||||
|
* misaligned stores are likely to be more expensive than
|
||||||
|
* misaligned loads.)
|
||||||
*/
|
*/
|
||||||
if ( len < 16 ) {
|
len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) &
|
||||||
__asm__ __volatile__ ( "cbz %0, 2f\n\t"
|
( ARM64_STRING_BLKSZ - 1 ) );
|
||||||
"\n1:\n\t"
|
if ( len_pre > len )
|
||||||
"sub %0, %0, #1\n\t"
|
len_pre = len;
|
||||||
"ldrb %w1, [%3, %0]\n\t"
|
len -= len_pre;
|
||||||
"strb %w1, [%2, %0]\n\t"
|
len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) );
|
||||||
"cbnz %0, 1b\n\t"
|
len -= len_mid;
|
||||||
"\n2:\n\t"
|
len_post = len;
|
||||||
: "=&r" ( discard_offset ),
|
|
||||||
"=&r" ( discard_data )
|
|
||||||
: "r" ( dest ), "r" ( src ), "0" ( len )
|
|
||||||
: "memory" );
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Use "ldp"/"stp" to copy 16 bytes at a time: one initial
|
/* Copy pre-aligned section */
|
||||||
* potentially unaligned access, multiple destination-aligned
|
__asm__ __volatile__ ( "cbz %2, 2f\n\t"
|
||||||
* accesses, one final potentially unaligned access.
|
"\n1:\n\t"
|
||||||
*/
|
"ldrb %w3, [%1], #1\n\t"
|
||||||
__asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t"
|
"strb %w3, [%0], #1\n\t"
|
||||||
"stp %3, %4, [%0], #16\n\t"
|
"sub %2, %2, #1\n\t"
|
||||||
"and %3, %0, #15\n\t"
|
"cbnz %2, 1b\n\t"
|
||||||
"sub %0, %0, %3\n\t"
|
"\n2:\n\t"
|
||||||
"sub %1, %1, %3\n\t"
|
: "+r" ( dest ), "+r" ( src ),
|
||||||
"bic %2, %5, #15\n\t"
|
"=&r" ( discard_len ),
|
||||||
"b 2f\n\t"
|
"=&r" ( discard_data )
|
||||||
|
: "2" ( len_pre )
|
||||||
|
: "memory" );
|
||||||
|
|
||||||
|
/* Copy aligned section */
|
||||||
|
__asm__ __volatile__ ( "cbz %2, 2f\n\t"
|
||||||
"\n1:\n\t"
|
"\n1:\n\t"
|
||||||
"ldp %3, %4, [%1], #16\n\t"
|
"ldp %3, %4, [%1], #16\n\t"
|
||||||
"stp %3, %4, [%0], #16\n\t"
|
"stp %3, %4, [%0], #16\n\t"
|
||||||
|
"sub %2, %2, #16\n\t"
|
||||||
|
"cbnz %2, 1b\n\t"
|
||||||
"\n2:\n\t"
|
"\n2:\n\t"
|
||||||
"cmp %0, %2\n\t"
|
: "+r" ( dest ), "+r" ( src ),
|
||||||
"bne 1b\n\t"
|
"=&r" ( discard_len ),
|
||||||
"ldp %3, %4, [%6, #-16]\n\t"
|
|
||||||
"stp %3, %4, [%5, #-16]\n\t"
|
|
||||||
: "=&r" ( discard_dest ),
|
|
||||||
"=&r" ( discard_src ),
|
|
||||||
"=&r" ( discard_end ),
|
|
||||||
"=&r" ( discard_low ),
|
"=&r" ( discard_low ),
|
||||||
"=&r" ( discard_high )
|
"=&r" ( discard_high )
|
||||||
: "r" ( dest + len ), "r" ( src + len ),
|
: "2" ( len_mid )
|
||||||
"0" ( dest ), "1" ( src )
|
: "memory" );
|
||||||
: "memory", "cc" );
|
|
||||||
|
/* Copy post-aligned section */
|
||||||
|
__asm__ __volatile__ ( "cbz %2, 2f\n\t"
|
||||||
|
"\n1:\n\t"
|
||||||
|
"ldrb %w3, [%1], #1\n\t"
|
||||||
|
"strb %w3, [%0], #1\n\t"
|
||||||
|
"sub %2, %2, #1\n\t"
|
||||||
|
"cbnz %2, 1b\n\t"
|
||||||
|
"\n2:\n\t"
|
||||||
|
: "+r" ( dest ), "+r" ( src ),
|
||||||
|
"=&r" ( discard_len ),
|
||||||
|
"=&r" ( discard_data )
|
||||||
|
: "2" ( len_post )
|
||||||
|
: "memory" );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -102,44 +116,56 @@ void arm64_memcpy ( void *dest, const void *src, size_t len ) {
|
|||||||
* @v len Length
|
* @v len Length
|
||||||
*/
|
*/
|
||||||
void arm64_bzero ( void *dest, size_t len ) {
|
void arm64_bzero ( void *dest, size_t len ) {
|
||||||
size_t discard_offset;
|
size_t len_pre;
|
||||||
void *discard_dest;
|
size_t len_mid;
|
||||||
void *discard_end;
|
size_t len_post;
|
||||||
|
unsigned long discard_len;
|
||||||
|
|
||||||
/* If length is too short for an "stp" instruction, then just
|
/* Calculate pre-aligned, aligned, and post-aligned lengths */
|
||||||
* zero individual bytes.
|
len_pre = ( ( ARM64_STRING_BLKSZ - ( ( intptr_t ) dest ) ) &
|
||||||
*/
|
( ARM64_STRING_BLKSZ - 1 ) );
|
||||||
if ( len < 16 ) {
|
if ( len_pre > len )
|
||||||
__asm__ __volatile__ ( "cbz %0, 2f\n\t"
|
len_pre = len;
|
||||||
"\n1:\n\t"
|
len -= len_pre;
|
||||||
"sub %0, %0, #1\n\t"
|
len_mid = ( len & ~( ARM64_STRING_BLKSZ - 1 ) );
|
||||||
"strb wzr, [%1, %0]\n\t"
|
len -= len_mid;
|
||||||
"cbnz %0, 1b\n\t"
|
len_post = len;
|
||||||
"\n2:\n\t"
|
|
||||||
: "=&r" ( discard_offset )
|
|
||||||
: "r" ( dest ), "0" ( len )
|
|
||||||
: "memory" );
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Use "stp" to zero 16 bytes at a time: one initial
|
/* Zero pre-aligned section */
|
||||||
* potentially unaligned access, multiple aligned accesses,
|
__asm__ __volatile__ ( "cbz %1, 2f\n\t"
|
||||||
* one final potentially unaligned access.
|
"\n1:\n\t"
|
||||||
*/
|
"strb wzr, [%0], #1\n\t"
|
||||||
__asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t"
|
"sub %1, %1, #1\n\t"
|
||||||
"bic %0, %0, #15\n\t"
|
"cbnz %1, 1b\n\t"
|
||||||
"bic %1, %2, #15\n\t"
|
"\n2:\n\t"
|
||||||
"b 2f\n\t"
|
: "+r" ( dest ),
|
||||||
|
"=&r" ( discard_len )
|
||||||
|
: "1" ( len_pre )
|
||||||
|
: "memory" );
|
||||||
|
|
||||||
|
/* Zero aligned section */
|
||||||
|
__asm__ __volatile__ ( "cbz %1, 2f\n\t"
|
||||||
"\n1:\n\t"
|
"\n1:\n\t"
|
||||||
"stp xzr, xzr, [%0], #16\n\t"
|
"stp xzr, xzr, [%0], #16\n\t"
|
||||||
|
"sub %1, %1, #16\n\t"
|
||||||
|
"cbnz %1, 1b\n\t"
|
||||||
"\n2:\n\t"
|
"\n2:\n\t"
|
||||||
"cmp %0, %1\n\t"
|
: "+r" ( dest ),
|
||||||
"bne 1b\n\t"
|
"=&r" ( discard_len )
|
||||||
"stp xzr, xzr, [%2, #-16]\n\t"
|
: "1" ( len_mid )
|
||||||
: "=&r" ( discard_dest ),
|
: "memory" );
|
||||||
"=&r" ( discard_end )
|
|
||||||
: "r" ( dest + len ), "0" ( dest )
|
/* Zero post-aligned section */
|
||||||
: "memory", "cc" );
|
__asm__ __volatile__ ( "cbz %1, 2f\n\t"
|
||||||
|
"\n1:\n\t"
|
||||||
|
"strb wzr, [%0], #1\n\t"
|
||||||
|
"sub %1, %1, #1\n\t"
|
||||||
|
"cbnz %1, 1b\n\t"
|
||||||
|
"\n2:\n\t"
|
||||||
|
: "+r" ( dest ),
|
||||||
|
"=&r" ( discard_len )
|
||||||
|
: "1" ( len_post )
|
||||||
|
: "memory" );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user