mirror of
https://github.com/ipxe/ipxe
synced 2026-06-29 00:07:28 +03:00
[loong64] Replace optimised string operations
The current implementation of the optimised string operations appears to have been ported from the (old) arm64 implementation, and does not cleanly match the LoongArch64 instruction set. Replace with code derived from the riscv64 implementation, modified to use indexed load and store instructions.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
|
||||
* Copyright (c) 2023, Xiaotian Wu <wuxiaotian@loongson.cn>
|
||||
* Copyright (C) 2026 Michael Brown <mbrown@fensystems.co.uk>.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
@@ -29,6 +28,7 @@
|
||||
*/
|
||||
|
||||
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
||||
FILE_SECBOOT ( PERMITTED );
|
||||
|
||||
#include <string.h>
|
||||
|
||||
@@ -41,68 +41,65 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
||||
* @ret dest Destination address
|
||||
*/
|
||||
void loong64_memcpy ( void *dest, const void *src, size_t len ) {
|
||||
void *discard_dest;
|
||||
void *discard_end;
|
||||
const void *discard_src;
|
||||
size_t discard_offset;
|
||||
size_t len_pre;
|
||||
size_t len_mid;
|
||||
size_t len_post;
|
||||
size_t offset;
|
||||
unsigned long discard_data;
|
||||
unsigned long discard_low;
|
||||
unsigned long discard_high;
|
||||
|
||||
/* If length is too short, then just copy individual bytes.
|
||||
/* Calculate pre-aligned, aligned, and post-aligned lengths.
|
||||
* (Align on the destination address, on the assumption that
|
||||
* misaligned stores are likely to be more expensive than
|
||||
* misaligned loads.)
|
||||
*/
|
||||
if ( len < 16 ) {
|
||||
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
|
||||
len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
|
||||
( sizeof ( unsigned long ) - 1 ) );
|
||||
if ( len_pre > len )
|
||||
len_pre = len;
|
||||
len -= len_pre;
|
||||
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
|
||||
len -= len_mid;
|
||||
len_post = len;
|
||||
|
||||
/* Copy pre-aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"addi.d %0, %0, -1\n\t"
|
||||
"ldx.b %1, %3, %0\n\t"
|
||||
"stx.b %1, %2, %0\n\t"
|
||||
"bnez %0, 1b\n\t"
|
||||
"addi.d %0, %0, 1\n\t"
|
||||
"\n2:\n\t"
|
||||
: "=&r" ( discard_offset ),
|
||||
"=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ), "0" ( len )
|
||||
: "memory", "t0" );
|
||||
return;
|
||||
}
|
||||
"bne %0, %4, 1b\n\t"
|
||||
: "=&r" ( offset ), "=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ), "r" ( len_pre ),
|
||||
"0" ( 0 )
|
||||
: "memory" );
|
||||
|
||||
/* Copy 16 bytes at a time: one initial
|
||||
* potentially unaligned access, multiple destination-aligned
|
||||
* accesses, one final potentially unaligned access.
|
||||
*/
|
||||
__asm__ __volatile__ ( "ld.d %3, %1, 0\n\t"
|
||||
"ld.d %4, %1, 8\n\t"
|
||||
"addi.d %1, %1, 16\n\t"
|
||||
"st.d %3, %0, 0\n\t"
|
||||
"st.d %4, %0, 8\n\t"
|
||||
"addi.d %0, %0, 16\n\t"
|
||||
"andi %3, %0, 15\n\t"
|
||||
"sub.d %0, %0, %3\n\t"
|
||||
"sub.d %1, %1, %3\n\t"
|
||||
"addi.d $t0, $zero, 0xf\n\t"
|
||||
"andn %2, %5, $t0\n\t"
|
||||
"b 2f\n\t"
|
||||
/* Copy aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"ld.d %3, %1, 0\n\t"
|
||||
"ld.d %4, %1, 8\n\t"
|
||||
"addi.d %1, %1, 16\n\t"
|
||||
"st.d %3, %0, 0\n\t"
|
||||
"st.d %4, %0, 8\n\t"
|
||||
"addi.d %0, %0, 16\n\t"
|
||||
"ldx.d %1, %3, %0\n\t"
|
||||
"stx.d %1, %2, %0\n\t"
|
||||
"addi.d %0, %0, %5\n\t"
|
||||
"\n2:\n\t"
|
||||
"bne %0, %2, 1b\n\t"
|
||||
"ld.d %3, %6, -16\n\t"
|
||||
"ld.d %4, %6, -8\n\t"
|
||||
"st.d %3, %5, -16\n\t"
|
||||
"st.d %4, %5, -8\n\t"
|
||||
: "=&r" ( discard_dest ),
|
||||
"=&r" ( discard_src ),
|
||||
"=&r" ( discard_end ),
|
||||
"=&r" ( discard_low ),
|
||||
"=&r" ( discard_high )
|
||||
: "r" ( dest + len ), "r" ( src + len ),
|
||||
"0" ( dest ), "1" ( src )
|
||||
: "memory", "t0" );
|
||||
"bne %0, %4, 1b\n\t"
|
||||
: "+r" ( offset ), "=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ),
|
||||
"r" ( offset + len_mid ),
|
||||
"i" ( sizeof ( unsigned long ) )
|
||||
: "memory" );
|
||||
|
||||
/* Copy post-aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"ldx.b %1, %3, %0\n\t"
|
||||
"stx.b %1, %2, %0\n\t"
|
||||
"addi.d %0, %0, 1\n\t"
|
||||
"\n2:\n\t"
|
||||
"bne %0, %4, 1b\n\t"
|
||||
: "+r" ( offset ), "=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ),
|
||||
"r" ( offset + len_post )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -112,50 +109,54 @@ void loong64_memcpy ( void *dest, const void *src, size_t len ) {
|
||||
* @v len Length
|
||||
*/
|
||||
void loong64_bzero ( void *dest, size_t len ) {
|
||||
size_t discard_offset;
|
||||
void *discard_dest;
|
||||
void *discard_end;
|
||||
size_t len_pre;
|
||||
size_t len_mid;
|
||||
size_t len_post;
|
||||
size_t offset;
|
||||
|
||||
/* If length is too short, then just zero individual bytes.
|
||||
*/
|
||||
if ( len < 16 ) {
|
||||
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
|
||||
/* Calculate pre-aligned, aligned, and post-aligned lengths */
|
||||
len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
|
||||
( sizeof ( unsigned long ) - 1 ) );
|
||||
if ( len_pre > len )
|
||||
len_pre = len;
|
||||
len -= len_pre;
|
||||
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
|
||||
len -= len_mid;
|
||||
len_post = len;
|
||||
|
||||
/* Zero pre-aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"addi.d %0, %0, -1\n\t"
|
||||
"stx.b $zero, %1, %0\n\t"
|
||||
"bnez %0, 1b\n\t"
|
||||
"addi.d %0, %0, 1\n\t"
|
||||
"\n2:\n\t"
|
||||
: "=&r" ( discard_offset )
|
||||
: "r" ( dest ), "0" ( len )
|
||||
"bne %0, %2, 1b\n\t"
|
||||
: "=&r" ( offset )
|
||||
: "r" ( dest ), "r" ( len_pre ), "0" ( 0 )
|
||||
: "memory" );
|
||||
return;
|
||||
}
|
||||
|
||||
/* To zero 16 bytes at a time: one initial
|
||||
* potentially unaligned access, multiple aligned accesses,
|
||||
* one final potentially unaligned access.
|
||||
*/
|
||||
|
||||
__asm__ __volatile__ ( "st.d $zero, %0, 0\n\t"
|
||||
"st.d $zero, %0, 8\n\t"
|
||||
"addi.d %0, %0, 16\n\t"
|
||||
"addi.w $t0, $zero, 15\n\t"
|
||||
"andn %0, %0, $t0\n\t"
|
||||
"addi.w $t0, $zero, 15\n\t"
|
||||
"andn %1, %2, $t0\n\t"
|
||||
"b 2f\n\t"
|
||||
/* Zero aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"st.d $zero, %0, 0\n\t"
|
||||
"st.d $zero, %0, 8\n\t"
|
||||
"addi.d %0, %0, 16\n\t"
|
||||
"stx.d $zero, %1, %0\n\t"
|
||||
"addi.d %0, %0, %3\n\t"
|
||||
"\n2:\n\t"
|
||||
"bne %0, %1, 1b\n\t"
|
||||
"st.d $zero, %2, -16\n\t"
|
||||
"st.d $zero, %2, -8\n\t"
|
||||
: "=&r" ( discard_dest ),
|
||||
"=&r" ( discard_end )
|
||||
: "r" ( dest + len ), "0" ( dest )
|
||||
: "memory", "t0" );
|
||||
"bne %0, %2, 1b\n\t"
|
||||
: "+r" ( offset )
|
||||
: "r" ( dest ), "r" ( offset + len_mid ),
|
||||
"i" ( sizeof ( unsigned long ) )
|
||||
: "memory" );
|
||||
|
||||
/* Zero post-aligned section */
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"stx.b $zero, %1, %0\n\t"
|
||||
"addi.d %0, %0, 1\n\t"
|
||||
"\n2:\n\t"
|
||||
"bne %0, %2, 1b\n\t"
|
||||
: "+r" ( offset )
|
||||
: "r" ( dest ), "r" ( offset + len_post )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -166,10 +167,14 @@ void loong64_bzero ( void *dest, size_t len ) {
|
||||
* @v character Fill character
|
||||
*
|
||||
* The unusual parameter order is to allow for more efficient
|
||||
* tail-calling to loong64_memset() when zeroing a region.
|
||||
* tail-calling to loong64_bzero() when zeroing a region.
|
||||
*/
|
||||
void loong64_memset ( void *dest, size_t len, int character ) {
|
||||
size_t discard_offset;
|
||||
size_t offset;
|
||||
|
||||
/* Do nothing if length is zero */
|
||||
if ( ! len )
|
||||
return;
|
||||
|
||||
/* Use optimised zeroing code if applicable */
|
||||
if ( character == 0 ) {
|
||||
@@ -181,71 +186,14 @@ void loong64_memset ( void *dest, size_t len, int character ) {
|
||||
* value is relatively rare and unlikely to be
|
||||
* performance-critical.
|
||||
*/
|
||||
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"addi.d %0, %0, -1\n\t"
|
||||
__asm__ __volatile__ ( "\n1:\n\t"
|
||||
"stx.b %2, %1, %0\n\t"
|
||||
"bnez %0, 1b\n\t"
|
||||
"\n2:\n\t"
|
||||
: "=&r" ( discard_offset )
|
||||
: "r" ( dest ), "r" ( character ), "0" ( len )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy (possibly overlapping) memory region forwards
|
||||
*
|
||||
* @v dest Destination region
|
||||
* @v src Source region
|
||||
* @v len Length
|
||||
*/
|
||||
void loong64_memmove_forwards ( void *dest, const void *src, size_t len ) {
|
||||
void *discard_dest;
|
||||
const void *discard_src;
|
||||
unsigned long discard_data;
|
||||
|
||||
/* Assume memmove() is not performance-critical, and perform a
|
||||
* bytewise copy for simplicity.
|
||||
*/
|
||||
__asm__ __volatile__ ( "b 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"ld.b %2, %1, 0\n\t"
|
||||
"addi.d %1, %1, 1\n\t"
|
||||
"st.b %2, %0, 0\n\t"
|
||||
"addi.d %0, %0, 1\n\t"
|
||||
"\n2:\n\t"
|
||||
"bne %0, %3, 1b\n\t"
|
||||
: "=&r" ( discard_dest ),
|
||||
"=&r" ( discard_src ),
|
||||
"=&r" ( discard_data )
|
||||
: "r" ( dest + len ), "0" ( dest ), "1" ( src )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy (possibly overlapping) memory region backwards
|
||||
*
|
||||
* @v dest Destination region
|
||||
* @v src Source region
|
||||
* @v len Length
|
||||
*/
|
||||
void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
|
||||
size_t discard_offset;
|
||||
unsigned long discard_data;
|
||||
|
||||
/* Assume memmove() is not performance-critical, and perform a
|
||||
* bytewise copy for simplicity.
|
||||
*/
|
||||
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
|
||||
"\n1:\n\t"
|
||||
"addi.d %0, %0, -1\n\t"
|
||||
"ldx.b %1, %3, %0\n\t"
|
||||
"stx.b %1, %2, %0\n\t"
|
||||
"bnez %0, 1b\n\t"
|
||||
"\n2:\n\t"
|
||||
: "=&r" ( discard_offset ),
|
||||
"=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ), "0" ( len )
|
||||
: "=&r" ( offset )
|
||||
: "r" ( dest ), "r" ( character ), "r" ( len ),
|
||||
"0" ( 0 )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
@@ -257,10 +205,30 @@ void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
|
||||
* @v len Length
|
||||
*/
|
||||
void loong64_memmove ( void *dest, const void *src, size_t len ) {
|
||||
size_t offset;
|
||||
unsigned long discard_data;
|
||||
|
||||
/* Do nothing if length is zero */
|
||||
if ( ! len )
|
||||
return;
|
||||
|
||||
/* Use memcpy() if copy direction is forwards */
|
||||
if ( dest <= src ) {
|
||||
loong64_memmove_forwards ( dest, src, len );
|
||||
} else {
|
||||
loong64_memmove_backwards ( dest, src, len );
|
||||
memcpy ( dest, src, len );
|
||||
return;
|
||||
}
|
||||
|
||||
/* Assume memmove() is not performance-critical, and perform a
|
||||
* bytewise copy backwards for simplicity.
|
||||
*/
|
||||
__asm__ __volatile__ ( "\n1:\n\t"
|
||||
"addi.d %0, %0, -1\n\t"
|
||||
"ldx.b %1, %3, %0\n\t"
|
||||
"stx.b %1, %2, %0\n\t"
|
||||
"\n2:\n\t"
|
||||
"bnez %0, 1b\n\t"
|
||||
: "=&r" ( offset ), "=&r" ( discard_data )
|
||||
: "r" ( dest ), "r" ( src ),
|
||||
"0" ( len )
|
||||
: "memory" );
|
||||
}
|
||||
|
||||
@@ -13,8 +13,6 @@ FILE_SECBOOT ( PERMITTED );
|
||||
extern void loong64_bzero ( void *dest, size_t len );
|
||||
extern void loong64_memset ( void *dest, size_t len, int character );
|
||||
extern void loong64_memcpy ( void *dest, const void *src, size_t len );
|
||||
extern void loong64_memmove_forwards ( void *dest, const void *src, size_t len );
|
||||
extern void loong64_memmove_backwards ( void *dest, const void *src, size_t len );
|
||||
extern void loong64_memmove ( void *dest, const void *src, size_t len );
|
||||
|
||||
/**
|
||||
@@ -27,6 +25,14 @@ extern void loong64_memmove ( void *dest, const void *src, size_t len );
|
||||
*/
|
||||
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int character, size_t len ) {

	if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
		/* Fill value is a build-time constant zero: call the
		 * dedicated zeroing routine directly.
		 */
		loong64_bzero ( dest, len );
	} else {
		/* Fill value is not known to be zero at build time:
		 * use the general fill routine.  Note its argument
		 * order is ( dest, len, character ).
		 */
		loong64_memset ( dest, len, character );
	}

	/* Either way, the destination pointer is the result */
	return dest;
}
|
||||
@@ -41,6 +47,7 @@ memset ( void *dest, int character, size_t len ) {
|
||||
*/
|
||||
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {

	/* All copies, whatever their length, are handled by the
	 * external variable-length copy routine.
	 */
	loong64_memcpy ( dest, src, len );

	/* The helper returns nothing, so supply the destination
	 * pointer as the result here.
	 */
	return dest;
}
|
||||
@@ -55,6 +62,17 @@ memcpy ( void *dest, const void *src, size_t len ) {
|
||||
*/
|
||||
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t delta = ( dest - src );

	if ( __builtin_constant_p ( delta ) && ( delta <= 0 ) ) {
		/* The compiler can prove at build time that the
		 * destination does not lie above the source, so the
		 * copy direction is forwards and the variable-length
		 * memcpy() code can be used.
		 */
		loong64_memcpy ( dest, src, len );
	} else {
		/* Direction is not known (or not known to be
		 * forwards) at build time: use the ambidirectional
		 * copy.
		 */
		loong64_memmove ( dest, src, len );
	}

	return dest;
}
|
||||
|
||||
Reference in New Issue
Block a user