[loong64] Replace optimised string operations

The current implementation of the optimised string operations appears
to have been ported from the (old) arm64 implementation, and does not
cleanly match the LoongArch64 instruction set.

Replace with code derived from the riscv64 implementation, modified to
use indexed load and store instructions.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
Michael Brown
2026-05-21 15:19:10 +01:00
parent 63eeb23ad6
commit 6dcc401054
2 changed files with 152 additions and 166 deletions
+124 -156
View File
@@ -1,6 +1,5 @@
/* /*
* Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>. * Copyright (C) 2026 Michael Brown <mbrown@fensystems.co.uk>.
* Copyright (c) 2023, Xiaotian Wu <wuxiaotian@loongson.cn>
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as * modify it under the terms of the GNU General Public License as
@@ -29,6 +28,7 @@
*/ */
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
FILE_SECBOOT ( PERMITTED );
#include <string.h> #include <string.h>
@@ -41,68 +41,65 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
* @ret dest Destination address * @ret dest Destination address
*/ */
void loong64_memcpy ( void *dest, const void *src, size_t len ) { void loong64_memcpy ( void *dest, const void *src, size_t len ) {
void *discard_dest; size_t len_pre;
void *discard_end; size_t len_mid;
const void *discard_src; size_t len_post;
size_t discard_offset; size_t offset;
unsigned long discard_data; unsigned long discard_data;
unsigned long discard_low;
unsigned long discard_high;
/* If length is too short, then just copy individual bytes. /* Calculate pre-aligned, aligned, and post-aligned lengths.
* (Align on the destination address, on the assumption that
* misaligned stores are likely to be more expensive than
* misaligned loads.)
*/ */
if ( len < 16 ) { len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
__asm__ __volatile__ ( "beqz %0, 2f\n\t" ( sizeof ( unsigned long ) - 1 ) );
if ( len_pre > len )
len_pre = len;
len -= len_pre;
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
len -= len_mid;
len_post = len;
/* Copy pre-aligned section */
__asm__ __volatile__ ( "b 2f\n\t"
"\n1:\n\t" "\n1:\n\t"
"addi.d %0, %0, -1\n\t"
"ldx.b %1, %3, %0\n\t" "ldx.b %1, %3, %0\n\t"
"stx.b %1, %2, %0\n\t" "stx.b %1, %2, %0\n\t"
"bnez %0, 1b\n\t" "addi.d %0, %0, 1\n\t"
"\n2:\n\t" "\n2:\n\t"
: "=&r" ( discard_offset ), "bne %0, %4, 1b\n\t"
"=&r" ( discard_data ) : "=&r" ( offset ), "=&r" ( discard_data )
: "r" ( dest ), "r" ( src ), "0" ( len ) : "r" ( dest ), "r" ( src ), "r" ( len_pre ),
: "memory", "t0" ); "0" ( 0 )
return; : "memory" );
}
/* Copy 16 bytes at a time: one initial /* Copy aligned section */
* potentially unaligned access, multiple destination-aligned __asm__ __volatile__ ( "b 2f\n\t"
* accesses, one final potentially unaligned access.
*/
__asm__ __volatile__ ( "ld.d %3, %1, 0\n\t"
"ld.d %4, %1, 8\n\t"
"addi.d %1, %1, 16\n\t"
"st.d %3, %0, 0\n\t"
"st.d %4, %0, 8\n\t"
"addi.d %0, %0, 16\n\t"
"andi %3, %0, 15\n\t"
"sub.d %0, %0, %3\n\t"
"sub.d %1, %1, %3\n\t"
"addi.d $t0, $zero, 0xf\n\t"
"andn %2, %5, $t0\n\t"
"b 2f\n\t"
"\n1:\n\t" "\n1:\n\t"
"ld.d %3, %1, 0\n\t" "ldx.d %1, %3, %0\n\t"
"ld.d %4, %1, 8\n\t" "stx.d %1, %2, %0\n\t"
"addi.d %1, %1, 16\n\t" "addi.d %0, %0, %5\n\t"
"st.d %3, %0, 0\n\t"
"st.d %4, %0, 8\n\t"
"addi.d %0, %0, 16\n\t"
"\n2:\n\t" "\n2:\n\t"
"bne %0, %2, 1b\n\t" "bne %0, %4, 1b\n\t"
"ld.d %3, %6, -16\n\t" : "+r" ( offset ), "=&r" ( discard_data )
"ld.d %4, %6, -8\n\t" : "r" ( dest ), "r" ( src ),
"st.d %3, %5, -16\n\t" "r" ( offset + len_mid ),
"st.d %4, %5, -8\n\t" "i" ( sizeof ( unsigned long ) )
: "=&r" ( discard_dest ), : "memory" );
"=&r" ( discard_src ),
"=&r" ( discard_end ), /* Copy post-aligned section */
"=&r" ( discard_low ), __asm__ __volatile__ ( "b 2f\n\t"
"=&r" ( discard_high ) "\n1:\n\t"
: "r" ( dest + len ), "r" ( src + len ), "ldx.b %1, %3, %0\n\t"
"0" ( dest ), "1" ( src ) "stx.b %1, %2, %0\n\t"
: "memory", "t0" ); "addi.d %0, %0, 1\n\t"
"\n2:\n\t"
"bne %0, %4, 1b\n\t"
: "+r" ( offset ), "=&r" ( discard_data )
: "r" ( dest ), "r" ( src ),
"r" ( offset + len_post )
: "memory" );
} }
/** /**
@@ -112,50 +109,54 @@ void loong64_memcpy ( void *dest, const void *src, size_t len ) {
* @v len Length * @v len Length
*/ */
void loong64_bzero ( void *dest, size_t len ) { void loong64_bzero ( void *dest, size_t len ) {
size_t discard_offset; size_t len_pre;
void *discard_dest; size_t len_mid;
void *discard_end; size_t len_post;
size_t offset;
/* If length is too short, then just zero individual bytes. /* Calculate pre-aligned, aligned, and post-aligned lengths */
*/ len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
if ( len < 16 ) { ( sizeof ( unsigned long ) - 1 ) );
__asm__ __volatile__ ( "beqz %0, 2f\n\t" if ( len_pre > len )
len_pre = len;
len -= len_pre;
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
len -= len_mid;
len_post = len;
/* Zero pre-aligned section */
__asm__ __volatile__ ( "b 2f\n\t"
"\n1:\n\t" "\n1:\n\t"
"addi.d %0, %0, -1\n\t"
"stx.b $zero, %1, %0\n\t" "stx.b $zero, %1, %0\n\t"
"bnez %0, 1b\n\t" "addi.d %0, %0, 1\n\t"
"\n2:\n\t" "\n2:\n\t"
: "=&r" ( discard_offset ) "bne %0, %2, 1b\n\t"
: "r" ( dest ), "0" ( len ) : "=&r" ( offset )
: "r" ( dest ), "r" ( len_pre ), "0" ( 0 )
: "memory" ); : "memory" );
return;
}
/* To zero 16 bytes at a time: one initial /* Zero aligned section */
* potentially unaligned access, multiple aligned accesses, __asm__ __volatile__ ( "b 2f\n\t"
* one final potentially unaligned access.
*/
__asm__ __volatile__ ( "st.d $zero, %0, 0\n\t"
"st.d $zero, %0, 8\n\t"
"addi.d %0, %0, 16\n\t"
"addi.w $t0, $zero, 15\n\t"
"andn %0, %0, $t0\n\t"
"addi.w $t0, $zero, 15\n\t"
"andn %1, %2, $t0\n\t"
"b 2f\n\t"
"\n1:\n\t" "\n1:\n\t"
"st.d $zero, %0, 0\n\t" "stx.d $zero, %1, %0\n\t"
"st.d $zero, %0, 8\n\t" "addi.d %0, %0, %3\n\t"
"addi.d %0, %0, 16\n\t"
"\n2:\n\t" "\n2:\n\t"
"bne %0, %1, 1b\n\t" "bne %0, %2, 1b\n\t"
"st.d $zero, %2, -16\n\t" : "+r" ( offset )
"st.d $zero, %2, -8\n\t" : "r" ( dest ), "r" ( offset + len_mid ),
: "=&r" ( discard_dest ), "i" ( sizeof ( unsigned long ) )
"=&r" ( discard_end ) : "memory" );
: "r" ( dest + len ), "0" ( dest )
: "memory", "t0" ); /* Zero post-aligned section */
__asm__ __volatile__ ( "b 2f\n\t"
"\n1:\n\t"
"stx.b $zero, %1, %0\n\t"
"addi.d %0, %0, 1\n\t"
"\n2:\n\t"
"bne %0, %2, 1b\n\t"
: "+r" ( offset )
: "r" ( dest ), "r" ( offset + len_post )
: "memory" );
} }
/** /**
@@ -166,10 +167,14 @@ void loong64_bzero ( void *dest, size_t len ) {
* @v character Fill character * @v character Fill character
* *
* The unusual parameter order is to allow for more efficient * The unusual parameter order is to allow for more efficient
* tail-calling to loong64_memset() when zeroing a region. * tail-calling to loong64_bzero() when zeroing a region.
*/ */
void loong64_memset ( void *dest, size_t len, int character ) { void loong64_memset ( void *dest, size_t len, int character ) {
size_t discard_offset; size_t offset;
/* Do nothing if length is zero */
if ( ! len )
return;
/* Use optimised zeroing code if applicable */ /* Use optimised zeroing code if applicable */
if ( character == 0 ) { if ( character == 0 ) {
@@ -181,71 +186,14 @@ void loong64_memset ( void *dest, size_t len, int character ) {
* value is relatively rare and unlikely to be * value is relatively rare and unlikely to be
* performance-critical. * performance-critical.
*/ */
__asm__ __volatile__ ( "beqz %0, 2f\n\t" __asm__ __volatile__ ( "\n1:\n\t"
"\n1:\n\t"
"addi.d %0, %0, -1\n\t"
"stx.b %2, %1, %0\n\t" "stx.b %2, %1, %0\n\t"
"bnez %0, 1b\n\t"
"\n2:\n\t"
: "=&r" ( discard_offset )
: "r" ( dest ), "r" ( character ), "0" ( len )
: "memory" );
}
/**
* Copy (possibly overlapping) memory region forwards
*
* @v dest Destination region
* @v src Source region
* @v len Length
*/
void loong64_memmove_forwards ( void *dest, const void *src, size_t len ) {
void *discard_dest;
const void *discard_src;
unsigned long discard_data;
/* Assume memmove() is not performance-critical, and perform a
* bytewise copy for simplicity.
*/
__asm__ __volatile__ ( "b 2f\n\t"
"\n1:\n\t"
"ld.b %2, %1, 0\n\t"
"addi.d %1, %1, 1\n\t"
"st.b %2, %0, 0\n\t"
"addi.d %0, %0, 1\n\t" "addi.d %0, %0, 1\n\t"
"\n2:\n\t" "\n2:\n\t"
"bne %0, %3, 1b\n\t" "bne %0, %3, 1b\n\t"
: "=&r" ( discard_dest ), : "=&r" ( offset )
"=&r" ( discard_src ), : "r" ( dest ), "r" ( character ), "r" ( len ),
"=&r" ( discard_data ) "0" ( 0 )
: "r" ( dest + len ), "0" ( dest ), "1" ( src )
: "memory" );
}
/**
* Copy (possibly overlapping) memory region backwards
*
* @v dest Destination region
* @v src Source region
* @v len Length
*/
void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
size_t discard_offset;
unsigned long discard_data;
/* Assume memmove() is not performance-critical, and perform a
* bytewise copy for simplicity.
*/
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
"\n1:\n\t"
"addi.d %0, %0, -1\n\t"
"ldx.b %1, %3, %0\n\t"
"stx.b %1, %2, %0\n\t"
"bnez %0, 1b\n\t"
"\n2:\n\t"
: "=&r" ( discard_offset ),
"=&r" ( discard_data )
: "r" ( dest ), "r" ( src ), "0" ( len )
: "memory" ); : "memory" );
} }
@@ -257,10 +205,30 @@ void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
* @v len Length * @v len Length
*/ */
void loong64_memmove ( void *dest, const void *src, size_t len ) { void loong64_memmove ( void *dest, const void *src, size_t len ) {
size_t offset;
unsigned long discard_data;
/* Do nothing if length is zero */
if ( ! len )
return;
/* Use memcpy() if copy direction is forwards */
if ( dest <= src ) { if ( dest <= src ) {
loong64_memmove_forwards ( dest, src, len ); memcpy ( dest, src, len );
} else { return;
loong64_memmove_backwards ( dest, src, len );
} }
/* Assume memmove() is not performance-critical, and perform a
* bytewise copy backwards for simplicity.
*/
__asm__ __volatile__ ( "\n1:\n\t"
"addi.d %0, %0, -1\n\t"
"ldx.b %1, %3, %0\n\t"
"stx.b %1, %2, %0\n\t"
"\n2:\n\t"
"bnez %0, 1b\n\t"
: "=&r" ( offset ), "=&r" ( discard_data )
: "r" ( dest ), "r" ( src ),
"0" ( len )
: "memory" );
} }
+20 -2
View File
@@ -13,8 +13,6 @@ FILE_SECBOOT ( PERMITTED );
extern void loong64_bzero ( void *dest, size_t len ); extern void loong64_bzero ( void *dest, size_t len );
extern void loong64_memset ( void *dest, size_t len, int character ); extern void loong64_memset ( void *dest, size_t len, int character );
extern void loong64_memcpy ( void *dest, const void *src, size_t len ); extern void loong64_memcpy ( void *dest, const void *src, size_t len );
extern void loong64_memmove_forwards ( void *dest, const void *src, size_t len );
extern void loong64_memmove_backwards ( void *dest, const void *src, size_t len );
extern void loong64_memmove ( void *dest, const void *src, size_t len ); extern void loong64_memmove ( void *dest, const void *src, size_t len );
/** /**
@@ -27,6 +25,14 @@ extern void loong64_memmove ( void *dest, const void *src, size_t len );
*/ */
static inline __attribute__ (( always_inline )) void * static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int character, size_t len ) { memset ( void *dest, int character, size_t len ) {
/* Zeroing: use the optimised variable-length zeroing code */
if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
loong64_bzero ( dest, len );
return dest;
}
/* Not necessarily zeroing: use basic variable-length code */
loong64_memset ( dest, len, character ); loong64_memset ( dest, len, character );
return dest; return dest;
} }
@@ -41,6 +47,7 @@ memset ( void *dest, int character, size_t len ) {
*/ */
static inline __attribute__ (( always_inline )) void * static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) { memcpy ( void *dest, const void *src, size_t len ) {
loong64_memcpy ( dest, src, len ); loong64_memcpy ( dest, src, len );
return dest; return dest;
} }
@@ -55,6 +62,17 @@ memcpy ( void *dest, const void *src, size_t len ) {
*/ */
static inline __attribute__ (( always_inline )) void * static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) { memmove ( void *dest, const void *src, size_t len ) {
ssize_t offset = ( dest - src );
/* If direction of copy is known to be forwards at build time,
* then use variable-length memcpy().
*/
if ( __builtin_constant_p ( offset ) && ( offset <= 0 ) ) {
loong64_memcpy ( dest, src, len );
return dest;
}
/* Otherwise, use ambidirectional copy */
loong64_memmove ( dest, src, len ); loong64_memmove ( dest, src, len );
return dest; return dest;
} }