From 086e9dc0e2ca925b1b58caefd04ed2757d14790b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 5 Oct 2012 17:02:09 +0100 Subject: metag: Optimised library functions Add optimised library functions for metag. Signed-off-by: James Hogan --- arch/metag/lib/memcpy.S | 185 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 arch/metag/lib/memcpy.S (limited to 'arch/metag/lib/memcpy.S') diff --git a/arch/metag/lib/memcpy.S b/arch/metag/lib/memcpy.S new file mode 100644 index 000000000000..46b7a2b9479e --- /dev/null +++ b/arch/metag/lib/memcpy.S @@ -0,0 +1,185 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memcpy + .type _memcpy,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memcpy: + CMP D1Ar3, #16 + MOV A1.2, D0Ar2 ! source pointer + MOV A0.2, D1Ar1 ! destination pointer + MOV A0.3, D1Ar1 ! for return value +! If there are less than 16 bytes to copy use the byte copy loop + BGE $Llong_copy + +$Lbyte_copy: +! Simply copy a byte at a time + SUBS TXRPT, D1Ar3, #1 + BLT $Lend +$Lloop_byte: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + BR $Lloop_byte + +$Lend: +! Finally set return value and return + MOV D0Re0, A0.3 + MOV PC, D1RtP + +$Llong_copy: + ANDS D1Ar5, D1Ar1, #7 ! test destination alignment + BZ $Laligned_dst + +! The destination address is not 8 byte aligned. We will copy bytes from +! the source to the destination until the remaining data has an 8 byte +! destination address alignment (i.e we should never copy more than 7 +! bytes here). +$Lalign_dst: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 + SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lalign_dst + +! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte +! blocks, then jump to the unaligned copy loop or fall through to the aligned +! copy loop as appropriate. +$Laligned_dst: + MOV D0Ar4, A1.2 + LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks + ANDS D0Ar4, D0Ar4, #7 ! test source alignment + BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop + +! Both source and destination are 8 byte aligned - the easy case. +$Laligned_copy: + LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks + BZ $Lbyte_copy + SUB TXRPT, D1Ar5, #1 + +$Laligned_32: + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + BR $Laligned_32 + +! If there are any remaining bytes use the byte copy loop, otherwise we are done + ANDS D1Ar3, D1Ar3, #0x1f + BNZ $Lbyte_copy + B $Lend + +! The destination is 8 byte aligned but the source is not, and there are 8 +! or more bytes to be copied. +$Lunaligned_copy: +! Adjust the source pointer (A1.2) to the 8 byte boundary before its +! current value + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 +! Save the number of bytes of mis-alignment in D0Ar4 for use later + SUBS D0Ar6, D0Ar6, D0Ar4 + MOV D0Ar4, D0Ar6 +! if there is no mis-alignment after all, use the aligned copy loop + BZ $Laligned_copy + +! prefetch 8 bytes + GETL D0Re0, D1Re0, [A1.2] + + SUB TXRPT, D1Ar5, #1 + +! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly +! 4 bytes, and more than 4 bytes. + CMP D0Ar6, #4 + BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop + BZ $Lunaligned_4 ! use 4 byte mis-alignment loop + +! The mis-alignment is more than 4 bytes +$Lunaligned_5_6_7: + SUB D0Ar6, D0Ar6, #4 +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 +! Move data 4 bytes before we enter the main loop + MOV D0Re0, D1Re0 + +$Lloop_5_6_7: + GETL D0Ar2, D1Ar1, [++A1.2] +! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Re0 + + LSR D0Ar2, D0Ar2, D0Ar6 + LSL D1Re0, D1Ar1, D1Ar5 + ADD D1Re0, D1Re0, D0Ar2 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_5_6_7 + + B $Lunaligned_end + +$Lunaligned_1_2_3: +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 + +$Lloop_1_2_3: +! form 64-bit data in D0Re0,D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + LSL D1Ar1, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Ar1 + MOV D0Ar2, D1Re0 + LSR D0FrT, D0Ar2, D0Ar6 + GETL D0Ar2, D1Ar1, [++A1.2] + + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D1Re0, D1Re0, D0FrT + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0Ar2 + MOV D1Re0, D1Ar1 + BR $Lloop_1_2_3 + + B $Lunaligned_end + +! The 4 byte mis-alignment case - this does not require any shifting, just a +! shuffling of registers. +$Lunaligned_4: + MOV D0Re0, D1Re0 +$Lloop_4: + GETL D0Ar2, D1Ar1, [++A1.2] + MOV D1Re0, D0Ar2 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_4 + +$Lunaligned_end: +! If there are no remaining bytes to copy, we are done. + ANDS D1Ar3, D1Ar3, #7 + BZ $Lend +! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte +! address of the remaining bytes, and fall through to the byte copy loop. + MOV D0Ar6, A1.2 + ADD D1Ar5, D0Ar4, D0Ar6 + MOV A1.2, D1Ar5 + B $Lbyte_copy + + .size _memcpy,.-_memcpy -- cgit v1.2.3