From d17ecf443d8fdccf2e1674b1ac9d73bc9c9429c7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 18 Mar 2022 11:37:09 +0100 Subject: csky: fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Guo Ren --- arch/csky/kernel/module.c | 2 +- arch/csky/kernel/probes/uprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/csky/kernel/module.c b/arch/csky/kernel/module.c index 6cd82d69c655..f11b3e573344 100644 --- a/arch/csky/kernel/module.c +++ b/arch/csky/kernel/module.c @@ -68,7 +68,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, *location = rel[i].r_addend + sym->st_value; break; case R_CSKY_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location = rel[i].r_addend + sym->st_value - (uint32_t)location; break; diff --git a/arch/csky/kernel/probes/uprobes.c b/arch/csky/kernel/probes/uprobes.c index 1a9e0961b2b5..2d31a12e46cf 100644 --- a/arch/csky/kernel/probes/uprobes.c +++ b/arch/csky/kernel/probes/uprobes.c @@ -102,7 +102,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); -- cgit v1.2.3 From 8c4d16471e2babe9bdfe41d6ef724526629696cb Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 6 Apr 2022 22:28:43 +0800 Subject: csky: patch_text: Fixup last cpu should be master These patch_text implementations are using stop_machine_cpuslocked infrastructure with atomic cpu_count. The original idea: When the master CPU patch_text, the others should wait for it. But current implementation is using the first CPU as master, which couldn't guarantee the remaining CPUs are waiting. 
This patch changes the last CPU as the master to solve the potential risk. Fixes: 33e53ae1ce41 ("csky: Add kprobes supported") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Masami Hiramatsu Cc: --- arch/csky/kernel/probes/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 42920f25e73c..34ba684d5962 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -30,7 +30,7 @@ static int __kprobes patch_text_cb(void *priv) struct csky_insn_patch *param = priv; unsigned int addr = (unsigned int)param->addr; - if (atomic_inc_return(&param->cpu_count) == 1) { + if (atomic_inc_return(&param->cpu_count) == num_online_cpus()) { *(u16 *) addr = cpu_to_le16(param->opcode); dcache_wb_range(addr, addr + 2); atomic_inc(&param->cpu_count); -- cgit v1.2.3 From cfb24463a53edeb388f3563e166ad7f9591dad3d Mon Sep 17 00:00:00 2001 From: Deyan Wang Date: Mon, 14 Feb 2022 20:02:50 +0800 Subject: csky: Fix versioncheck warnings $make versioncheck arch/csky/include/asm/io.h: 8 linux/version.h not needed. arch/csky/kernel/process.c: 5 linux/version.h not needed. arch/csky/mm/dma-mapping.c: 12 linux/version.h not needed. comments from Randy: The patch makes sense but these are not compile warnings. They come from scripts/checkversion.pl, which can be called by 'make versioncheck', so I suppose that something in your build system is running 'make versioncheck'. 
Signed-off-by: Deyan Wang Acked-by: Randy Dunlap Signed-off-by: Guo Ren Signed-off-by: Guo Ren --- arch/csky/include/asm/io.h | 1 - arch/csky/kernel/process.c | 1 - arch/csky/mm/dma-mapping.c | 1 - 3 files changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index f82654053dc0..ed53f0b47388 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -5,7 +5,6 @@ #include #include -#include /* * I/O memory access primitives. Reads are ordered relative to any diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 3d0ca22cd0e2..5de04707aa07 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -2,7 +2,6 @@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #include -#include #include #include #include diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index c3a775a7e8f9..82447029feb4 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -9,7 +9,6 @@ #include #include #include -#include #include static inline void cache_op(phys_addr_t paddr, size_t size, -- cgit v1.2.3 From e4df2d5e852a7d24df3672ae9951eb79e179be08 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Wed, 30 Mar 2022 20:07:14 +0800 Subject: csky: Add C based string functions Try to access RAM with the largest bit width possible, but without doing unaligned accesses. A further improvement could be to use multiple read and writes as the assembly version was trying to do. Tested on a BeagleV Starlight with a SiFive U74 core, where the improvement is noticeable. 
Signed-off-by: Matteo Croce Co-developed-by: Guo Ren Signed-off-by: Guo Ren --- arch/csky/Kconfig | 8 ++ arch/csky/abiv1/Makefile | 2 - arch/csky/abiv1/memcpy.S | 347 --------------------------------------------- arch/csky/abiv1/strksyms.c | 6 - arch/csky/abiv2/Makefile | 2 + arch/csky/abiv2/strksyms.c | 4 +- arch/csky/lib/Makefile | 3 + arch/csky/lib/string.c | 134 +++++++++++++++++ 8 files changed, 150 insertions(+), 356 deletions(-) delete mode 100644 arch/csky/abiv1/memcpy.S delete mode 100644 arch/csky/abiv1/strksyms.c create mode 100644 arch/csky/lib/string.c (limited to 'arch') diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 75ef86605d69..21d72b078eef 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -320,6 +320,14 @@ config HOTPLUG_CPU controlled through /sys/devices/system/cpu/cpu1/hotplug/target. Say N if you want to disable CPU hotplug. + +config HAVE_EFFICIENT_UNALIGNED_STRING_OPS + bool "Enable EFFICIENT_UNALIGNED_STRING_OPS for abiv2" + depends on CPU_CK807 || CPU_CK810 || CPU_CK860 + help + Say Y here to enable EFFICIENT_UNALIGNED_STRING_OPS. Some CPU models could + deal with unaligned access by hardware. + endmenu source "arch/csky/Kconfig.platforms" diff --git a/arch/csky/abiv1/Makefile b/arch/csky/abiv1/Makefile index 601ce3b2fb85..a4b2ade0fc67 100644 --- a/arch/csky/abiv1/Makefile +++ b/arch/csky/abiv1/Makefile @@ -4,5 +4,3 @@ obj-y += bswapdi.o obj-y += bswapsi.o obj-y += cacheflush.o obj-y += mmap.o -obj-y += memcpy.o -obj-y += strksyms.o diff --git a/arch/csky/abiv1/memcpy.S b/arch/csky/abiv1/memcpy.S deleted file mode 100644 index 5078eb5169fa..000000000000 --- a/arch/csky/abiv1/memcpy.S +++ /dev/null @@ -1,347 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
- -#include - -.macro GET_FRONT_BITS rx y -#ifdef __cskyLE__ - lsri \rx, \y -#else - lsli \rx, \y -#endif -.endm - -.macro GET_AFTER_BITS rx y -#ifdef __cskyLE__ - lsli \rx, \y -#else - lsri \rx, \y -#endif -.endm - -/* void *memcpy(void *dest, const void *src, size_t n); */ -ENTRY(memcpy) - mov r7, r2 - cmplti r4, 4 - bt .L_copy_by_byte - mov r6, r2 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_not_aligned - mov r6, r3 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_aligned_but_src_not_aligned -.L0: - cmplti r4, 16 - jbt .L_aligned_and_len_less_16bytes - subi sp, 8 - stw r8, (sp, 0) -.L_aligned_and_len_larger_16bytes: - ldw r1, (r3, 0) - ldw r5, (r3, 4) - ldw r8, (r3, 8) - stw r1, (r7, 0) - ldw r1, (r3, 12) - stw r5, (r7, 4) - stw r8, (r7, 8) - stw r1, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L_aligned_and_len_larger_16bytes - ldw r8, (sp, 0) - addi sp, 8 - cmpnei r4, 0 - jbf .L_return - -.L_aligned_and_len_less_16bytes: - cmplti r4, 4 - bt .L_copy_by_byte -.L1: - ldw r1, (r3, 0) - stw r1, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - jbf .L1 - br .L_copy_by_byte - -.L_return: - rts - -.L_copy_by_byte: /* len less than 4 bytes */ - cmpnei r4, 0 - jbf .L_return -.L4: - ldb r1, (r3, 0) - stb r1, (r7, 0) - addi r3, 1 - addi r7, 1 - decne r4 - jbt .L4 - rts - -/* - * If dest is not aligned, just copying some bytes makes the dest align. - * Afther that, we judge whether the src is aligned. - */ -.L_dest_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - mov r5, r6 -.L5: - ldb r1, (r3, 0) /* makes the dest align. */ - stb r1, (r7, 0) - addi r5, 1 - subi r4, 1 - addi r3, 1 - addi r7, 1 - cmpnei r5, 4 - jbt .L5 - cmplti r4, 4 - jbt .L_copy_by_byte - mov r6, r3 /* judge whether the src is aligned. */ - andi r6, 3 - cmpnei r6, 0 - jbf .L0 - -/* Judge the number of misaligned, 1, 2, 3? 
*/ -.L_dest_aligned_but_src_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - bclri r3, 0 - bclri r3, 1 - ldw r1, (r3, 0) - addi r3, 4 - cmpnei r6, 2 - bf .L_dest_aligned_but_src_not_aligned_2bytes - cmpnei r6, 3 - bf .L_dest_aligned_but_src_not_aligned_3bytes - -.L_dest_aligned_but_src_not_aligned_1byte: - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - cmplti r4, 16 - bf .L11 -.L10: /* If the len is less than 16 bytes */ - GET_FRONT_BITS r1 8 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 24 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte -.L11: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L12: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 8 /* little or big endian? */ - mov r10, r5 - GET_AFTER_BITS r5 24 - or r5, r1 - - GET_FRONT_BITS r10 8 - mov r1, r11 - GET_AFTER_BITS r11 24 - or r11, r10 - - GET_FRONT_BITS r1 8 - mov r10, r8 - GET_AFTER_BITS r8 24 - or r8, r1 - - GET_FRONT_BITS r10 8 - mov r1, r9 - GET_AFTER_BITS r9 24 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L12 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp , 16 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte - -.L_dest_aligned_but_src_not_aligned_2bytes: - cmplti r4, 16 - bf .L21 -.L20: - GET_FRONT_BITS r1 16 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 16 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - rts - -.L21: /* n > 16 */ - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) - -.L22: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - 
- GET_FRONT_BITS r1 16 - mov r10, r5 - GET_AFTER_BITS r5 16 - or r5, r1 - - GET_FRONT_BITS r10 16 - mov r1, r11 - GET_AFTER_BITS r11 16 - or r11, r10 - - GET_FRONT_BITS r1 16 - mov r10, r8 - GET_AFTER_BITS r8 16 - or r8, r1 - - GET_FRONT_BITS r10 16 - mov r1, r9 - GET_AFTER_BITS r9 16 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L22 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - - -.L_dest_aligned_but_src_not_aligned_3bytes: - cmplti r4, 16 - bf .L31 -.L30: - GET_FRONT_BITS r1 24 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 8 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte -.L31: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L32: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 24 - mov r10, r5 - GET_AFTER_BITS r5 8 - or r5, r1 - - GET_FRONT_BITS r10 24 - mov r1, r11 - GET_AFTER_BITS r11 8 - or r11, r10 - - GET_FRONT_BITS r1 24 - mov r10, r8 - GET_AFTER_BITS r8 8 - or r8, r1 - - GET_FRONT_BITS r10 24 - mov r1, r9 - GET_AFTER_BITS r9 8 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L32 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte diff --git a/arch/csky/abiv1/strksyms.c b/arch/csky/abiv1/strksyms.c deleted file mode 100644 index c7ccbb27e8d7..000000000000 --- a/arch/csky/abiv1/strksyms.c +++ /dev/null @@ -1,6 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
- -#include - -EXPORT_SYMBOL(memcpy); diff --git a/arch/csky/abiv2/Makefile b/arch/csky/abiv2/Makefile index c561efa5533c..ea8005fe01a8 100644 --- a/arch/csky/abiv2/Makefile +++ b/arch/csky/abiv2/Makefile @@ -2,9 +2,11 @@ obj-y += cacheflush.o obj-$(CONFIG_CPU_HAS_FPU) += fpu.o obj-y += memcmp.o +ifeq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) obj-y += memcpy.o obj-y += memmove.o obj-y += memset.o +endif obj-y += strcmp.o obj-y += strcpy.o obj-y += strlen.o diff --git a/arch/csky/abiv2/strksyms.c b/arch/csky/abiv2/strksyms.c index 06da723d8202..8d1fd28c6cf9 100644 --- a/arch/csky/abiv2/strksyms.c +++ b/arch/csky/abiv2/strksyms.c @@ -3,10 +3,12 @@ #include +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memmove); +#endif +EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strlen); diff --git a/arch/csky/lib/Makefile b/arch/csky/lib/Makefile index 7fbdbb2c4d12..d0ce6e2d7ab2 100644 --- a/arch/csky/lib/Makefile +++ b/arch/csky/lib/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only lib-y := usercopy.o delay.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) +lib-y += string.o +endif diff --git a/arch/csky/lib/string.c b/arch/csky/lib/string.c new file mode 100644 index 000000000000..d65626fcaeac --- /dev/null +++ b/arch/csky/lib/string.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * String functions optimized for hardware which doesn't + * handle unaligned memory accesses efficiently. 
+ * + * Copyright (C) 2021 Matteo Croce + */ + +#include +#include + +/* Minimum size for a word copy to be convenient */ +#define BYTES_LONG sizeof(long) +#define WORD_MASK (BYTES_LONG - 1) +#define MIN_THRESHOLD (BYTES_LONG * 2) + +/* convenience union to avoid cast between different pointer types */ +union types { + u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +union const_types { + const u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +void *memcpy(void *dest, const void *src, size_t count) +{ + union const_types s = { .as_u8 = src }; + union types d = { .as_u8 = dest }; + int distance = 0; + + if (count < MIN_THRESHOLD) + goto copy_remainder; + + /* Copy a byte at time until destination is aligned. */ + for (; d.as_uptr & WORD_MASK; count--) + *d.as_u8++ = *s.as_u8++; + + distance = s.as_uptr & WORD_MASK; + + if (distance) { + unsigned long last, next; + + /* + * s is distance bytes ahead of d, and d just reached + * the alignment boundary. Move s backward to word align it + * and shift data to compensate for distance, in order to do + * word-by-word copy. + */ + s.as_u8 -= distance; + + next = s.as_ulong[0]; + for (; count >= BYTES_LONG; count -= BYTES_LONG) { + last = next; + next = s.as_ulong[1]; + + d.as_ulong[0] = last >> (distance * 8) | + next << ((BYTES_LONG - distance) * 8); + + d.as_ulong++; + s.as_ulong++; + } + + /* Restore s with the original offset. */ + s.as_u8 += distance; + } else { + /* + * If the source and dest lower bits are the same, do a simple + * 32/64 bit wide copy. + */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *d.as_ulong++ = *s.as_ulong++; + } + +copy_remainder: + while (count--) + *d.as_u8++ = *s.as_u8++; + + return dest; +} +EXPORT_SYMBOL(memcpy); + +/* + * Simply check if the buffer overlaps an call memcpy() in case, + * otherwise do a simple one byte at time backward copy. 
+ */ +void *memmove(void *dest, const void *src, size_t count) +{ + if (dest < src || src + count <= dest) + return memcpy(dest, src, count); + + if (dest > src) { + const char *s = src + count; + char *tmp = dest + count; + + while (count--) + *--tmp = *--s; + } + return dest; +} +EXPORT_SYMBOL(memmove); + +void *memset(void *s, int c, size_t count) +{ + union types dest = { .as_u8 = s }; + + if (count >= MIN_THRESHOLD) { + unsigned long cu = (unsigned long)c; + + /* Compose an ulong with 'c' repeated 4/8 times */ + cu |= cu << 8; + cu |= cu << 16; + /* Suppress warning on 32 bit machines */ + cu |= (cu << 16) << 16; + + for (; count && dest.as_uptr & WORD_MASK; count--) + *dest.as_u8++ = c; + + /* Copy using the largest size allowed */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *dest.as_ulong++ = cu; + } + + /* copy the remainder */ + while (count--) + *dest.as_u8++ = c; + + return s; +} +EXPORT_SYMBOL(memset); -- cgit v1.2.3 From 8318f7c231d5be09e47410c5ab387b9bef6fe19e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 6 Apr 2022 21:32:22 +0800 Subject: csky: optimize memcpy_{from,to}io() and memset_io() Optimize memcpy_{from,to}io() and memset_io() by transferring in 32 bit as much as possible with minimized barrier usage. This simplest optimization brings faster throughput compared to the current byte-by-byte read and write with barrier in the loop. Code's skeleton is taken from the powerpc & arm64. 
Signed-off-by: Guo Ren Signed-off-by: Guo Ren --- arch/csky/include/asm/io.h | 11 ++++++ arch/csky/kernel/Makefile | 2 +- arch/csky/kernel/io.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 arch/csky/kernel/io.c (limited to 'arch') diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index ed53f0b47388..4725bb977b0f 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -31,6 +31,17 @@ #define writel(v,c) ({ wmb(); writel_relaxed((v),(c)); mb(); }) #endif +/* + * String version of I/O memory access operations. + */ +extern void __memcpy_fromio(void *, const volatile void __iomem *, size_t); +extern void __memcpy_toio(volatile void __iomem *, const void *, size_t); +extern void __memset_io(volatile void __iomem *, int, size_t); + +#define memset_io(c,v,l) __memset_io((c),(v),(l)) +#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) +#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) + /* * I/O memory mapping functions. */ diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile index 6c0f36010ed0..4eb41421ca5b 100644 --- a/arch/csky/kernel/Makefile +++ b/arch/csky/kernel/Makefile @@ -2,7 +2,7 @@ extra-y := head.o vmlinux.lds obj-y += entry.o atomic.o signal.o traps.o irq.o time.o vdso.o vdso/ -obj-y += power.o syscall.o syscall_table.o setup.o +obj-y += power.o syscall.o syscall_table.o setup.o io.o obj-y += process.o cpu-probe.o ptrace.o stacktrace.o obj-y += probes/ diff --git a/arch/csky/kernel/io.c b/arch/csky/kernel/io.c new file mode 100644 index 000000000000..5883f13fa2b1 --- /dev/null +++ b/arch/csky/kernel/io.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Copy data from IO memory space to "real" memory space. 
+ */ +void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) +{ + while (count && !IS_ALIGNED((unsigned long)from, 4)) { + *(u8 *)to = __raw_readb(from); + from++; + to++; + count--; + } + + while (count >= 4) { + *(u32 *)to = __raw_readl(from); + from += 4; + to += 4; + count -= 4; + } + + while (count) { + *(u8 *)to = __raw_readb(from); + from++; + to++; + count--; + } +} +EXPORT_SYMBOL(__memcpy_fromio); + +/* + * Copy data from "real" memory space to IO memory space. + */ +void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) +{ + while (count && !IS_ALIGNED((unsigned long)to, 4)) { + __raw_writeb(*(u8 *)from, to); + from++; + to++; + count--; + } + + while (count >= 4) { + __raw_writel(*(u32 *)from, to); + from += 4; + to += 4; + count -= 4; + } + + while (count) { + __raw_writeb(*(u8 *)from, to); + from++; + to++; + count--; + } +} +EXPORT_SYMBOL(__memcpy_toio); + +/* + * "memset" on IO memory space. + */ +void __memset_io(volatile void __iomem *dst, int c, size_t count) +{ + u32 qc = (u8)c; + + qc |= qc << 8; + qc |= qc << 16; + + while (count && !IS_ALIGNED((unsigned long)dst, 4)) { + __raw_writeb(c, dst); + dst++; + count--; + } + + while (count >= 4) { + __raw_writel(qc, dst); + dst += 4; + count -= 4; + } + + while (count) { + __raw_writeb(c, dst); + dst++; + count--; + } +} +EXPORT_SYMBOL(__memset_io); -- cgit v1.2.3 From 186f69b64c80a594337211e8238e44a3863e9d94 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 6 Apr 2022 20:30:13 +0800 Subject: csky: atomic: Optimize cmpxchg with acquire & release Optimize cmpxchg with ASM acquire/release fence ASM instructions instead of previous generic based. Prevent a fence when cmpxchg's first load != old. Comments by Rutland: 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for full barrier semantics") Comments by Boqun: FWIW, you probably need to make sure that a barrier instruction inside an lr/sc loop is a good thing. 
IIUC, the execution time of a barrier instruction is determined by the status of store buffers and invalidate queues (and probably other stuffs), so it may increase the execution time of the lr/sc loop, and make it unlikely to succeed. But this really depends on how the arch executes these instructions. Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3 Signed-off-by: Guo Ren Signed-off-by: Guo Ren Cc: Mark Rutland --- arch/csky/include/asm/barrier.h | 11 ++++--- arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index f4045dd53e17..15de58b10aec 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -37,17 +37,21 @@ * bar.brar * bar.bwaw */ +#define FULL_FENCE ".long 0x842fc000\n" +#define ACQUIRE_FENCE ".long 0x8427c000\n" +#define RELEASE_FENCE ".long 0x842ec000\n" + #define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory") #define __bar_br() asm volatile (".long 0x8424c000\n":::"memory") #define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory") #define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory") #define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory") #define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory") -#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory") -#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory") +#define __bar_brwarw() asm volatile (FULL_FENCE:::"memory") +#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory") #define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory") #define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory") -#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory") +#define __bar_brwaw() asm volatile 
(RELEASE_FENCE:::"memory") #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory") #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory") #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory") @@ -56,7 +60,6 @@ #define __smp_rmb() __bar_brar() #define __smp_wmb() __bar_bwaw() -#define ACQUIRE_FENCE ".long 0x8427c000\n" #define __smp_acquire_fence() __bar_brarw() #define __smp_release_fence() __bar_brwaw() diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h index d1bef11f8dc9..5b8faccd65e4 100644 --- a/arch/csky/include/asm/cmpxchg.h +++ b/arch/csky/include/asm/cmpxchg.h @@ -64,15 +64,71 @@ extern void __bad_xchg(void); #define arch_cmpxchg_relaxed(ptr, o, n) \ (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr)))) -#define arch_cmpxchg(ptr, o, n) \ +#define __cmpxchg_acquire(ptr, old, new, size) \ ({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(new) __new = (new); \ + __typeof__(new) __tmp; \ + __typeof__(old) __old = (old); \ + __typeof__(*(ptr)) __ret; \ + switch (size) { \ + case 4: \ + asm volatile ( \ + "1: ldex.w %0, (%3) \n" \ + " cmpne %0, %4 \n" \ + " bt 2f \n" \ + " mov %1, %2 \n" \ + " stex.w %1, (%3) \n" \ + " bez %1, 1b \n" \ + ACQUIRE_FENCE \ + "2: \n" \ + : "=&r" (__ret), "=&r" (__tmp) \ + : "r" (__new), "r"(__ptr), "r"(__old) \ + :); \ + break; \ + default: \ + __bad_xchg(); \ + } \ + __ret; \ +}) + +#define arch_cmpxchg_acquire(ptr, o, n) \ + (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr)))) + +#define __cmpxchg(ptr, old, new, size) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(new) __new = (new); \ + __typeof__(new) __tmp; \ + __typeof__(old) __old = (old); \ __typeof__(*(ptr)) __ret; \ - __smp_release_fence(); \ - __ret = arch_cmpxchg_relaxed(ptr, o, n); \ - __smp_acquire_fence(); \ + switch (size) { \ + case 4: \ + asm volatile ( \ + RELEASE_FENCE \ + "1: ldex.w %0, (%3) \n" \ + " cmpne %0, %4 \n" \ + " bt 2f \n" \ + " mov %1, %2 \n" \ + " stex.w %1, (%3) \n" \ + " 
bez %1, 1b \n" \ + FULL_FENCE \ + "2: \n" \ + : "=&r" (__ret), "=&r" (__tmp) \ + : "r" (__new), "r"(__ptr), "r"(__old) \ + :); \ + break; \ + default: \ + __bad_xchg(); \ + } \ __ret; \ }) +#define arch_cmpxchg(ptr, o, n) \ + (__cmpxchg((ptr), (o), (n), sizeof(*(ptr)))) + +#define arch_cmpxchg_local(ptr, o, n) \ + (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr)))) #else #include #endif -- cgit v1.2.3 From 6b160e0513e9f0b0c0b7ff4d5b964822d1750ce9 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 6 Apr 2022 20:47:52 +0800 Subject: csky: atomic: Add custom atomic.h implementation The generic atomic.h used cmpxchg to implement the atomic operations, it will cause dual loop to reduce the forward guarantee. The patch implements csky custom atomic operations with ldex/stex instructions for the best performance. Important comment by Rutland: 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for full barrier semantics") Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3 Signed-off-by: Guo Ren Signed-off-by: Guo Ren Cc: Mark Rutland --- arch/csky/include/asm/atomic.h | 142 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 arch/csky/include/asm/atomic.h (limited to 'arch') diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h new file mode 100644 index 000000000000..56c9dc8e91b3 --- /dev/null +++ b/arch/csky/include/asm/atomic.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_CSKY_ATOMIC_H +#define __ASM_CSKY_ATOMIC_H + +#ifdef CONFIG_SMP +#include + +#include +#include + +#define __atomic_acquire_fence() __bar_brarw() + +#define __atomic_release_fence() __bar_brwaw() + +static __always_inline int arch_atomic_read(const atomic_t *v) +{ + return READ_ONCE(v->counter); +} +static __always_inline void arch_atomic_set(atomic_t *v, int i) +{ + WRITE_ONCE(v->counter, i); +} + 
+#define ATOMIC_OP(op) \ +static __always_inline \ +void arch_atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned long tmp; \ + __asm__ __volatile__ ( \ + "1: ldex.w %0, (%2) \n" \ + " " #op " %0, %1 \n" \ + " stex.w %0, (%2) \n" \ + " bez %0, 1b \n" \ + : "=&r" (tmp) \ + : "r" (i), "r" (&v->counter) \ + : "memory"); \ +} + +ATOMIC_OP(add) +ATOMIC_OP(sub) +ATOMIC_OP(and) +ATOMIC_OP( or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + +#define ATOMIC_FETCH_OP(op) \ +static __always_inline \ +int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ +{ \ + register int ret, tmp; \ + __asm__ __volatile__ ( \ + "1: ldex.w %0, (%3) \n" \ + " mov %1, %0 \n" \ + " " #op " %0, %2 \n" \ + " stex.w %0, (%3) \n" \ + " bez %0, 1b \n" \ + : "=&r" (tmp), "=&r" (ret) \ + : "r" (i), "r"(&v->counter) \ + : "memory"); \ + return ret; \ +} + +#define ATOMIC_OP_RETURN(op, c_op) \ +static __always_inline \ +int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \ +{ \ + return arch_atomic_fetch_##op##_relaxed(i, v) c_op i; \ +} + +#define ATOMIC_OPS(op, c_op) \ + ATOMIC_FETCH_OP(op) \ + ATOMIC_OP_RETURN(op, c_op) + +ATOMIC_OPS(add, +) +ATOMIC_OPS(sub, -) + +#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed +#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed + +#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed +#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed + +#undef ATOMIC_OPS +#undef ATOMIC_OP_RETURN + +#define ATOMIC_OPS(op) \ + ATOMIC_FETCH_OP(op) + +ATOMIC_OPS(and) +ATOMIC_OPS( or) +ATOMIC_OPS(xor) + +#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed +#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed +#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed + +#undef ATOMIC_OPS + +#undef ATOMIC_FETCH_OP + +#define ATOMIC_OP() \ +static __always_inline \ +int arch_atomic_xchg_relaxed(atomic_t *v, int n) \ +{ \ + return __xchg_relaxed(n, &(v->counter), 4); \ +} \ +static 
__always_inline \ +int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg_relaxed(&(v->counter), o, n, 4); \ +} \ +static __always_inline \ +int arch_atomic_cmpxchg_acquire(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg_acquire(&(v->counter), o, n, 4); \ +} \ +static __always_inline \ +int arch_atomic_cmpxchg(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg(&(v->counter), o, n, 4); \ +} + +#define ATOMIC_OPS() \ + ATOMIC_OP() + +ATOMIC_OPS() + +#define arch_atomic_xchg_relaxed arch_atomic_xchg_relaxed +#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg_relaxed +#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg_acquire +#define arch_atomic_cmpxchg arch_atomic_cmpxchg + +#undef ATOMIC_OPS +#undef ATOMIC_OP + +#else +#include +#endif + +#endif /* __ASM_CSKY_ATOMIC_H */ -- cgit v1.2.3 From c5acdf12cc24d34ea3f9426472dcb3f5d581b1e5 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 13 Apr 2022 15:27:52 +0800 Subject: csky: atomic: Add conditional atomic operations' optimization Add conditional atomic operations' optimization: - arch_atomic_fetch_add_unless - arch_atomic_inc_unless_negative - arch_atomic_dec_unless_positive - arch_atomic_dec_if_positive Comments by Boqun: FWIW, you probably need to make sure that a barrier instruction inside an lr/sc loop is a good thing. IIUC, the execution time of a barrier instruction is determined by the status of store buffers and invalidate queues (and probably other stuffs), so it may increase the execution time of the lr/sc loop, and make it unlikely to succeed. But this really depends on how the arch executes these instructions. 
Signed-off-by: Guo Ren Signed-off-by: Guo Ren Cc: Boqun Feng --- arch/csky/include/asm/atomic.h | 95 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'arch') diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h index 56c9dc8e91b3..60406ef9c2bb 100644 --- a/arch/csky/include/asm/atomic.h +++ b/arch/csky/include/asm/atomic.h @@ -100,6 +100,101 @@ ATOMIC_OPS(xor) #undef ATOMIC_FETCH_OP +static __always_inline int +arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + int prev, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%3) \n" + " cmpne %0, %4 \n" + " bf 2f \n" + " mov %1, %0 \n" + " add %1, %2 \n" + " stex.w %1, (%3) \n" + " bez %1, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (prev), "=&r" (tmp) + : "r" (a), "r" (&v->counter), "r" (u) + : "memory"); + + return prev; +} +#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless + +static __always_inline bool +arch_atomic_inc_unless_negative(atomic_t *v) +{ + int rc, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " movi %1, 0 \n" + " blz %0, 2f \n" + " movi %1, 1 \n" + " addi %0, 1 \n" + " stex.w %0, (%2) \n" + " bez %0, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (tmp), "=&r" (rc) + : "r" (&v->counter) + : "memory"); + + return tmp ? true : false; + +} +#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative + +static __always_inline bool +arch_atomic_dec_unless_positive(atomic_t *v) +{ + int rc, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " movi %1, 0 \n" + " bhz %0, 2f \n" + " movi %1, 1 \n" + " subi %0, 1 \n" + " stex.w %0, (%2) \n" + " bez %0, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (tmp), "=&r" (rc) + : "r" (&v->counter) + : "memory"); + + return tmp ? 
true : false; +} +#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive + +static __always_inline int +arch_atomic_dec_if_positive(atomic_t *v) +{ + int dec, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " subi %1, %0, 1 \n" + " blz %1, 2f \n" + " stex.w %1, (%2) \n" + " bez %1, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (dec), "=&r" (tmp) + : "r" (&v->counter) + : "memory"); + /* dec is operand %0, the ldex.w destination; dec - 1 is returned both after the store path and after the blz early exit to 2: */ + return dec - 1; +} +#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive + #define ATOMIC_OP() \ static __always_inline \ int arch_atomic_xchg_relaxed(atomic_t *v, int n) \ -- cgit v1.2.3 From 9d975568606631601cc2bb5b62598869838ff0be Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 12 May 2022 12:59:00 +0900 Subject: csky: Remove unused $(dtb-y) from boot/Makefile arch/csky/boot/Makefile does not build DTB, arch/csky/boot/dts/Makefile does. Signed-off-by: Masahiro Yamada Signed-off-by: Guo Ren --- arch/csky/boot/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/boot/Makefile b/arch/csky/boot/Makefile index dbc9b1bd72f0..c3cfde28f8e6 100644 --- a/arch/csky/boot/Makefile +++ b/arch/csky/boot/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only targets := Image zImage uImage -targets += $(dtb-y) $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) -- cgit v1.2.3 From 29b24a76bdea0786a727e08266607c6e3bbfa160 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 12 May 2022 12:59:01 +0900 Subject: csky: Remove unused core-y for dts This line was used for embedding a DT into vmlinux. Since commit c4c14c3bd177 ("csky: remove builtin-dtb Kbuild"), DT for csky is just a separate blob.
It is covered by the generic rule in the top Makefile: ifdef CONFIG_OF_EARLY_FLATTREE all: dtbs endif Signed-off-by: Masahiro Yamada Signed-off-by: Guo Ren --- arch/csky/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/Makefile b/arch/csky/Makefile index 866805077636..4d72aca4069b 100644 --- a/arch/csky/Makefile +++ b/arch/csky/Makefile @@ -69,7 +69,6 @@ libs-y += arch/csky/lib/ \ $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name) boot := arch/csky/boot -core-y += $(boot)/dts/ all: zImage -- cgit v1.2.3 From 64d83f06774668081258bd7f3241267239bb9ab2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 12 May 2022 12:59:02 +0900 Subject: csky: Move $(core-y) into arch/csky/Kbuild Use the standard obj-y form to specify the sub-directories under arch/csky/. Only leave core-y += arch/csky/$(CSKYABI)/ there. Signed-off-by: Masahiro Yamada Signed-off-by: Guo Ren --- arch/csky/Kbuild | 2 ++ arch/csky/Makefile | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/csky/Kbuild b/arch/csky/Kbuild index 4e39f7abdeb6..0621eaea4196 100644 --- a/arch/csky/Kbuild +++ b/arch/csky/Kbuild @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-y += kernel/ mm/ + # for cleaning subdir- += boot diff --git a/arch/csky/Makefile b/arch/csky/Makefile index 4d72aca4069b..4e1d619fd5c6 100644 --- a/arch/csky/Makefile +++ b/arch/csky/Makefile @@ -61,8 +61,6 @@ KBUILD_AFLAGS += $(KBUILD_CFLAGS) head-y := arch/csky/kernel/head.o -core-y += arch/csky/kernel/ -core-y += arch/csky/mm/ core-y += arch/csky/$(CSKYABI)/ libs-y += arch/csky/lib/ \ -- cgit v1.2.3