From 4a6459a8d1dae122d5fb6c40010f91e3afcf4c9c Mon Sep 17 00:00:00 2001 From: riastradh Date: Thu, 21 Apr 2022 12:06:31 +0000 Subject: [PATCH] mips/cavium: Take advantage of Octeon's guaranteed r/rw ordering. --- common/lib/libc/arch/mips/atomic/membar_ops.S | 100 ++++++++++++------ sys/arch/mips/include/asm.h | 21 ++-- 2 files changed, 83 insertions(+), 38 deletions(-) diff --git a/common/lib/libc/arch/mips/atomic/membar_ops.S b/common/lib/libc/arch/mips/atomic/membar_ops.S index 57694df66cc0..d7b444ec1200 100644 --- a/common/lib/libc/arch/mips/atomic/membar_ops.S +++ b/common/lib/libc/arch/mips/atomic/membar_ops.S @@ -1,4 +1,4 @@ -/* $NetBSD: membar_ops.S,v 1.12 2022/04/09 23:32:51 riastradh Exp $ */ +/* $NetBSD: membar_ops.S,v 1.13 2022/04/21 12:06:31 riastradh Exp $ */ /*- * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. @@ -38,44 +38,80 @@ LEAF(_membar_sync) j ra BDSYNC END(_membar_sync) +ATOMIC_OP_ALIAS(membar_sync,_membar_sync) + +STRONG_ALIAS(_membar_enter,_membar_sync) +ATOMIC_OP_ALIAS(membar_enter,_membar_sync) #ifdef __OCTEON__ + +/* + * cnMIPS guarantees load-before-load/store ordering without any + * barriers. So the only barriers we need are store-before-load (sync) + * and store-before-store (syncw, i.e., sync 4). See Table 2-32 + * `Execution Ordering Rules' on p. 104 of Cavium OCTEON III CN78XX + * Hardware Reference Manual, CN78XX-HM-0.99E, September 2014: + * + * First Operation DLD [load instruction to a physical + * address that is L2/DRAM] + * Second Operation Any + * Execution Ordering Comments + * + * The second operation cannot appear to execute before + * the first (DLD) operation, regardless of the presence + * or absence of SYNC* instructions. + * + * Note: I'm not sure if this applies to earlier cnMIPS -- can't find + * it in the Cavium Networks OCTEON Plus CN50XX Hardware Reference + * Manual CN50XX-HM-0.99E, July 2008. Experimentally, on an erlite3 + * (Cavium Octeon CN5020-500), I can easily detect reordering of + * store-before-store and store-before-load, but I haven't been able to + * detect any reordering of load-before-load or load-before-store. + * + * Note: On early cnMIPS (CN3xxx), there is an erratum which sometimes + * requires issuing two syncw's in a row. I don't know the details -- + * don't have documentation -- and in Linux it is only used for I/O + * purposes. + * + * Currently we don't build kernels that work on both Octeon and + * non-Octeon MIPS CPUs, so none of this is done with binary patching. + * For userlands we could use a separate shared library on Octeon with + * ld.so.conf to override the symbols with cheaper definitions, but we + * don't do that now. + */ + +LEAF(_membar_acquire) + j ra + nop +END(_membar_acquire) +ATOMIC_OP_ALIAS(membar_acquire,_membar_acquire) + +STRONG_ALIAS(_membar_consumer,_membar_acquire) +ATOMIC_OP_ALIAS(membar_consumer,_membar_acquire) + LEAF(_membar_release) - /* - * syncw is documented as ordering store-before-store in - * - * Cavium OCTEON III CN78XX Hardware Reference Manual, - * CN78XX-HM-0.99E, September 2014. - * - * It's unclear from the documentation the architecture - * guarantees load-before-store ordering without barriers, but - * this code assumes it does. If that assumption is wrong, we - * can only use syncw for membar_producer -- membar_release has - * to use the full sync. - */ j ra syncw END(_membar_release) -#endif - -ATOMIC_OP_ALIAS(membar_sync,_membar_sync) -ATOMIC_OP_ALIAS(membar_acquire,_membar_sync) -STRONG_ALIAS(_membar_acquire,_membar_sync) -ATOMIC_OP_ALIAS(membar_enter,_membar_sync) -STRONG_ALIAS(_membar_enter,_membar_sync) -#ifdef __OCTEON__ -ATOMIC_OP_ALIAS(membar_exit,_membar_release) -STRONG_ALIAS(_membar_exit,_membar_release) ATOMIC_OP_ALIAS(membar_release,_membar_release) -ATOMIC_OP_ALIAS(membar_producer,_membar_release) + +STRONG_ALIAS(_membar_exit,_membar_release) +ATOMIC_OP_ALIAS(membar_exit,_membar_release) + STRONG_ALIAS(_membar_producer,_membar_release) -#else -ATOMIC_OP_ALIAS(membar_exit,_membar_sync) -STRONG_ALIAS(_membar_exit,_membar_sync) -ATOMIC_OP_ALIAS(membar_release,_membar_sync) +ATOMIC_OP_ALIAS(membar_producer,_membar_release) + +#else /* !__OCTEON__ */ + +STRONG_ALIAS(_membar_acquire,_membar_sync) +ATOMIC_OP_ALIAS(membar_acquire,_membar_sync) STRONG_ALIAS(_membar_release,_membar_sync) -ATOMIC_OP_ALIAS(membar_producer,_membar_sync) -STRONG_ALIAS(_membar_producer,_membar_sync) -#endif -ATOMIC_OP_ALIAS(membar_consumer,_membar_sync) +ATOMIC_OP_ALIAS(membar_release,_membar_sync) +STRONG_ALIAS(_membar_exit,_membar_sync) +ATOMIC_OP_ALIAS(membar_exit,_membar_sync) STRONG_ALIAS(_membar_consumer,_membar_sync) +ATOMIC_OP_ALIAS(membar_consumer,_membar_sync) +STRONG_ALIAS(_membar_producer,_membar_sync) +ATOMIC_OP_ALIAS(membar_producer,_membar_sync) + +#endif diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h index 27a46591f06f..8673c5063e1e 100644 --- a/sys/arch/mips/include/asm.h +++ b/sys/arch/mips/include/asm.h @@ -1,4 +1,4 @@ -/* $NetBSD: asm.h,v 1.70 2022/04/09 14:09:32 riastradh Exp $ */ +/* $NetBSD: asm.h,v 1.71 2022/04/21 12:06:31 riastradh Exp $ */ /* * Copyright (c) 1992, 1993 @@ -574,12 +574,21 @@ _C_LABEL(x): /* compiler define */ #if defined(__OCTEON__) - /* early cnMIPS have erratum which means 2 */ -#define LLSCSYNC sync 4; sync 4 +/* + * See common/lib/libc/arch/mips/atomic/membar_ops.S for notes on + * Octeon memory ordering guarantees and barriers. + * + * cnMIPS also has a quirk where the store buffer can get clogged and + * we need to apply a plunger to it _after_ releasing a lock or else + * other CPUs may spin for hundreds of thousands of cycles before they + * see the lock is released. So we also have the quirky SYNC_PLUNGER + * barrier as syncw. + */ +#define LLSCSYNC /* nothing */ #define BDSYNC sync -#define BDSYNC_ACQ sync -#define SYNC_ACQ sync -#define SYNC_REL sync +#define BDSYNC_ACQ nop +#define SYNC_ACQ /* nothing */ +#define SYNC_REL sync 4 #define BDSYNC_PLUNGER sync 4 #define SYNC_PLUNGER sync 4 #elif __mips >= 3 || !defined(__mips_o32)