Merge branch 'sparc64-queued-locks'

Babu Moger says: ==================== Enable queued rwlock and queued spinlock for SPARC This series of patches enables queued rwlock and queued spinlock support for SPARC. These features were introduced some time ago in upstream. Here are some of the earlier discussions. https://lwn.net/Articles/572765/ https://lwn.net/Articles/582200/ https://lwn.net/Articles/561775/ https://lwn.net/Articles/590243/ Tests: Ran AIM7 benchmark to verify the performance on various workloads. https://github.com/davidlohr/areaim. Same benchmark was used when this feature was introduced and enabled on x86. Here are the test results. Kernel 4.11.0-rc6 4.11.0-rc6 + Change baseline queued locks (Avg No.of jobs) (Avg No.of jobs) Workload High systime 10-100 user 17290.48 17295.18 +0.02 High systime 200-1000 users 109814.95 110248.87 +0.39 High systime 1200-2000 users 107912.40 127923.16 +18.54 Disk IO 10-100 users 168910.16 158834.17 -5.96 Disk IO 200-1000 users 242781.74 281285.80 +15.85 Disk IO 1200-2000 users 228518.23 218421.23 -4.41 Disk IO 10-100 users 183933.77 207928.67 +13.04 Disk IO 200-1000 users 491981.56 500162.33 +1.66 Disk IO 1200-2000 users 463395.66 467312.70 +0.84 fserver 10-100 users 254177.53 270283.08 +6.33 fserver IO 200-1000 users 269017.35 324812.2 +20.74 fserver IO 1200-2000 users 229538.87 284713.77 +24.03 Disk I/O results are little bit in negative territory. But majority of the performance changes are in positive and it is significant in some cases. Changes: v3 -> v4: 1. Took care of Geert Uytterhoeven's comment about patch #3(def_bool y) 2. Working on separate patch sets to define CPU_BIG_ENDIAN for all the default big endian architectures based on feedback from Geert and Arnd. v2 -> v3: 1. Rebased the patches on top of 4.12-rc2. 2. Re-ordered the patch #1 and patch #2. That is the same order I have seen the issues. So, it should be addressed in the same order. Patch #1 removes the check __LINUX_SPINLOCK_TYPES_H. Patch #2 addreses the compile error with qrwlock.c. This addresses the comments from Dave Miller on v2. v1 -> v2: Addressed the comments from David Miller. 1. Added CPU_BIG_ENDIAN for all SPARC 2. Removed #ifndef __LINUX_SPINLOCK_TYPES_H guard from spinlock_types.h 3. Removed check for CONFIG_QUEUED_RWLOCKS in SPARC64 as it is the default definition for SPARC64 now. Cleaned-up the previous arch_read_xxx and arch_write_xxx definitions as it is defined now in qrwlock.h. 4. Removed check for CONFIG_QUEUED_SPINLOCKS in SPARC64 as it is the default definition now for SPARC64 now. Cleaned-up the previous arch_spin_xxx definitions as it is defined in qspinlock.h. v1: Initial version ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller 2017-05-25 21:07:08 +0200
committer: David S. Miller 2017-05-25 21:07:08 +0200
commit: 60925ee97e2be4993fb7a2f7e70be0fbce08cf0f (patch)
tree: 8c2db73ea429d948a46c6b8577ce6c40a3fd18bd
parent: Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/... (diff)
parent: arch/sparc: Enable queued spinlock support for SPARC (diff)
download: kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.tar.gz
kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.tar.xz
kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.zip
7 files changed, 97 insertions, 219 deletions
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 58243b0d21c0..78af684f63b9 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -83,6 +83,8 @@ config SPARC64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select HAVE_NMI
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 
 config ARCH_DEFCONFIG
 	string
@@ -92,6 +94,9 @@ config ARCH_DEFCONFIG
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
 
+config CPU_BIG_ENDIAN
+	def_bool y
+
 config ARCH_ATU
 	bool
 	default y if SPARC64
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index faa2f61058c2..4028f4f1e561 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -6,6 +6,17 @@
 #ifndef __ARCH_SPARC64_CMPXCHG__
 #define __ARCH_SPARC64_CMPXCHG__
 
+static inline unsigned long
+__cmpxchg_u32(volatile int *m, int old, int new)
+{
+	__asm__ __volatile__("cas [%2], %3, %0"
+			     : "=&r" (new)
+			     : "0" (new), "r" (m), "r" (old)
+			     : "memory");
+
+	return new;
+}
+
 static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
 {
 	unsigned long tmp1, tmp2;
@@ -44,10 +55,38 @@ static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long
 
 void __xchg_called_with_bad_pointer(void);
 
+/*
+ * Use 4 byte cas instruction to achieve 2 byte xchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order.
+ */
+static inline unsigned long
+xchg16(__volatile__ unsigned short *m, unsigned short val)
+{
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 2) ^ 2) << 3;
+	unsigned int mask = 0xffff << bit_shift;
+	unsigned int *ptr = (unsigned int  *) (maddr & ~2);
+	unsigned int old32, new32, load32;
+
+	/* Read the old value */
+	load32 = *ptr;
+
+	do {
+		old32 = load32;
+		new32 = (load32 & (~mask)) | val << bit_shift;
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+	} while (load32 != old32);
+
+	return (load32 & mask) >> bit_shift;
+}
+
 static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 				       int size)
 {
 	switch (size) {
+	case 2:
+		return xchg16(ptr, x);
 	case 4:
 		return xchg32(ptr, x);
 	case 8:
@@ -65,10 +104,11 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
+
 static inline unsigned long
-__cmpxchg_u32(volatile int *m, int old, int new)
+__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
 {
-	__asm__ __volatile__("cas [%2], %3, %0"
+	__asm__ __volatile__("casx [%2], %3, %0"
 			     : "=&r" (new)
 			     : "0" (new), "r" (m), "r" (old)
 			     : "memory");
@@ -76,15 +116,31 @@ __cmpxchg_u32(volatile int *m, int old, int new)
 	return new;
 }
 
+/*
+ * Use 4 byte cas instruction to achieve 1 byte cmpxchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order
+ */
 static inline unsigned long
-__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
+__cmpxchg_u8(volatile unsigned char *m, unsigned char old, unsigned char new)
 {
-	__asm__ __volatile__("casx [%2], %3, %0"
-			     : "=&r" (new)
-			     : "0" (new), "r" (m), "r" (old)
-			     : "memory");
-
-	return new;
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 3) ^ 3) << 3;
+	unsigned int mask = 0xff << bit_shift;
+	unsigned int *ptr = (unsigned int *) (maddr & ~3);
+	unsigned int old32, new32, load;
+	unsigned int load32 = *ptr;
+
+	do {
+		new32 = (load32 & ~mask) | (new << bit_shift);
+		old32 = (load32 & ~mask) | (old << bit_shift);
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+		if (load32 == old32)
+			return old;
+		load = (load32 & mask) >> bit_shift;
+	} while (load == old);
+
+	return load;
 }
 
 /* This function doesn't exist, so you'll get a linker error
@@ -95,6 +151,8 @@ static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
 {
 	switch (size) {
+		case 1:
+			return __cmpxchg_u8(ptr, old, new);
 		case 4:
 			return __cmpxchg_u32(ptr, old, new);
 		case 8:
diff --git a/arch/sparc/include/asm/qrwlock.h b/arch/sparc/include/asm/qrwlock.h
new file mode 100644
index 000000000000..d68a4b102100
--- /dev/null
+++ b/arch/sparc/include/asm/qrwlock.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QRWLOCK_H
+#define _ASM_SPARC_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_SPARC_QRWLOCK_H */
diff --git a/arch/sparc/include/asm/qspinlock.h b/arch/sparc/include/asm/qspinlock.h
new file mode 100644
index 000000000000..5ae9a2802846
--- /dev/null
+++ b/arch/sparc/include/asm/qspinlock.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QSPINLOCK_H
+#define _ASM_SPARC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_SPARC_QSPINLOCK_H */
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index 07c9f2e9bf57..f7028f5e1a5a 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -10,216 +10,12 @@
 
 #include <asm/processor.h>
 #include <asm/barrier.h>
-
-/* To get debugging spinlocks which detect and catch
- * deadlock situations, set CONFIG_DEBUG_SPINLOCK
- * and rebuild your kernel.
- */
-
-/* Because we play games to save cycles in the non-contention case, we
- * need to be extra careful about branch targets into the "spinning"
- * code.  They live in their own section, but the newer V9 branches
- * have a shorter range than the traditional 32-bit sparc branch
- * variants.  The rule is that the branches that go into and out of
- * the spinner sections must be pre-V9 branches.
- */
-
-#define arch_spin_is_locked(lp)	((lp)->lock != 0)
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	unsigned long tmp;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%1], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldub		[%1], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 1b\n"
-"	.previous"
-	: "=&r" (tmp)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	unsigned long result;
-
-	__asm__ __volatile__(
-"	ldstub		[%1], %0\n"
-	: "=r" (result)
-	: "r" (lock)
-	: "memory");
-
-	return (result == 0UL);
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stb		%%g0, [%0]"
-	: /* No outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	rdpr		%%pil, %1\n"
-"	wrpr		%3, %%pil\n"
-"3:	ldub		[%2], %0\n"
-"	brnz,pt		%0, 3b\n"
-"	 nop\n"
-"	ba,pt		%%xcc, 1b\n"
-"	 wrpr		%1, %%pil\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r"(lock), "r"(flags)
-	: "memory");
-}
-
-/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,pn		%0, 2f\n"
-"4:	 add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldsw		[%2], %0\n"
-"	brlz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	int tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,a,pn	%0, 2f\n"
-"	 mov		0, %0\n"
-"	add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 mov		1, %0\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-
-	return tmp1;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	lduw	[%2], %0\n"
-"	sub	%0, 1, %1\n"
-"	cas	[%2], %0, %1\n"
-"	cmp	%0, %1\n"
-"	bne,pn	%%xcc, 1b\n"
-"	 nop"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"1:	lduw		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"4:	 or		%0, %3, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	lduw		[%2], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock), "r" (mask)
-	: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stw		%%g0, [%0]"
-	: /* no outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2, result;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"	mov		0, %2\n"
-"1:	lduw		[%3], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 or		%0, %4, %1\n"
-"	cas		[%3], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	mov		1, %2\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
-	: "r" (lock), "r" (mask)
-	: "memory");
-
-	return result;
-}
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 #define arch_read_lock_flags(p, f) arch_read_lock(p)
 #define arch_write_lock_flags(p, f) arch_write_lock(p)
 
-#define arch_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
-#define arch_write_can_lock(rw)	(!(rw)->lock)
-
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
diff --git a/arch/sparc/include/asm/spinlock_types.h b/arch/sparc/include/asm/spinlock_types.h
index 9c454fdeaad8..bce8ef44dfa9 100644
--- a/arch/sparc/include/asm/spinlock_types.h
+++ b/arch/sparc/include/asm/spinlock_types.h
@@ -1,20 +1,24 @@
 #ifndef __SPARC_SPINLOCK_TYPES_H
 #define __SPARC_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 
 typedef struct {
 	volatile unsigned char lock;
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
+#ifdef CONFIG_QUEUED_RWLOCKS
+#include <asm-generic/qrwlock_types.h>
+#else
 typedef struct {
 	volatile unsigned int lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
-
+#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
 /*
author	David S. Miller	2017-05-25 21:07:08 +0200
committer	David S. Miller	2017-05-25 21:07:08 +0200
commit	60925ee97e2be4993fb7a2f7e70be0fbce08cf0f (patch)
tree	8c2db73ea429d948a46c6b8577ce6c40a3fd18bd
parent	Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/... (diff)
parent	arch/sparc: Enable queued spinlock support for SPARC (diff)
download	kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.tar.gz kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.tar.xz kernel-qcow2-linux-60925ee97e2be4993fb7a2f7e70be0fbce08cf0f.zip