From 82985258390e85289940d3663344197344e071f2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 7 Apr 2017 17:20:01 -0400 Subject: kill strlen_user() no callers, no consistent semantics, no sane way to use it... Signed-off-by: Al Viro --- arch/ia64/include/asm/uaccess.h | 12 --- arch/ia64/lib/Makefile | 2 +- arch/ia64/lib/strlen_user.S | 200 ---------------------------------------- 3 files changed, 1 insertion(+), 213 deletions(-) delete mode 100644 arch/ia64/lib/strlen_user.S (limited to 'arch/ia64') diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 82a7646c4416..b2106b01e84f 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -277,18 +277,6 @@ extern long __must_check __strncpy_from_user (char *to, const char __user *from, __sfu_ret; \ }) -/* Returns: 0 if bad, string length+1 (memory size) of string if ok */ -extern unsigned long __strlen_user (const char __user *); - -#define strlen_user(str) \ -({ \ - const char __user *__su_str = (str); \ - unsigned long __su_ret = 0; \ - if (__access_ok(__su_str, 0)) \ - __su_ret = __strlen_user(__su_str); \ - __su_ret; \ -}) - /* * Returns: 0 if exception before NUL or reaching the supplied limit * (N), a value greater than N if the limit would be exceeded, else diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile index 0a40b14407b1..1a36a3a39624 100644 --- a/arch/ia64/lib/Makefile +++ b/arch/ia64/lib/Makefile @@ -5,7 +5,7 @@ lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ checksum.o clear_page.o csum_partial_copy.o \ - clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ + clear_user.o strncpy_from_user.o strnlen_user.o \ flush.o ip_fast_csum.o do_csum.o \ memset.o strlen.o xor.o diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S deleted file mode 100644 index 9d257684e733..000000000000 --- a/arch/ia64/lib/strlen_user.S +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Optimized version of the strlen_user() function - * - * Inputs: - * in0 address of buffer - * - * Outputs: - * ret0 0 in case of fault, strlen(buffer)+1 otherwise - * - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co - * David Mosberger-Tang - * Stephane Eranian - * - * 01/19/99 S.Eranian heavily enhanced version (see details below) - * 09/24/99 S.Eranian added speculation recovery code - */ - -#include -#include - -// -// int strlen_user(char *) -// ------------------------ -// Returns: -// - length of string + 1 -// - 0 in case an exception is raised -// -// This is an enhanced version of the basic strlen_user. it includes a -// combination of compute zero index (czx), parallel comparisons, speculative -// loads and loop unroll using rotating registers. -// -// General Ideas about the algorithm: -// The goal is to look at the string in chunks of 8 bytes. -// so we need to do a few extra checks at the beginning because the -// string may not be 8-byte aligned. In this case we load the 8byte -// quantity which includes the start of the string and mask the unused -// bytes with 0xff to avoid confusing czx. -// We use speculative loads and software pipelining to hide memory -// latency and do read ahead safely. This way we defer any exception. -// -// Because we don't want the kernel to be relying on particular -// settings of the DCR register, we provide recovery code in case -// speculation fails. The recovery code is going to "redo" the work using -// only normal loads. If we still get a fault then we return an -// error (ret0=0). Otherwise we return the strlen+1 as usual. -// The fact that speculation may fail can be caused, for instance, by -// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., -// a NaT bit will be set if the translation is not present. The normal -// load, on the other hand, will cause the translation to be inserted -// if the mapping exists. -// -// It should be noted that we execute recovery code only when we need -// to use the data that has been speculatively loaded: we don't execute -// recovery code on pure read ahead data. -// -// Remarks: -// - the cmp r0,r0 is used as a fast way to initialize a predicate -// register to 1. This is required to make sure that we get the parallel -// compare correct. -// -// - we don't use the epilogue counter to exit the loop but we need to set -// it to zero beforehand. -// -// - after the loop we must test for Nat values because neither the -// czx nor cmp instruction raise a NaT consumption fault. We must be -// careful not to look too far for a Nat for which we don't care. -// For instance we don't need to look at a NaT in val2 if the zero byte -// was in val1. -// -// - Clearly performance tuning is required. -// - -#define saved_pfs r11 -#define tmp r10 -#define base r16 -#define orig r17 -#define saved_pr r18 -#define src r19 -#define mask r20 -#define val r21 -#define val1 r22 -#define val2 r23 - -GLOBAL_ENTRY(__strlen_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,11,0,0,8 - - .rotr v[2], w[2] // declares our 4 aliases - - extr.u tmp=in0,0,3 // tmp=least significant 3 bits - mov orig=in0 // keep trackof initial byte address - dep src=0,in0,0,3 // src=8byte-aligned in0 address - .save pr, saved_pr - mov saved_pr=pr // preserve predicates (rotation) - ;; - - .body - - ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) - shl tmp=tmp,3 // multiply by 8bits/byte - mov mask=-1 // our mask - ;; - ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline - cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) - sub tmp=64,tmp // how many bits to shift our mask on the right - ;; - shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part - mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) - ;; - add base=-16,src // keep track of aligned base - chk.s v[1], .recover // if already NaT, then directly skip to recover - or v[1]=v[1],mask // now we have a safe initial byte pattern - ;; -1: - ld8.s v[0]=[src],8 // speculatively load next - czx1.r val1=v[1] // search 0 byte from right - czx1.r val2=w[1] // search 0 byte from right following 8bytes - ;; - ld8.s w[0]=[src],8 // speculatively load next to next - cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 - cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 -(p6) br.wtop.dptk.few 1b // loop until p6 == 0 - ;; - // - // We must return try the recovery code iff - // val1_is_nat || (val1==8 && val2_is_nat) - // - // XXX Fixme - // - there must be a better way of doing the test - // - cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) - tnat.nz p6,p7=val1 // test NaT on val1 -(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT - ;; - // - // if we come here p7 is true, i.e., initialized for // cmp - // - cmp.eq.and p7,p0=8,val1// val1==8? - tnat.nz.and p7,p0=val2 // test NaT if val2 -(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT - ;; -(p8) mov val1=val2 // val2 contains the value -(p8) adds src=-16,src // correct position when 3 ahead -(p9) adds src=-24,src // correct position when 4 ahead - ;; - sub ret0=src,orig // distance from origin - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // length=now - back -1 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of normal execution - - // - // Outlined recovery code when speculation failed - // - // This time we don't use speculation and rely on the normal exception - // mechanism. that's why the loop is not as good as the previous one - // because read ahead is not possible - // - // XXX Fixme - // - today we restart from the beginning of the string instead - // of trying to continue where we left off. - // -.recover: - EX(.Lexit1, ld8 val=[base],8) // load the initial bytes - ;; - or val=val,mask // remask first bytes - cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop - ;; - // - // ar.ec is still zero here - // -2: - EX(.Lexit1, (p6) ld8 val=[base],8) - ;; - czx1.r val1=val // search 0 byte from right - ;; - cmp.eq p6,p0=8,val1 // val1==8 ? -(p6) br.wtop.dptk.few 2b // loop until p6 == 0 - ;; - sub ret0=base,orig // distance from base - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // length=now - back -1 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of successful recovery code - - // - // We failed even on the normal load (called from exception handler) - // -.Lexit1: - mov ret0=0 - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp -END(__strlen_user) -EXPORT_SYMBOL(__strlen_user) -- cgit v1.2.3-55-g7522 From 2a76213072c8d6ac4364455e236d52b7d36601ca Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 14 May 2017 11:54:11 -0300 Subject: ia64, scsi: update references for the device-io book The book is now at Documentation/driver-api/device-io.rst. Update such references. Signed-off-by: Mauro Carvalho Chehab --- arch/ia64/include/asm/io.h | 2 +- arch/ia64/sn/kernel/iomv.c | 2 +- drivers/scsi/qla1280.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 5de673ac9cb1..a2540e21f919 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -117,7 +117,7 @@ extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count); * following the barrier will arrive after all previous writes. For most * ia64 platforms, this is a simple 'mf.a' instruction. * - * See Documentation/DocBook/deviceiobook.tmpl for more information. + * See Documentation/driver-api/device-io.rst for more information. */ static inline void ___ia64_mmiowb(void) { diff --git a/arch/ia64/sn/kernel/iomv.c b/arch/ia64/sn/kernel/iomv.c index c77ebdf98119..2b22a71663c1 100644 --- a/arch/ia64/sn/kernel/iomv.c +++ b/arch/ia64/sn/kernel/iomv.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL(sn_io_addr); /** * __sn_mmiowb - I/O space memory barrier * - * See arch/ia64/include/asm/io.h and Documentation/DocBook/deviceiobook.tmpl + * See arch/ia64/include/asm/io.h and Documentation/driver-api/device-io.rst * for details. * * On SN2, we wait for the PIO_WRITE_STATUS SHub register to clear. diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 634254a52301..8a29fb09db14 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -3390,7 +3390,7 @@ qla1280_isp_cmd(struct scsi_qla_host *ha) * On PCI bus, order reverses and write of 6 posts, then index 5, * causing chip to issue full queue of stale commands * The mmiowb() prevents future writes from crossing the barrier. - * See Documentation/DocBook/deviceiobook.tmpl for more information. + * See Documentation/driver-api/device-io.rst for more information. */ WRT_REG_WORD(®->mailbox4, ha->req_ring_index); mmiowb(); -- cgit v1.2.3-55-g7522 From 1c4f676a68a502e7bef7d0e49952b042d00aa496 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Sun, 21 May 2017 23:13:37 -0400 Subject: net: Define SCM_TIMESTAMPING_PKTINFO on all architectures. A definition was only provided for asm-generic/socket.h using platforms, define it for the others as well Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ 11 files changed, 22 insertions(+) (limited to 'arch/ia64') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 148d7a32754e..0926de63a62b 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -105,4 +105,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 1ccf45657472..e491ff08b9a9 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -98,5 +98,7 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 2c3f4b48042a..869372413333 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -107,4 +107,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index ae6548d29a18..5d97890a8704 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -98,4 +98,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 3418ec9c1c50..365ff51f033a 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -116,4 +116,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 4526e92301a6..d013c0da0256 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -98,4 +98,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 514701840bd9..784b871592f2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -97,4 +97,6 @@ #define SO_COOKIE 0x4032 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 58e2ec0310fc..bc4ca72faf99 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -105,4 +105,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index e8e5ecf673fd..fb9769d7e74e 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -104,4 +104,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 3f4ad19d9ec7..5d673302fd41 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -94,6 +94,8 @@ #define SO_COOKIE 0x003b +#define SCM_TIMESTAMPING_PKTINFO 0x003c + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 1eb6d2fe70d3..982c2533f912 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* _XTENSA_SOCKET_H */ -- cgit v1.2.3-55-g7522 From 8535796579fd08f226878212b39b2682064d41b7 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 6 Jun 2017 12:54:36 +0200 Subject: tty: simserial: drop unused alt_speed handling This driver was setting the deprecated and broken alt_speed based on port flags, but never provided a means to change the flags or to actually change the speed. Signed-off-by: Johan Hovold Reviewed-by: Andy Shevchenko Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/ia64/hp/sim/simserial.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index de8cba121013..70d52e9bb575 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -387,19 +387,6 @@ static int activate(struct tty_port *port, struct tty_struct *tty) } state->xmit.head = state->xmit.tail = 0; - - /* - * Set up the tty->alt_speed kludge - */ - if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) - tty->alt_speed = 57600; - if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) - tty->alt_speed = 115200; - if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) - tty->alt_speed = 230400; - if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) - tty->alt_speed = 460800; - errout: local_irq_restore(flags); return retval; -- cgit v1.2.3-55-g7522 From 75fbede1cf8415d23fe1cb59cd70faa4d08813ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 May 2017 12:28:08 +0200 Subject: ia64: remove DMA_ERROR_CODE All ia64 dma_mapping_ops instances already have a mapping_error member. Signed-off-by: Christoph Hellwig --- arch/ia64/include/asm/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 73ec3c6f4cfe..3ce5ab4339f3 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -12,8 +12,6 @@ #define ARCH_HAS_DMA_GET_REQUIRED_MASK -#define DMA_ERROR_CODE 0 - extern const struct dma_map_ops *dma_ops; extern struct ia64_machine_vector ia64_mv; extern void set_iommu_machvec(void); -- cgit v1.2.3-55-g7522 From 28b5ba2aa0f55d80adb2624564ed2b170c19519e Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Wed, 21 Jun 2017 10:47:15 +0200 Subject: net: introduce SO_PEERGROUPS getsockopt This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to retrieve the auxiliary groups of the remote peer. It is designed to naturally extend SO_PEERCRED. That is, the underlying data is from the same credentials. Regarding its syntax, it is based on SO_PEERSEC. That is, if the provided buffer is too small, ERANGE is returned and @optlen is updated. Otherwise, the information is copied, @optlen is set to the actual size, and 0 is returned. While SO_PEERCRED (and thus `struct ucred') already returns the primary group, it lacks the auxiliary group vector. However, nearly all access controls (including kernel side VFS and SYSVIPC, but also user-space polkit, DBus, ...) consider the entire set of groups, rather than just the primary group. But this is currently not possible with pure SO_PEERCRED. Instead, user-space has to work around this and query the system database for the auxiliary groups of a UID retrieved via SO_PEERCRED. Unfortunately, there is no race-free way to query the auxiliary groups of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space solution is to use getgrouplist(3p), which itself falls back to NSS and whatever is configured in nsswitch.conf(3). This effectively checks which groups we *would* assign to the user if it logged in *now*. On normal systems it is as easy as reading /etc/group, but with NSS it can resort to quering network databases (eg., LDAP), using IPC or network communication. Long story short: Whenever we want to use auxiliary groups for access checks on IPC, we need further IPC to talk to the user/group databases, rather than just relying on SO_PEERCRED and the incoming socket. This is unfortunate, and might even result in dead-locks if the database query uses the same IPC as the original request. So far, those recursions / dead-locks have been avoided by using primitive IPC for all crucial NSS modules. However, we want to avoid re-inventing the wheel for each NSS module that might be involved in user/group queries. Hence, we would preferably make DBus (and other IPC that supports access-management based on groups) work without resorting to the user/group database. This new SO_PEERGROUPS ioctl would allow us to make dbus-daemon work without ever calling into NSS. Cc: Michal Sekletar Cc: Simon McVittie Reviewed-by: Tom Gundersen Signed-off-by: David Herrmann Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 33 +++++++++++++++++++++++++++++++++ 12 files changed, 55 insertions(+) (limited to 'arch/ia64') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 0926de63a62b..7b285dd4fe05 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -107,4 +107,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index e491ff08b9a9..f1e3b20dce9f 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -100,5 +100,7 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 869372413333..5dd5c5d0d642 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 5d97890a8704..f8f7b47e247f 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -100,4 +100,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 365ff51f033a..882823bec153 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -118,4 +118,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index d013c0da0256..c710db354ff2 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -100,4 +100,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index b893ca14fade..a0d4dc9f4eb2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -99,4 +99,6 @@ #define SCM_TIMESTAMPING_PKTINFO 0x4033 +#define SO_PEERGROUPS 0x4034 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index fb9769d7e74e..52a63f4175cb 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -106,4 +106,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 5d673302fd41..186fd8199f54 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -96,6 +96,8 @@ #define SCM_TIMESTAMPING_PKTINFO 0x003c +#define SO_PEERGROUPS 0x003d + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 982c2533f912..3eed2761c149 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -111,4 +111,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index a5f6e819fafd..9861be8da65e 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -102,4 +102,6 @@ #define SCM_TIMESTAMPING_PKTINFO 58 +#define SO_PEERGROUPS 59 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ad8a4bc84126..6f4b090241c1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1078,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } +static int groups_to_user(gid_t __user *dst, const struct group_info *src) +{ + struct user_namespace *user_ns = current_user_ns(); + int i; + + for (i = 0; i < src->ngroups; i++) + if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + return -EFAULT; + + return 0; +} + int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -1231,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; } + case SO_PEERGROUPS: + { + int ret, n; + + if (!sk->sk_peer_cred) + return -ENODATA; + + n = sk->sk_peer_cred->group_info->ngroups; + if (len < n * sizeof(gid_t)) { + len = n * sizeof(gid_t); + return put_user(len, optlen) ? -EFAULT : -ERANGE; + } + len = n * sizeof(gid_t); + + ret = groups_to_user((gid_t __user *)optval, + sk->sk_peer_cred->group_info); + if (ret) + return ret; + goto lenout; + } + case SO_PEERNAME: { char address[128]; -- cgit v1.2.3-55-g7522 From df91b0262e2cff23db5eac77126ea0bef06d54d2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 30 Jun 2017 00:52:06 +0900 Subject: ia64: remove unneeded extra-y in Makefile.gate All the files listed in "extra-y" are generated according to the dependency. They are still needed in "targets" to include .*.cmd for incremental building. Signed-off-by: Masahiro Yamada --- arch/ia64/kernel/Makefile.gate | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate index a32903ada016..92693c1d65f4 100644 --- a/arch/ia64/kernel/Makefile.gate +++ b/arch/ia64/kernel/Makefile.gate @@ -1,8 +1,6 @@ # The gate DSO image is built using a special linker script. -targets += gate.so gate-syms.o - -extra-y += gate.so gate-syms.o gate.lds gate.o +targets += gate.so gate-syms.o gate.lds gate.o CPPFLAGS_gate.lds := -P -C -U$(ARCH) -- cgit v1.2.3-55-g7522 From 26160371359ea91d8e2bdad6245f39e7ed0c17a3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 23 Jun 2017 23:44:15 +1000 Subject: ia64: thin archives fix linking The VDSO symbols can't be linked into built-in.o when building with thin archives, so change this to linking a new object file that is included into the built-in.o. Cc: Tony Luck Cc: Fenghua Yu Cc: linux-ia64@vger.kernel.org Signed-off-by: Nicholas Piggin Signed-off-by: Masahiro Yamada --- arch/ia64/kernel/Makefile.gate | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate index 92693c1d65f4..7da7c65a9bb1 100644 --- a/arch/ia64/kernel/Makefile.gate +++ b/arch/ia64/kernel/Makefile.gate @@ -1,6 +1,8 @@ # The gate DSO image is built using a special linker script. -targets += gate.so gate-syms.o gate.lds gate.o +targets += gate.so gate.lds gate.o gate-dummy.o + +obj-y += gate-syms.o CPPFLAGS_gate.lds := -P -C -U$(ARCH) @@ -12,13 +14,14 @@ GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ $(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE $(call if_changed,gate) -$(obj)/built-in.o: $(obj)/gate-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o - -GATECFLAGS_gate-syms.o = -r -$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE +GATECFLAGS_gate-dummy.o = -r +$(obj)/gate-dummy.o: $(obj)/gate.lds $(obj)/gate.o FORCE $(call if_changed,gate) +LDFLAGS_gate-syms.o := -r -R +$(obj)/gate-syms.o: $(obj)/gate-dummy.o FORCE + $(call if_changed,ld) + # gate-data.o contains the gate DSO image as data in section .data..gate. # We must build gate.so before we can assemble it. # Note: kbuild does not track this dependency due to usage of .incbin -- cgit v1.2.3-55-g7522 From 3170d8d226c2053355f3946b4b5ded4c006fe6d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 2 May 2017 20:06:33 -0400 Subject: kill {__,}{get,put}_user_unaligned() no users left Signed-off-by: Al Viro --- arch/arm/include/asm/uaccess.h | 7 - arch/arm64/include/asm/uaccess.h | 4 - arch/ia64/include/asm/uaccess.h | 36 ----- arch/m68k/include/asm/uaccess.h | 7 - arch/mips/include/asm/uaccess.h | 277 -------------------------------- arch/parisc/include/asm/uaccess.h | 1 - arch/powerpc/include/asm/uaccess.h | 3 - arch/s390/include/asm/uaccess.h | 3 - arch/sparc/include/asm/uaccess_64.h | 1 - arch/tile/include/asm/uaccess.h | 1 - arch/x86/include/asm/uaccess.h | 3 - include/asm-generic/uaccess-unaligned.h | 26 --- 12 files changed, 369 deletions(-) delete mode 100644 include/asm-generic/uaccess-unaligned.h (limited to 'arch/ia64') diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 2577405d082d..0726091a8964 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -17,13 +17,6 @@ #include #include -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -#include -#else -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user -#endif - #include /* diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 7b8a04789cef..f2b465e129de 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -254,8 +254,6 @@ do { \ (void)0; \ }) -#define __get_user_unaligned __get_user - #define get_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ @@ -320,8 +318,6 @@ do { \ (void)0; \ }) -#define __put_user_unaligned __put_user - #define put_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 82a7646c4416..a217bcfe6700 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -87,42 +87,6 @@ static inline int __access_ok(const void __user *p, unsigned long size) #define __put_user(x, ptr) __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr))) #define __get_user(x, ptr) __get_user_nocheck((x), (ptr), sizeof(*(ptr))) -extern long __put_user_unaligned_unknown (void); - -#define __put_user_unaligned(x, ptr) \ -({ \ - long __ret; \ - switch (sizeof(*(ptr))) { \ - case 1: __ret = __put_user((x), (ptr)); break; \ - case 2: __ret = (__put_user((x), (u8 __user *)(ptr))) \ - | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \ - case 4: __ret = (__put_user((x), (u16 __user *)(ptr))) \ - | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \ - case 8: __ret = (__put_user((x), (u32 __user *)(ptr))) \ - | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \ - default: __ret = __put_user_unaligned_unknown(); \ - } \ - __ret; \ -}) - -extern long __get_user_unaligned_unknown (void); - -#define __get_user_unaligned(x, ptr) \ -({ \ - long __ret; \ - switch (sizeof(*(ptr))) { \ - case 1: __ret = __get_user((x), (ptr)); break; \ - case 2: __ret = (__get_user((x), (u8 __user *)(ptr))) \ - | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \ - case 4: __ret = (__get_user((x), (u16 __user *)(ptr))) \ - | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \ - case 8: __ret = (__get_user((x), (u32 __user *)(ptr))) \ - | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \ - default: __ret = __get_user_unaligned_unknown(); \ - } \ - __ret; \ -}) - #ifdef ASM_SUPPORTED struct __large_struct { unsigned long buf[100]; }; # define __m(x) (*(struct __large_struct __user *)(x)) diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 67b3481d6020..63ba18e4c9a2 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -3,11 +3,4 @@ #else #include #endif - #include -#ifdef CONFIG_CPU_HAS_NO_UNALIGNED -#include -#else -#define __get_user_unaligned(x, ptr) __get_user((x), (ptr)) -#define __put_user_unaligned(x, ptr) __put_user((x), (ptr)) -#endif diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 99e629a590a5..c5fc42429ce8 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -496,283 +496,6 @@ do { \ extern void __put_user_unknown(void); -/* - * ul{b,h,w} are macros and there are no equivalent macros for EVA. - * EVA unaligned access is handled in the ADE exception handler. - */ -#ifndef CONFIG_EVA -/* - * put_user_unaligned: - Write a simple value into user space. - * @x: Value to copy to user space. - * @ptr: Destination address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple value from kernel space to user - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. - * - * Returns zero on success, or -EFAULT on error. - */ -#define put_user_unaligned(x,ptr) \ - __put_user_unaligned_check((x),(ptr),sizeof(*(ptr))) - -/* - * get_user_unaligned: - Get a simple variable from user space. - * @x: Variable to store result. - * @ptr: Source address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple variable from user space to kernel - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and the result of - * dereferencing @ptr must be assignable to @x without a cast. - * - * Returns zero on success, or -EFAULT on error. - * On error, the variable @x is set to zero. - */ -#define get_user_unaligned(x,ptr) \ - __get_user_unaligned_check((x),(ptr),sizeof(*(ptr))) - -/* - * __put_user_unaligned: - Write a simple value into user space, with less checking. - * @x: Value to copy to user space. - * @ptr: Destination address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple value from kernel space to user - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. - * - * Caller must check the pointer with access_ok() before calling this - * function. - * - * Returns zero on success, or -EFAULT on error. - */ -#define __put_user_unaligned(x,ptr) \ - __put_user_unaligned_nocheck((x),(ptr),sizeof(*(ptr))) - -/* - * __get_user_unaligned: - Get a simple variable from user space, with less checking. - * @x: Variable to store result. - * @ptr: Source address, in user space. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * This macro copies a single simple variable from user space to kernel - * space. It supports simple types like char and int, but not larger - * data types like structures or arrays. - * - * @ptr must have pointer-to-simple-variable type, and the result of - * dereferencing @ptr must be assignable to @x without a cast. - * - * Caller must check the pointer with access_ok() before calling this - * function. - * - * Returns zero on success, or -EFAULT on error. - * On error, the variable @x is set to zero. - */ -#define __get_user_unaligned(x,ptr) \ - __get_user_unaligned_nocheck((x),(ptr),sizeof(*(ptr))) - -/* - * Yuck. We need two variants, one for 64bit operation and one - * for 32 bit mode and old iron. - */ -#ifdef CONFIG_32BIT -#define __GET_USER_UNALIGNED_DW(val, ptr) \ - __get_user_unaligned_asm_ll32(val, ptr) -#endif -#ifdef CONFIG_64BIT -#define __GET_USER_UNALIGNED_DW(val, ptr) \ - __get_user_unaligned_asm(val, "uld", ptr) -#endif - -extern void __get_user_unaligned_unknown(void); - -#define __get_user_unaligned_common(val, size, ptr) \ -do { \ - switch (size) { \ - case 1: __get_data_asm(val, "lb", ptr); break; \ - case 2: __get_data_unaligned_asm(val, "ulh", ptr); break; \ - case 4: __get_data_unaligned_asm(val, "ulw", ptr); break; \ - case 8: __GET_USER_UNALIGNED_DW(val, ptr); break; \ - default: __get_user_unaligned_unknown(); break; \ - } \ -} while (0) - -#define __get_user_unaligned_nocheck(x,ptr,size) \ -({ \ - int __gu_err; \ - \ - __get_user_unaligned_common((x), size, ptr); \ - __gu_err; \ -}) - -#define __get_user_unaligned_check(x,ptr,size) \ -({ \ - int __gu_err = -EFAULT; \ - const __typeof__(*(ptr)) __user * __gu_ptr = (ptr); \ - \ - if (likely(access_ok(VERIFY_READ, __gu_ptr, size))) \ - __get_user_unaligned_common((x), size, __gu_ptr); \ - \ - __gu_err; \ -}) - -#define __get_data_unaligned_asm(val, insn, addr) \ -{ \ - long __gu_tmp; \ - \ - __asm__ __volatile__( \ - "1: " insn " %1, %3 \n" \ - "2: \n" \ - " .insn \n" \ - " .section .fixup,\"ax\" \n" \ - "3: li %0, %4 \n" \ - " move %1, $0 \n" \ - " j 2b \n" \ - " .previous \n" \ - " .section __ex_table,\"a\" \n" \ - " "__UA_ADDR "\t1b, 3b \n" \ - " "__UA_ADDR "\t1b + 4, 3b \n" \ - " .previous \n" \ - : "=r" (__gu_err), "=r" (__gu_tmp) \ - : "0" (0), "o" (__m(addr)), "i" (-EFAULT)); \ - \ - (val) = (__typeof__(*(addr))) __gu_tmp; \ -} - -/* - * Get a long long 64 using 32 bit registers. - */ -#define __get_user_unaligned_asm_ll32(val, addr) \ -{ \ - unsigned long long __gu_tmp; \ - \ - __asm__ __volatile__( \ - "1: ulw %1, (%3) \n" \ - "2: ulw %D1, 4(%3) \n" \ - " move %0, $0 \n" \ - "3: \n" \ - " .insn \n" \ - " .section .fixup,\"ax\" \n" \ - "4: li %0, %4 \n" \ - " move %1, $0 \n" \ - " move %D1, $0 \n" \ - " j 3b \n" \ - " .previous \n" \ - " .section __ex_table,\"a\" \n" \ - " " __UA_ADDR " 1b, 4b \n" \ - " " __UA_ADDR " 1b + 4, 4b \n" \ - " " __UA_ADDR " 2b, 4b \n" \ - " " __UA_ADDR " 2b + 4, 4b \n" \ - " .previous \n" \ - : "=r" (__gu_err), "=&r" (__gu_tmp) \ - : "0" (0), "r" (addr), "i" (-EFAULT)); \ - (val) = (__typeof__(*(addr))) __gu_tmp; \ -} - -/* - * Yuck. We need two variants, one for 64bit operation and one - * for 32 bit mode and old iron. - */ -#ifdef CONFIG_32BIT -#define __PUT_USER_UNALIGNED_DW(ptr) __put_user_unaligned_asm_ll32(ptr) -#endif -#ifdef CONFIG_64BIT -#define __PUT_USER_UNALIGNED_DW(ptr) __put_user_unaligned_asm("usd", ptr) -#endif - -#define __put_user_unaligned_common(ptr, size) \ -do { \ - switch (size) { \ - case 1: __put_data_asm("sb", ptr); break; \ - case 2: __put_user_unaligned_asm("ush", ptr); break; \ - case 4: __put_user_unaligned_asm("usw", ptr); break; \ - case 8: __PUT_USER_UNALIGNED_DW(ptr); break; \ - default: __put_user_unaligned_unknown(); break; \ -} while (0) - -#define __put_user_unaligned_nocheck(x,ptr,size) \ -({ \ - __typeof__(*(ptr)) __pu_val; \ - int __pu_err = 0; \ - \ - __pu_val = (x); \ - __put_user_unaligned_common(ptr, size); \ - __pu_err; \ -}) - -#define __put_user_unaligned_check(x,ptr,size) \ -({ \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - int __pu_err = -EFAULT; \ - \ - if (likely(access_ok(VERIFY_WRITE, __pu_addr, size))) \ - __put_user_unaligned_common(__pu_addr, size); \ - \ - __pu_err; \ -}) - -#define __put_user_unaligned_asm(insn, ptr) \ -{ \ - __asm__ __volatile__( \ - "1: " insn " %z2, %3 # __put_user_unaligned_asm\n" \ - "2: \n" \ - " .insn \n" \ - " .section .fixup,\"ax\" \n" \ - "3: li %0, %4 \n" \ - " j 2b \n" \ - " .previous \n" \ - " .section __ex_table,\"a\" \n" \ - " " __UA_ADDR " 1b, 3b \n" \ - " .previous \n" \ - : "=r" (__pu_err) \ - : "0" (0), "Jr" (__pu_val), "o" (__m(ptr)), \ - "i" (-EFAULT)); \ -} - -#define __put_user_unaligned_asm_ll32(ptr) \ -{ \ - __asm__ __volatile__( \ - "1: sw %2, (%3) # __put_user_unaligned_asm_ll32 \n" \ - "2: sw %D2, 4(%3) \n" \ - "3: \n" \ - " .insn \n" \ - " .section .fixup,\"ax\" \n" \ - "4: li %0, %4 \n" \ - " j 3b \n" \ - " .previous \n" \ - " .section __ex_table,\"a\" \n" \ - " " __UA_ADDR " 1b, 4b \n" \ - " " __UA_ADDR " 1b + 4, 4b \n" \ - " " __UA_ADDR " 2b, 4b \n" \ - " " __UA_ADDR " 2b + 4, 4b \n" \ - " .previous" \ - : "=r" (__pu_err) \ - : "0" (0), "r" (__pu_val), "r" (ptr), \ - "i" (-EFAULT)); \ -} - -extern void __put_user_unaligned_unknown(void); -#endif - /* * We're generating jump to subroutines which will be outside the range of * jump instructions diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 6b113f39f30c..d24091e1907a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -6,7 +6,6 @@ */ #include #include -#include #include #include diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 5c0d8a8cdae5..9e7b08bbde5b 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -90,9 +90,6 @@ #define __put_user_inatomic(x, ptr) \ __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user - extern long __put_user_bad(void); /* diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 78f3f093d143..fad9df965ec2 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -249,9 +249,6 @@ int __put_user_bad(void) __attribute__((noreturn)); int __get_user_bad(void) __attribute__((noreturn)); -#define __put_user_unaligned __put_user -#define __get_user_unaligned __get_user - unsigned long __must_check raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 6096d671aa63..b5f976ee7510 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index a803f6bb4d92..7f777411d1d6 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -19,7 +19,6 @@ * User space memory access functions */ #include -#include #include #include diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 68766b276d9e..fd91722315c8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -535,9 +535,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user - /* * {get|put}_user_try and catch * diff --git a/include/asm-generic/uaccess-unaligned.h b/include/asm-generic/uaccess-unaligned.h deleted file mode 100644 index 67deb898f0c5..000000000000 --- a/include/asm-generic/uaccess-unaligned.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __ASM_GENERIC_UACCESS_UNALIGNED_H -#define __ASM_GENERIC_UACCESS_UNALIGNED_H - -/* - * This macro should be used instead of __get_user() when accessing - * values at locations that are not known to be aligned. - */ -#define __get_user_unaligned(x, ptr) \ -({ \ - __typeof__ (*(ptr)) __x; \ - __copy_from_user(&__x, (ptr), sizeof(*(ptr))) ? -EFAULT : 0; \ - (x) = __x; \ -}) - - -/* - * This macro should be used instead of __put_user() when accessing - * values at locations that are not known to be aligned. - */ -#define __put_user_unaligned(x, ptr) \ -({ \ - __typeof__ (*(ptr)) __x = (x); \ - __copy_to_user((ptr), &__x, sizeof(*(ptr))) ? -EFAULT : 0; \ -}) - -#endif /* __ASM_GENERIC_UACCESS_UNALIGNED_H */ -- cgit v1.2.3-55-g7522 From 1b862aecfbd419cdc4553645bf86d07554279bed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:45 -0700 Subject: mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only half way. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to the userspace via sysfs because they cannot be onlined anyway. This is currently expressed by for_device argument to arch_add_memory which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. While this works now, later patches in this series want to move __add_zone outside of arch_add_memory path so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially want memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. We can simply skip all the memblock specific cleanup if there is no memblock for the given section. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-5-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- drivers/base/memory.c | 23 +++++++++-------------- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 9 ++++++--- 9 files changed, 22 insertions(+), 24 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8f3efa682ee8..39e2aeb4669d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -658,7 +658,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..e6b2e6618b6c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,7 +151,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdata->node_zones + zone_for_memory(nid, start, size, 0, for_device); - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3348e60dd8ad..a3d549966b6a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -195,7 +195,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) continue; nr_pages = (start_pfn + size_pages > zone_end_pfn) ? zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (rc) break; start_pfn += nr_pages; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..a9d57f75ae8c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -498,7 +498,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) ret = __add_pages(nid, pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device), - start_pfn, nr_pages); + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..94594b889144 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,7 +831,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dae6a5e5ad4a..9d64291459b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -787,7 +787,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 90225ffee501..f8fd562c3f18 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -685,14 +685,6 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. @@ -702,9 +694,6 @@ int register_new_memory(int nid, struct mem_section *section) int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); @@ -741,11 +730,16 @@ static int remove_memory_section(unsigned long node_id, { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); + + /* + * Some users of the memory hotplug do not want/need memblock to + * track all sections. Skip over those. + */ mem = find_memory_block(section); + if (!mem) + goto out_unlock; + unregister_mem_sect_under_nodes(mem, __section_nr(section)); mem->section_count--; @@ -754,6 +748,7 @@ static int remove_memory_section(unsigned long node_id, else put_device(&mem->dev); +out_unlock: mutex_unlock(&mem_sysfs_mutex); return 0; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..3c8cf86201c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -111,7 +111,7 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6b6362819be2..c0147d3024eb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -494,7 +494,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) } static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) + unsigned long phys_start_pfn, bool want_memblock) { int ret; @@ -511,6 +511,9 @@ static int __meminit __add_section(int nid, struct zone *zone, if (ret < 0) return ret; + if (!want_memblock) + return 0; + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -521,7 +524,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * add the new pages. */ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; @@ -549,7 +552,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision -- cgit v1.2.3-55-g7522 From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. [richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +- arch/powerpc/mm/mem.c | 10 +- arch/s390/mm/init.c | 30 +----- arch/sh/mm/init.c | 8 +- arch/x86/mm/init_32.c | 5 +- arch/x86/mm/init_64.c | 9 +- drivers/base/memory.c | 52 ++++++----- include/linux/memory_hotplug.h | 13 +-- include/linux/mmzone.h | 16 ++++ kernel/memremap.c | 4 + mm/memory_hotplug.c | 201 +++++++++++++++++++++++++---------------- mm/sparse.c | 3 +- 12 files changed, 185 insertions(+), 175 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1e884d82af6f..b86fda30ce62 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; - - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf7e08c5a8..abc1641011f2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +/* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone) || + start_pfn + nr_pages <= zone->zone_start_pfn) + return false; + + return true; +} + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2ebe9ad7f6c..9438ffe24cb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -433,25 +433,6 @@ out_fail: return -1; } -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn, bool want_memblock) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; + int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; + if (!pfn_valid(pfn)) + continue; - if (ret < 0) - return ret; + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); + } if (!want_memblock) return 0; @@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, bool want_memblock) { unsigned long i; @@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(normal_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; - } +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP inherits the current zone which is + * ZONE_NORMAL by default but we might be within ZONE_MOVABLE + * already. + */ + if (zone_intersects(movable_zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(pfn_to_nid(pfn))) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) + if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) diff --git a/mm/sparse.c b/mm/sparse.c index 9d7fd666015e..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; -- cgit v1.2.3-55-g7522 From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- kernel/memremap.c | 2 +- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4263fa6f2ab4..9b04cf5ea813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; -- cgit v1.2.3-55-g7522 From 7868a2087ec13ec4a5df0c5e00999863be132ba8 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:42 -0700 Subject: mm/hugetlb: add size parameter to huge_pte_offset() A poisoned or migrated hugepage is stored as a swap entry in the page tables. On architectures that support hugepages consisting of contiguous page table entries (such as on arm64) this leads to ambiguity in determining the page table entry to return in huge_pte_offset() when a poisoned entry is encountered. Let's remove the ambiguity by adding a size parameter to convey additional information about the requested address. Also fixup the definition/usage of huge_pte_offset() throughout the tree. Link: http://lkml.kernel.org/r/20170522133604.11392-4-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: James Hogan (odd fixer:METAG ARCHITECTURE) Cc: Ralf Baechle (supporter:MIPS) Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Hillf Danton Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 ++- arch/ia64/mm/hugetlbpage.c | 4 ++-- arch/metag/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 3 ++- arch/parisc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/s390/mm/hugetlbpage.c | 3 ++- arch/sh/mm/hugetlbpage.c | 3 ++- arch/sparc/mm/hugetlbpage.c | 3 ++- arch/tile/mm/hugetlbpage.c | 3 ++- arch/x86/mm/hugetlbpage.c | 2 +- fs/userfaultfd.c | 7 +++++-- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 23 ++++++++++++++--------- mm/page_vma_mapped.c | 3 ++- mm/pagewalk.c | 3 ++- 16 files changed, 46 insertions(+), 27 deletions(-) (limited to 'arch/ia64') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f89aa8fa5855..656e0ece2289 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -131,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 85de86d36fdf..ae35140332f7 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) } pte_t * -huge_pte_offset (struct mm_struct *mm, unsigned long addr) +huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ if (REGION_NUMBER(addr) != RGN_HPAGE) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, addr); + ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); if (!ptep || pte_none(*ptep)) return NULL; page = pte_page(*ptep); diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index db1b7da91e4f..67fd53e2935a 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -74,7 +74,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 74aa6f62468f..cef152234312 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa50ac090e9b..5eb8f633b282 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1816b965a142..c41dc44472c5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,7 +57,7 @@ static unsigned nr_gpages; #define hugepd_none(hpd) (hpd_val(hpd) == 0) -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* Only called for hugetlbfs pages, hence can ignore THP */ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d3a5e39756f6..44a8e6f0391e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index cc948db74878..d2412d2d6462 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 88855e383b34..28ee8d8ffa07 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 03e5cc4e76e4..0986d426a413 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level) return ptep; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..2824607df108 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3d0dd082337a..cadcd12a3d35 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -448,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c92a1f0c7240..31e665fbcf76 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -137,7 +137,8 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz); -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); @@ -190,7 +191,7 @@ static inline void hugetlb_show_meminfo(void) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) -#define huge_pte_offset(mm, address) 0 +#define huge_pte_offset(mm, address, sz) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c73828e43100..075345532396 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3330,7 +3330,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3548,7 +3548,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3587,7 +3588,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3874,7 +3876,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4131,7 +4133,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4270,7 +4273,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4534,7 +4537,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4630,7 +4634,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) -- cgit v1.2.3-55-g7522 From e18e340fd171fbb9bd38125c3d0ba5d3fd3ddfb9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jul 2017 03:32:44 +0900 Subject: ia64: remove redundant generic-y += kvm_para.h from asm/Kbuild "generic-y += kvm_para.h" is doubled in asm/Kbuild and uapi/asm/Kbuild. The one in the former should be simply removed because kvm_para.h is exported. Signed-off-by: Masahiro Yamada --- arch/ia64/include/asm/Kbuild | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 502a91d8dbbd..1d7641f891e1 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -1,8 +1,6 @@ - generic-y += clkdev.h generic-y += exec.h generic-y += irq_work.h -generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h -- cgit v1.2.3-55-g7522 From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:14 -0700 Subject: kexec: move vmcoreinfo out of the kernel's .bss section As Eric said, "what we need to do is move the variable vmcoreinfo_note out of the kernel's .bss section. And modify the code to regenerate and keep this information in something like the control page. Definitely something like this needs a page all to itself, and ideally far away from any other kernel data structures. I clearly was not watching closely the data someone decided to keep this silly thing in the kernel's .bss section." This patch allocates extra pages for these vmcoreinfo_XXX variables, one advantage is that it enhances some safety of vmcoreinfo, because vmcoreinfo now is kept far away from other kernel data structures. Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Reviewed-by: Juergen Gross Suggested-by: Eric Biederman Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Hari Bathini Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 5 ----- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/setup.c | 6 ------ arch/x86/kernel/crash.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- include/linux/crash_core.h | 4 ++-- kernel/crash_core.c | 26 ++++++++++++++++++++++---- kernel/ksysfs.c | 2 +- 8 files changed, 29 insertions(+), 21 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 599507bcec91..c14815dca747 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) #endif } -phys_addr_t paddr_vmcoreinfo_note(void) -{ - return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); -} - diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 49a6bd45957b..3d0b14afa232 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3ae756c0db3d..3d1d808ea8a9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); } -static void __init setup_vmcoreinfo(void) -{ - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - #ifdef CONFIG_CRASH_DUMP /* @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..cab28cf2cffb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 4090a42578a8..87506a02e914 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -19,7 +19,7 @@ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) -#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_BYTES PAGE_SIZE #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) #define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ @@ -56,7 +56,7 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..2837d6164db8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,10 @@ #include /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; +u32 *vmcoreinfo_note; /* * parsing the "crashkernel" commandline @@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index df1a9aa602a0..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.2.3-55-g7522 From d778931d7b32ada39de0a45267b0c4af0600e277 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jul 2017 14:37:16 -0700 Subject: ia64: move inline before return type Make the use of inline like the rest of the kernel. Link: http://lkml.kernel.org/r/d47074493af80ce12590340294bc49618165c30d.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64') diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 79c7c46d7dc1..555b11180156 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -334,7 +334,7 @@ static void ia64_mlogbuf_dump_from_init(void) ia64_mlogbuf_dump(); } -static void inline +static inline void ia64_mca_spin(const char *func) { if (monarch_cpu == smp_processor_id()) -- cgit v1.2.3-55-g7522 From c02f2a911f2bb833f982d869e3a4ae67e1468969 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jul 2017 14:37:19 -0700 Subject: ia64: sn: pci: move inline before type Make the use of inline like the rest of the kernel. Link: http://lkml.kernel.org/r/f42b2202bd0d4e7ccf79ce5348bb255a035e67bb.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/sn/pci/pcibr/pcibr_ate.c | 2 +- arch/ia64/sn/pci/tioce_provider.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/sn/pci/pcibr/pcibr_ate.c b/arch/ia64/sn/pci/pcibr/pcibr_ate.c index 5bc34eac9e01..b67bb4cb73ff 100644 --- a/arch/ia64/sn/pci/pcibr/pcibr_ate.c +++ b/arch/ia64/sn/pci/pcibr/pcibr_ate.c @@ -140,7 +140,7 @@ static inline u64 __iomem *pcibr_ate_addr(struct pcibus_info *pcibus_info, /* * Update the ate. */ -void inline +inline void ate_write(struct pcibus_info *pcibus_info, int ate_index, int count, volatile u64 ate) { diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index 46d3df4b03a1..3bd9abc35485 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -52,7 +52,7 @@ * All registers defined in struct tioce will meet that criteria. */ -static void inline +static inline void tioce_mmr_war_pre(struct tioce_kernel *kern, void __iomem *mmr_addr) { u64 mmr_base; @@ -78,7 +78,7 @@ tioce_mmr_war_pre(struct tioce_kernel *kern, void __iomem *mmr_addr) } } -static void inline +static inline void tioce_mmr_war_post(struct tioce_kernel *kern, void __iomem *mmr_addr) { u64 mmr_base; -- cgit v1.2.3-55-g7522